#    INDEPENDENT T TEST in R SCRIPT FILE.
#    www_statstutor_ac_uk Community Project.
#    Sofia Maria Karadimitriou, Sheffield University.
#    Reviewed by Basile Marquier, University of Sheffield.
#    Dataset: birthweight_reduced csv.
#    Resource: INDEPENDENT T TEST in R.

# Load the birthweight reduced data set (a .csv file) into a data frame called
# birthweightR. If your copy is saved as a standard Excel file, save it as a
# .csv file first.
#
# Set data_file to the location of the .csv file on your computer:
#   * a bare file name is looked up in R's working directory (in the R GUI this
#     can be changed through "File" -> "Change dir.");
#   * a full path also works, for example when the file is on a memory stick
#     that appears as drive E:
#       data_file <- "E:\\stcp-Rdataset-birthweight_reduced.csv"
data_file <- "stcp-Rdataset-birthweight_reduced.csv"

# Read the file once. The first row holds the column names (header = TRUE) and
# the values are separated by commas (sep = ",").
birthweightR <- read.csv(data_file, header = TRUE, sep = ",")

# R assumes all numeric values are continuous, so tell it that 'smoker' is a
# factor and attach labels to the categories (0 means the mother is a
# non-smoker, 1 means she is a smoker).
# The factor command uses
#   variable <- factor(variable, levels = c(category numbers),
#                      labels = c(category names)).
# The recoding is stored in the data frame itself, before attaching, because
# attach() works on a copy of the data frame taken at the moment it is called.
birthweightR$smoker <- factor(birthweightR$smoker,
                              levels = c(0, 1),
                              labels = c("Non-smoker", "Smoker"))

# Remove any copy of the data set left on the search path by an earlier run,
# so that re-running the script does not stack duplicate attachments.
while ("birthweightR" %in% search()) {
  detach("birthweightR", character.only = TRUE)
}

# Tell R we are using the birthweight data set until further notice.
# This means that 'Gestation' can be used instead of birthweightR$Gestation,
# and that 'smoker' refers to the labelled factor created above.
# attach() is called exactly once: attaching the same data frame twice only
# produces masking messages.
attach(birthweightR)

# Calculate the mean and standard deviation of birth weight for each smoking
# group (non-smoker / smoker).
# na.rm = TRUE leaves missing values out of the calculation.
# The results are stored under their own names rather than 'mean' and 'sd',
# so that the base R functions mean() and sd() are not masked by data objects.
group_means <- tapply(Birthweight, smoker, mean, na.rm = TRUE)
group_sds <- tapply(Birthweight, smoker, sd, na.rm = TRUE)

# Combine in one table: one row per group (row names are the group labels),
# with columns named 'mean' and 'sd'.
results1 <- cbind(mean = group_means, sd = group_sds)

# Round and display all the summary statistics to 2 decimal places.
round(results1, 2)

# Difference between the means: first group minus second group, in the order
# of the factor levels of 'smoker', rounded to 2 decimal places.
round(group_means[1] - group_means[2], 2)

# Checking assumptions.
# The dependent variable needs to be normally distributed within each group.
# This can be checked using histograms, QQ plots or tests
# (see the "Checking normality in R" sheet).

# Ask for two charts next to each other (1 row, 2 columns). par() returns the
# previous settings, which are kept so the layout can be put back afterwards.
old_par <- par(mfrow = c(1, 2))

# Split the birth weights into one vector per smoking group.
birthweight_by_group <- split(Birthweight, smoker)

# Histogram of the birth weight of babies of non-smoking mothers.
hist(birthweight_by_group[["Non-smoker"]],
     main = "Histogram for non smokers",
     xlab = "Birthweight")

# Histogram of the birth weight of babies of smoking mothers.
hist(birthweight_by_group[["Smoker"]],
     main = "Histogram for smokers",
     xlab = "Birthweight")

# Restore the previous plot layout so later charts are not squeezed into the
# two-panel arrangement.
par(old_par)

# Checking the assumption of equality of variances using Levene's test.
# leveneTest() comes from the car package, which must be installed first
# (Tools -> Install Packages, or install.packages("car")).
library(car)

# Once loaded, carry out Levene's test using deviations from the group means.
leveneTest(Birthweight ~ smoker, center = mean)

# Note: the car version of the test may not work in every RStudio setup.
# An alternative is available in the lawstat package; install it through
# Tools -> Install Packages (or install.packages("lawstat")) before loading.
library(lawstat)

# levene.test() centres on the group medians unless told otherwise, so
# location = "mean" is given to run the same mean-based test as above.
levene.test(Birthweight, smoker, location = "mean")

# Independent-samples t-test comparing mean birth weight between the two
# smoking groups.
# Set var.equal = TRUE when equal variances can be assumed and
# var.equal = FALSE when they cannot.
# 'alternative' and 'conf.level' are written out at R's default values
# (a two-sided test with a 95% confidence interval).
t.test(
  Birthweight ~ smoker,
  alternative = "two.sided",
  var.equal = TRUE,
  conf.level = 0.95
)


# A significant t-test indicates that the group means differ.
# Finish by reporting the size of that difference.