#    SUMMARISING CONTINUOUS VARIABLE in R SCRIPT FILE.
#    www_statstutor_ac_uk Community Project.
#    Sofia Maria Karadimitriou, Sheffield University.
#    Reviewed by Basile Marquier, University of Sheffield.
#    Dataset: birthweight_reduced csv.
#    Resource: Summarising Continuous Variables  in R.


# Download the data set in .csv format and put it in a directory on your
# computer. If your file is saved as a standard Excel file, save it as a csv
# file first.
# Open the birthweight reduced data set, which is saved as a csv file, and
# call it birthweightR.

# This example reads the file from a memory stick (drive D:) where the data is
# stored as stcp-Rdataset-birthweight_reduced.csv; change the path to match the
# location of your own copy.
birthweightR <- read.csv(
  "D:\\stcp-Rdataset-birthweight_reduced.csv",
  header = TRUE,
  sep = ","
)

# Tell R we are using the birthweight dataset until further notice using
# attach. This means that 'Gestation' can be used instead of
# birthweightR$Gestation.
# Attach the data frame only once: attaching it a second time just adds a
# duplicate copy to the search path and prints masking messages.
attach(birthweightR)

# R assumes all numeric values are continuous, so tell it that 'smoker' is a
# factor and attach labels to the categories (for example, 0 in smoker means
# the mother is a non-smoker).
# The factor command uses
#   variable <- factor(variable, levels = c(category numbers),
#                      labels = c(category names)).
# This creates a new object called smoker in the workspace; because the
# workspace is searched before attached data frames, 'smoker' now refers to
# the labelled factor rather than the original 0/1 column.
smoker <- factor(
  smoker,
  levels = c(0, 1),
  labels = c("Non-smoker", "Smoker")
)

############ Summarising one continuous variable ##############.

# To calculate a range of summary statistics quickly use summary(variable).
# R is case-sensitive, so the variable must be spelled exactly as in the data
# set: Birthweight, not birthweight.
summary(Birthweight)
# There is no standard deviation in the summary command.
# Individual summary statistics can be requested one at a time.
# na.rm = TRUE removes missing values before the statistic is calculated.
# Mean.
mean(Birthweight, na.rm = TRUE)
# Standard deviation.
sd(Birthweight, na.rm = TRUE)
# Median.
median(Birthweight, na.rm = TRUE)
# Interquartile range expressed as one number.
IQR(Birthweight, na.rm = TRUE)


############ Summarising one continuous variable by group ##############.
# Request summary statistics of birthweight for each smoking group and give
# each summary a name.

Smoking <- summary(Birthweight[smoker == "Smoker"])
Non_smoking <- summary(Birthweight[smoker == "Non-smoker"])
# Combine the results into one table (one column per group) and give it a
# name.
compare1 <- cbind(Smoking, Non_smoking)
# Then reduce the decimal places to 2.
round(compare1, 2)

# Calculating individual summary statistics.
# na.rm = TRUE removes missing values first; without it a single missing
# value makes the result NA.
# Mean for one group.
mean(Birthweight[smoker == "Smoker"], na.rm = TRUE)
# Standard deviation for one group.
sd(Birthweight[smoker == "Smoker"], na.rm = TRUE)


# Calculate the mean and standard deviation of birthweight for every group at
# once using tapply(variable, grouping factor, function).
# Extra arguments such as na.rm = TRUE are passed on to the function.
means <- tapply(Birthweight, smoker, mean, na.rm = TRUE)
sds <- tapply(Birthweight, smoker, sd, na.rm = TRUE)
# Combine in one table: the rows are the groups and the columns are named
# after the objects (means, sds).
results1 <- cbind(means, sds)
# Round all the summary statistics to 2 decimal places.
round(results1, 2)



####### Histograms #################.

# To plot a histogram of frequencies for one variable with light blue bars.
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight (lbs)",
  probability = FALSE,
  col = "lightblue"
)

# To add a normal curve, the histogram must contain densities (probabilities)
# rather than frequencies.
# Change probability = FALSE to probability = TRUE.
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight",
  probability = TRUE,
  col = "lightblue"
)
# To change the number of bars, use the breaks argument.
# You can specify a number, e.g. breaks = 5 (R treats this as a suggestion),
# or the exact break points, e.g. breaks = c(6, 7, 8, 9, 10).
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight",
  probability = TRUE,
  col = "lightblue",
  breaks = 7
)

# A line representing a very smooth histogram can be added to the plot to aid
# normality checks.
# This command adds a smooth version of the histogram (a kernel density
# estimate). na.rm = TRUE is needed because density() stops with an error if
# the variable contains missing values.
lines(density(Birthweight, na.rm = TRUE))
# To add a normal curve with the same mean and standard deviation as the data,
# draw the normal density function directly with curve(); add = TRUE draws on
# top of the existing histogram. This is exact and gives the same curve every
# time, unlike simulating a large random sample.
# col changes the colour of the line (2 is red).
curve(
  dnorm(
    x,
    mean = mean(Birthweight, na.rm = TRUE),
    sd = sd(Birthweight, na.rm = TRUE)
  ),
  add = TRUE,
  col = 2
)


############ Histograms by group ###############.
# Store the birthweights of each group so they do not have to be re-selected
# for every command.
bw_non_smoker <- Birthweight[smoker == "Non-smoker"]
bw_smoker <- Birthweight[smoker == "Smoker"]

# Specify that two charts are needed on top of each other (2 rows, 1 column).
par(mfrow = c(2, 1))
# Plot a histogram for the birthweight of babies of non-smoker mothers.
# ylim = c(min, max) gives limits for the y-axis scale; using the same limits
# and break points in both charts makes them directly comparable.
hist(
  bw_non_smoker,
  main = "Histogram for non smokers",
  xlab = "Birthweight (lbs)",
  probability = TRUE,
  ylim = c(0, 0.4),
  col = "lightblue",
  breaks = c(4, 5, 6, 7, 8, 9, 10, 11)
)

# Add the normal curve with the same mean and standard deviation as the group.
# curve() draws the exact normal density, which is much faster than simulating
# millions of random values and gives the same curve every time.
curve(
  dnorm(
    x,
    mean = mean(bw_non_smoker, na.rm = TRUE),
    sd = sd(bw_non_smoker, na.rm = TRUE)
  ),
  add = TRUE,
  col = 2
)

# Plot the histogram for smokers.
hist(
  bw_smoker,
  main = "Histogram for smokers",
  xlab = "Birthweight (lbs)",
  probability = TRUE,
  ylim = c(0, 0.4),
  col = "lightblue",
  breaks = c(4, 5, 6, 7, 8, 9, 10, 11)
)
# Add the normal curve with the same mean and standard deviation as the group.
curve(
  dnorm(
    x,
    mean = mean(bw_smoker, na.rm = TRUE),
    sd = sd(bw_smoker, na.rm = TRUE)
  ),
  add = TRUE,
  col = 2
)


# Plotting boxplots in one screen: go back to a single chart per window first.
par(mfrow = c(1, 1))
# Birthweight ~ smoker draws one box of Birthweight for each level of smoker.
boxplot(
  Birthweight ~ smoker,
  col = "tomato",
  main = "Birthweight by smoking group of mother",
  xlab = "Smoking group",
  ylab = "Birthweight"
)