# SUMMARISING CONTINUOUS VARIABLE in R SCRIPT FILE.
# www_statstutor_ac_uk Community Project.
# Sofia Maria Karadimitriou, Sheffield University.
# Reviewed by Basile Marquier, University of Sheffield.
# Dataset: birthweight_reduced csv.
# Resource: Summarising Continuous Variables in R.

# Download the data set in .csv format and put it in a directory on your
# computer. If your file is saved as a standard Excel file, save it as a csv
# file first.

# Open the birthweight reduced dataset, which is saved as a csv file, and call
# it birthweightR. This example refers to a memory stick (drive D:) where the
# data is stored as stcp-Rdataset-birthweight_reduced; change the path to match
# the location of your own copy.
birthweightR <- read.csv(
  "D:\\stcp-Rdataset-birthweight_reduced.csv",
  header = TRUE,
  sep = ","
)

# Tell R we are using the birthweight dataset until further notice using
# attach. This means that 'Gestation' can be used instead of
# birthweightR$Gestation. Attach only once: every further attach() puts another
# copy of the data on the search path and prints masking messages.
attach(birthweightR)

# R assumes all numeric values are continuous, so tell it that 'smoker' is a
# factor and attach labels to the categories (for example, 0 in smoker means
# the mother is a non-smoker).
# The factor command uses
#   variable <- factor(variable, c(category numbers), labels = c(category names)).
# The new 'smoker' lives in the workspace and therefore takes precedence over
# the numeric column of the same name in the attached data.
smoker <- factor(
  birthweightR$smoker,
  levels = c(0, 1),
  labels = c("Non-smoker", "Smoker")
)

############ Summarising one continuous variable ##############

# To calculate a range of summary statistics quickly use summary(variable).
# R is case-sensitive: the column is called 'Birthweight', not 'birthweight'.
summary(Birthweight)

# There is no standard deviation in the summary command.
# Individual summary statistics can be requested one at a time;
# na.rm = TRUE removes missing values before calculating.

# Mean.
mean(Birthweight, na.rm = TRUE)

# Standard deviation.
sd(Birthweight, na.rm = TRUE)

# Median.
median(Birthweight, na.rm = TRUE)

# Interquartile range expressed as one number.
IQR(Birthweight, na.rm = TRUE)

############ Summarising one continuous variable by group ##############

# Request summary statistics of birthweight by group and
# give each summary a name.
# Summary statistics of birthweight for each smoking group.
# 'Birthweight' comes from the attached dataset and 'smoker' is the labelled
# factor created earlier in this script.
Smoking <- summary(Birthweight[smoker == "Smoker"])
Non_smoking <- summary(Birthweight[smoker == "Non-smoker"])

# Combine the results into one table and give it a name.
compare1 <- cbind(Smoking, Non_smoking)

# Then reduce the decimal places to 2.
round(compare1, 2)

# Calculating individual summary statistics.
# na.rm = TRUE removes missing values before calculating.

# Mean by one group.
mean(Birthweight[smoker == "Smoker"], na.rm = TRUE)

# Standard deviation by one group.
sd(Birthweight[smoker == "Smoker"], na.rm = TRUE)

# Calculate the means and standard deviations for every group at once by
# using tapply(variable, grouping factor, function).
means <- tapply(Birthweight, smoker, mean, na.rm = TRUE)
sds <- tapply(Birthweight, smoker, sd, na.rm = TRUE)

# Combine in one table; the group labels become the row names.
results1 <- cbind(means, sds)

# Round all the summary statistics to 2 decimal places.
round(results1, 2)

####### Histograms #################

# To plot a histogram of frequencies for one variable with blue bars.
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight (lbs)",
  probability = FALSE,
  col = "lightblue"
)

# To add a normal curve, the histogram must contain densities (probabilities)
# rather than frequencies. Change probability = FALSE to probability = TRUE.
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight",
  probability = TRUE,
  col = "lightblue"
)

# To change the number of bars, use the breaks argument.
# You can specify a number e.g. breaks = 5 or the break points
# e.g. breaks = c(6, 7, 8, 9, 10).
hist(
  Birthweight,
  main = "Histogram of Birthweight",
  xlab = "Birthweight",
  probability = TRUE,
  col = "lightblue",
  breaks = 7
)

# A line representing a very smooth histogram can be added to the plot to aid
# normality checks. This command adds a smooth version of the histogram;
# density() stops with an error on missing values unless na.rm = TRUE.
lines(density(Birthweight, na.rm = TRUE))

# To compare with a normal distribution, add the normal curve that has the
# same mean and standard deviation as the data. dnorm() gives the exact curve,
# so there is no need to simulate a large random sample.
# col changes the colour of the line.
bw_mean <- mean(Birthweight, na.rm = TRUE)
bw_sd <- sd(Birthweight, na.rm = TRUE)
curve(dnorm(x, mean = bw_mean, sd = bw_sd), add = TRUE, col = 2)

############ Histograms by group ###############
# Specify that two charts are needed on top of each other.
par(mfrow = c(2, 1))

# Birthweights of each group, extracted once and reused below.
# 'Birthweight' comes from the attached dataset and 'smoker' is the labelled
# factor created earlier in this script.
bw_non_smoker <- Birthweight[smoker == "Non-smoker"]
bw_smoker <- Birthweight[smoker == "Smoker"]

# Plot a histogram for the birthweight of babies of non-smoker mothers.
# ylim = c(min, max) gives limits for the y-axis scale; using the same ylim
# and breaks for both charts makes them directly comparable.
hist(
  bw_non_smoker,
  main = "Histogram for non smokers",
  xlab = "Birthweight (lbs)",
  probability = TRUE,
  ylim = c(0, 0.4),
  col = "lightblue",
  breaks = 4:11
)

# Add the normal curve with the same mean and standard deviation as this
# group. dnorm() gives the exact curve, so no simulated sample is needed.
non_smoker_mean <- mean(bw_non_smoker, na.rm = TRUE)
non_smoker_sd <- sd(bw_non_smoker, na.rm = TRUE)
curve(
  dnorm(x, mean = non_smoker_mean, sd = non_smoker_sd),
  add = TRUE,
  col = 2
)

# Plot the histogram for smokers.
hist(
  bw_smoker,
  main = "Histogram for smokers",
  xlab = "Birthweight (lbs)",
  probability = TRUE,
  ylim = c(0, 0.4),
  col = "lightblue",
  breaks = 4:11
)

# Add the normal curve with the same mean and standard deviation as this
# group.
smoker_mean <- mean(bw_smoker, na.rm = TRUE)
smoker_sd <- sd(bw_smoker, na.rm = TRUE)
curve(
  dnorm(x, mean = smoker_mean, sd = smoker_sd),
  add = TRUE,
  col = 2
)

# Plotting boxplots in one screen: go back to a single chart per page first.
par(mfrow = c(1, 1))
boxplot(
  Birthweight ~ smoker,
  col = "tomato",
  main = "Birthweight by smoking group of mother",
  xlab = "Smoking group",
  ylab = "Birthweight"
)