# Summarising categorical data in R SCRIPT FILE.
# www.statstutor.ac.uk Community Project.
# Sofia Maria Karadimitriou and Ellen Marshall, Sheffield University.
# Reviewed by Basile Marquier, University of Sheffield.
# Dataset: Titanic csv.
# Resource: Summarising categorical data in R.

# Load the data ----

# Open the Titanic dataset, which is saved as a csv file, and call it TitanicR.
# If your file is saved as a standard Excel file, save it as a csv file first.
# file.choose() opens a dialog so that you can browse to the saved file.
# Alternatively give the path directly. For example, if the dataset has been
# saved as stcp-Rdataset-Titanic on a memory stick which is the E drive:
# TitanicR <- read.csv("E:\\stcp-Rdataset-Titanic.csv", header = TRUE)
#
# read.csv() is used rather than read.table(sep = ",") because its defaults
# (only double quotes delimit text, no comment character) suit csv files, so
# apostrophes or '#' inside a field are not misread.
# fileEncoding = "UTF-8-BOM" removes a byte order mark if the file starts with
# one. Without this the first column can be read in with stray characters in
# front of its name instead of plain 'pclass'.
TitanicR <- read.csv(
  file.choose(),
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)

# Stop early with a clear message if the chosen file is not the expected one.
required_columns <- c("survived", "pclass", "Residence", "Gender")
missing_columns <- setdiff(required_columns, names(TitanicR))
if (length(missing_columns) > 0) {
  stop(
    "The chosen file is missing the column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Variables are referred to as TitanicR$survived, TitanicR$pclass and so on.
# attach(TitanicR) is deliberately not used: it does not always work and can
# silently pick up a different object that happens to have the same name.

# What are the names of the variables?
names(TitanicR)

# Look at the first six cases.
head(TitanicR)

# Factors ----

# R assumes all numeric values are continuous, so tell it that 'survived',
# 'pclass', 'Residence' and 'Gender' are factors and attach labels to the
# categories (for example 0 in survived means a person died).
# The factor command uses
# variable <- factor(variable, levels = c(category numbers),
#                    labels = c(category names)).
survivedf <- factor(
  TitanicR$survived,
  levels = c(0, 1),
  labels = c("Died", "Survived")
)
pclassf <- factor(
  TitanicR$pclass,
  levels = c(1, 2, 3),
  labels = c("First", "Second", "Third")
)
Residencef <- factor(
  TitanicR$Residence,
  levels = c(0, 1, 2),
  labels = c("American", "British", "Other")
)
Genderf <- factor(
  TitanicR$Gender,
  levels = c(0, 1),
  labels = c("Male", "Female")
)

# One variable ----

# To summarise frequencies use the table command and give the table a name.
# Naming the argument keeps 'survived' as the heading of the printed table.
SurT <- table(survived = TitanicR$survived)

# To view the table.
SurT

# To add row and column totals.
addmargins(SurT)

# To calculate proportions of one variable from the frequency table.
prop.table(SurT)

# Reduce the number of decimal places using the round function.
round(prop.table(SurT), digits = 2)

# To produce percentages rounded to whole numbers.
round(100 * prop.table(SurT), digits = 0)

# Two variables ----

# To produce a contingency table of frequencies, use the table command.
cross <- table(survivedf, pclassf)

# To add row and column totals.
addmargins(cross)

# To produce a contingency table of proportions, use the prop.table command.
# To get proportions of row totals.
prop.table(cross, 1)

# To get proportions of column totals.
prop.table(cross, 2)

# We are interested in survival within class so we require column
# percentages here.
# Reduce the number of decimal places using the round function.
round(prop.table(cross, 2), digits = 2)

# To produce a contingency table of column percentages rounded to whole
# numbers.
round(100 * prop.table(cross, 2), digits = 0)

# To get row percentages (not needed here).
round(100 * prop.table(cross, 1), digits = 0)

# Bar charts ----

# Use a single plotting panel.
par(mfrow = c(1, 1))

# Stacked bar plot of frequencies with colours and legend.
barplot(
  cross,
  xlab = "Class",
  ylab = "Frequency",
  main = "Survival by class",
  col = c("darkblue", "lightcyan"),
  legend.text = rownames(cross),
  args.legend = list(x = "topleft")
)

# To get more information on bar chart options.
?barplot

# Stacked bar plot of column percentages with colours and legend.
barplot(
  prop.table(cross, 2) * 100,
  xlab = "Class",
  ylab = "Percentages",
  main = "Percentage survival by class",
  col = c("darkblue", "lightcyan"),
  legend.text = rownames(cross),
  args.legend = list(x = "topleft")
)

# Alternatively, use a clustered bar chart.
# Clustered bar plot of column percentages with colours and legend.
barplot(
  prop.table(cross, 2) * 100,
  xlab = "Class",
  ylab = "Percentages",
  main = "Percentage survival by class",
  beside = TRUE,
  col = c("darkblue", "lightcyan"),
  legend.text = rownames(cross),
  args.legend = list(x = "topleft")
)