#    Summarising categorical data in R SCRIPT FILE.
#    www_statstutor_ac_uk Community Project.
#    Sofia Maria Karadimitriou and Ellen Marshall, Sheffield University.
#    Reviewed by Basile Marquier, University of Sheffield.
#    Dataset: Titanic csv.
#    Resource: Summarising categorical data in R.

#Open the titanic dataset which is saved as a csv file and call it titanic.  
#If your file is saved as a standard Excel file, save it as a csv file first.  
#You will need to change the command depending on where you have saved the file.
#For example, the dataset has been saved as stcp-Rdataset-Titanic on a memory stick which is the E drive.

# Example with a fixed path (edit the path to match where the file is saved):
# TitanicR <- read.csv("E:\\stcp-Rdataset-Titanic.csv", header = TRUE)

# Pick the csv file interactively and store it as the data frame TitanicR.
# read.csv() is used rather than read.table() because its defaults suit csv
# files: only double quotes act as quote characters (so an apostrophe inside a
# text field does not open a quoted string), '#' is not treated as the start of
# a comment, and rows with missing trailing fields are filled with NA.
TitanicR <- read.csv(file.choose(), header = TRUE)

# Attach the data frame so that its columns can be referred to by name alone,
# e.g. survived instead of TitanicR$survived.
attach(TitanicR)

##Occasionally attach does not work!  If so, use TitanicR$survived instead of survived.


# List the variable (column) names of the data frame.
colnames(TitanicR)

# Show the first six cases (rows).
head(TitanicR, n = 6L)


# R treats numerically coded columns as numbers, so convert the coded
# categorical variables to factors and attach labels to the categories
# (for example, 0 in survived means a person died).
# The general form is:
#   variable <- factor(variable, levels = c(category numbers),
#                      labels = c(category names))
# The columns are taken from TitanicR explicitly so this step does not depend
# on attach() having worked.

# When the csv file begins with a byte-order mark, the name of the first column
# (pclass) is read with stray leading characters, and what those characters
# look like depends on the encoding in use. Find the column by the end of its
# name instead of hard-coding the mangled spelling.
pclass_col <- grep("pclass$", names(TitanicR), value = TRUE)
stopifnot(length(pclass_col) == 1L)

survivedf <- factor(TitanicR$survived,
                    levels = c(0, 1),
                    labels = c("Died", "Survived"))
pclassf <- factor(TitanicR[[pclass_col]],
                  levels = c(1, 2, 3),
                  labels = c("First", "Second", "Third"))
Residencef <- factor(TitanicR$Residence,
                     levels = c(0, 1, 2),
                     labels = c("American", "British", "Other"))
Genderf <- factor(TitanicR$Gender,
                  levels = c(0, 1),
                  labels = c("Male", "Female"))

########  One variable ##################
# Build a frequency table of survived and store it under a name.
SurT <- table(survived)
# Display the frequency table.
SurT
# Display the frequency table with the overall total appended.
addmargins(SurT)

# Convert the frequencies to proportions of the total and display them.
SurProp <- prop.table(SurT)
SurProp

# The same proportions shown to two decimal places.
round(SurProp, 2)

# Percentages rounded to whole numbers.
round(100 * SurProp, 0)

########  Two variables ##################

# Contingency table of frequencies: survival (rows) by class (columns).
cross <- table(survivedf, pclassf)
# The same table with row and column totals added.
addmargins(cross)

# prop.table() turns a contingency table into proportions.
# margin = 1 divides each cell by its row total.
row_prop <- prop.table(cross, margin = 1)
row_prop

# margin = 2 divides each cell by its column total.
col_prop <- prop.table(cross, margin = 2)
col_prop

# Survival within each class is of interest, so the column proportions are the
# ones needed here. Show them to two decimal places.
round(col_prop, 2)

# Column percentages rounded to whole numbers.
round(100 * col_prop, 0)

# Row percentages rounded to whole numbers (not needed here).
round(100 * row_prop, 0)

########  Bar charts ##################

# Draw one chart per plotting window.
par(mfrow = c(1, 1))

# One colour per row of cross (the survival categories), shared by all charts.
survival_cols <- c("darkblue", "lightcyan")

# Column percentages of cross, used by the two percentage charts below.
column_pct <- prop.table(cross, 2) * 100

# Stacked bar chart of frequencies with colours and a legend.
# The legend labels are passed with the full argument name legend.text rather
# than relying on partial matching of 'legend'.
barplot(cross,
        xlab = "Class", ylab = "Frequency", main = "Survival by class",
        col = survival_cols,
        legend.text = rownames(cross), args.legend = list(x = "topleft"))

# Open the help page for more information on bar chart options.
?barplot

# Stacked bar chart of column percentages with colours and a legend.
barplot(column_pct,
        xlab = "Class", ylab = "Percentages",
        main = "Percentage survival by class",
        col = survival_cols,
        legend.text = rownames(cross), args.legend = list(x = "topleft"))

# Alternatively, use a clustered bar chart: beside = TRUE draws the categories
# side by side instead of stacking them.
barplot(column_pct,
        xlab = "Class", ylab = "Percentages",
        main = "Percentage survival by class",
        beside = TRUE, col = survival_cols,
        legend.text = rownames(cross), args.legend = list(x = "topleft"))