# CHI SQUARED in R SCRIPT FILE.
# www_statstutor_ac_uk Community Project.
# Sofia Maria Karadimitriou and Ellen Marshall, Sheffield University.
# Reviewed by Basile Marquier, University of Sheffield.
# Dataset: Titanic csv.
# Resource: Chi Squared in R.
#Open the Titanic dataset, which is saved as a csv file, and call it TitanicR.
#If your file is saved as a standard Excel file, save it as a csv file first.
#You will need to change the command depending on where you have saved the file.
#For example, the dataset has been saved as stcp-Rdataset-Titanic on a memory stick which is the D drive.
# Read the csv file into a data frame called TitanicR (read.csv already returns
# a data frame, so no extra data.frame() call is needed).
# fileEncoding = "UTF-8-BOM" strips the byte-order mark that can sit at the very
# start of a csv file. Without it the first column name is read as a garbled,
# platform-dependent name (for example ï..pclass) instead of pclass.
TitanicR <- read.csv("D:\\stcp-Rdataset-Titanic.csv", header = TRUE, sep = ",",
                     fileEncoding = "UTF-8-BOM")
# Tell R we are using the TitanicR dataset until further notice using attach.
# This means that variable names such as survived can be used instead of
# TitanicR$survived.
attach(TitanicR)
# R assumes all numeric values are continuous, so tell it that the coded
# variables are factors and attach labels to the categories (for example 0 in
# survived means a person died).
# The factor command uses
#   variable <- factor(variable, levels = c(category numbers), labels = c(category names)).
# Each assignment creates a new labelled variable in the workspace which is
# found before the attached column of the same name; the columns inside
# TitanicR itself are left unchanged.
survived <- factor(survived, levels = c(0, 1), labels = c('Died', 'Survived'))
pclass <- factor(pclass, levels = c(1, 2, 3), labels = c('First', 'Second', 'Third'))
Residence <- factor(Residence, levels = c(0, 1, 2), labels = c('American', 'British', 'Other'))
Gender <- factor(Gender, levels = c(0, 1), labels = c('Male', 'Female'))
# Contingency table of frequencies: survival (rows) by residence (columns).
cross <- table(survived, Residence)
# Show the same table with row and column totals added.
addmargins(cross)
# We are interested in survival within each residence group, so we need column
# percentages. The 2 makes each column add up to 100; change it to 1 for row
# percentages.
100 * prop.table(cross, 2)
# The same column percentages rounded to whole numbers.
round(100 * prop.table(cross, 2), digits = 0)
# Overall percentage of people who died and who survived: the row totals
# expressed as a percentage of everyone in the table.
100 * prop.table(margin.table(cross, 1))
# Clustered bar plot of the column percentages with colours and a legend.
# beside = TRUE draws the bars side by side rather than stacked; legend.text
# labels the bars with the row names of the table (Died / Survived).
barplot(100 * prop.table(cross, 2),
        xlab = 'Nationality', ylab = 'Percentages',
        main = "Percentage survival by nationality",
        beside = TRUE, col = c("gray", "black"),
        legend.text = rownames(cross), args.legend = list(x = "topleft"))
# Chi-squared test of association between survival and residence.
# chisq.test() and fisher.test() belong to the stats package, which is
# available in every R session. The MASS library loaded below is kept from the
# original script but is not needed for these two commands.
# NOTE(review): remove the library(MASS) line if nothing else in the session uses it.
library(MASS)
# Run the test on the contingency table and store the output as 'result'.
result <- chisq.test(table(survived, Residence))
# Display the test statistic, degrees of freedom and p-value.
result
# Display the expected counts so the assumptions of the test can be checked.
result[["expected"]]
# If the assumptions about the expected counts are not met, use Fisher's Exact
# Test instead:
fisher.test(table(survived, Residence))
#######################################################
# Chi-squared test for 2x2 tables.
# A 2x2 table arises when both variables have two groups, e.g. survived and
# Gender.
# Contingency table of frequencies: survival (rows) by gender (columns).
cross2 <- table(survived, Gender)
# The same table with row and column totals added.
addmargins(cross2)
# Column percentages (survival within each gender) rounded to whole numbers.
round(100 * prop.table(cross2, 2), digits = 0)
# Yates' continuity correction is applied for 2x2 tables (correct = TRUE).
# The corrected test can be quite conservative, so Fisher's Exact Test is
# often preferred.
chisq.test(table(survived, Gender), correct = TRUE)