# LOGISTIC REGRESSION in R SCRIPT FILE.
# www_statstutor_ac_uk Community Project.
# Sofia Maria Karadimitriou and Ellen Marshall, Sheffield University.
# Reviewed by Jim Bull, University of Swansea.
# Dataset: Titanic csv.
# Resource: LOGISTIC REGRESSION in R.
#Open the titanic dataset which is saved as a csv file and call it titanic.
#If your file is saved as a standard Excel file, save it as a csv file first.
#You will need to change the command depending on where you have saved the file.
#For example, the dataset has been saved as stcp-Rdataset-Titanic on a memory stick which is the D drive.
#TitanicR<-data.frame(read.csv("E:\\stcp-Rdataset-Titanic.csv",header=T,sep=","))
TitanicR<-read.table(file.choose(),sep=",", header=T)
#Tell R we are using the titanic dataset until further notice using attach.
#This means that variable names such as survived can be used instead of Titanic$survived.
attach(TitanicR)
##Occasionally this does not work! If so use Titanic$survived
#what are the names of the variables
names(TitanicR)
## look at the first six cases
head(TitanicR)
#R assumes all numeric values are continuous so tell it that ‘survived’ and ‘class’ are factors.
#and attach labels to the categories (for example 0 in survived means a person died).
#The factor command uses variable<-factor(variable,c(category numbers),labels=c(category names)).
survivedf<-factor(survived,c(0,1),labels=c('Died','Survived'))
pclassf<-factor(ï..pclass,c(1,2,3),labels=c('First','Second','Third'))
Residencef<-factor(Residence,levels=c(0,1,2),labels=c('American','British','Other'))
Genderf<-factor(Gender,levels=c(0,1),labels=c('Male','Female'))
###############Logistic Regression###############
#Fit the regression model using the glm(dependent~Independent,family=binomial) command and give it a name(fit).
#I() is used to define the reference category
fit<-glm(I(survivedf=='Survived')~I(pclassf=='First')+I(pclassf=='Second')+Residencef+I(Genderf=='Male'),family=binomial)
#Request the regression output.
summary(fit)
#getting the exponential values of the coefficients
exp(coef(fit))
#building confidence intervals for the coefficients
confint.default(fit)
#Extracting the Odds Ratio and the 95% CI. i.e. the odds of survival for class 1 (pclass = 1)
#over the odds of survival for 3rd class (pclass = 3) is exp(1.631) = 5.10
exp(cbind(OR = coef(fit), confint(fit)))
#Calculate how many of the cases where well classfied
table(survivedf,predict(fit)>0)
8#78% of the cases were correctly classified.
#Fitting the reduced model
fit2<-glm(I(survivedf=='Survived')~I(pclassf=='First')+I(pclassf=='Second')+I(Genderf=='Male'),family=binomial)
#Comparing the models via Chi squared test (Th equivalent F-test for linear regression)
anova(fit2,fit,test='Chi')
#Fitting the intercept model to check whether the selected model is better than the null model
fitnull<-glm(I(survivedf==1)~1,family=binomial)
#Comparing them
anova(fitnull,fit2,test='Chi')
#Deriving the coefficiens of the final model
coef(fit2)
#To check the assumption of independence using plots, first tell R to produce a scatterplot
#of pearson standardised residuals vs predicted values to check the assumption of independence
f2 <- fitted(fit2) # the fitted probability of y=1, for each observation
plot( residuals(fit2, "pearson"), (1-f2)/sqrt(f2*(1-f2)))
abline(0,1) # they match