# SIMPLE LINEAR REGRESSION IN R SCRIPT FILE
# Associated files: Birthweight.csv and Simple linear regression in R worksheet
##########################################################
# STATSTUTOR COMMUNITY PROJECT
# Sofia Maria Karadimitriou and Ellen Marshall, University of Sheffield
# Reviewer: Jim Bull, University of Swansea
#######################################################

# Open the birthweight reduced dataset, which is saved as a csv file, and
# call it birthweightR.
# If your file is saved as a standard Excel file, save it as a csv file first.
# You will need to change the path depending on where you have saved the file.
birthweightR <- read.csv("D:\\Birthweight reduced.csv", header = TRUE)

# attach() puts the columns of birthweightR on the search path, so that
# 'Gestation' can be typed instead of birthweightR$Gestation.
# It is kept so that commands using the bare column names still work; the
# commands below name the dataset explicitly and do not depend on it.
attach(birthweightR)

# A scatterplot shows the relationship between two continuous variables.
plot(
  Birthweight ~ Gestation,
  data = birthweightR,
  main = "Scatterplot of gestational age and birthweight",
  xlab = "Gestational age at birth (weeks)",
  ylab = "Weight of baby at birth(lbs)"
)

# Add a regression line to the plot using the lm(Dependent ~ Independent)
# command.
abline(lm(Birthweight ~ Gestation, data = birthweightR), col = "red", lwd = 2)

# Pearson's correlation coefficient gives a measure of the strength of a
# linear relationship.
cor(birthweightR$Gestation, birthweightR$Birthweight)

# Fit the regression model using the lm(Dependent ~ Independent) command and
# give it a name (reg1).
reg1 <- lm(Birthweight ~ Gestation, data = birthweightR)

# Request the regression output.
summary(reg1)

# To check the assumptions using plots, first tell R you want 2 plots next to
# each other. The histogram fills the first panel; the next plot command
# fills the second.
par(mfrow = c(1, 2))

# First produce a histogram of standardised residuals to check the assumption
# of normality. rstandard() returns the standardised residuals; resid() would
# return the raw residuals, which would not match the axis label.
hist(
  rstandard(reg1),
  main = "Histogram of residuals",
  xlab = "Standardised residuals",
  ylab = "Frequency"
)

# plot(reg1) gives a set of plots to check assumptions.
# Check the assumptions using plot(reg1, which = ...).
# which = 1 gives the predicted (fitted) values vs residuals.
# Fitted values and residuals plot to check the assumption of
# homoscedasticity.
plot(reg1, which = 1)

#------------------------------------------------------------------------------
#################### Extra checks ############
######### see Further regression resource #####

##### Durbin Watson ############
# If you wish to calculate the Durbin Watson statistic for autocorrelation
# you must load the package car.
library(car)
# If this command does not work, you will need to go to
# Packages --> Install package(s) and select the UK (London) CRAN mirror.
# Then look for the package 'car' and click. A lot of extra packages will
# download. Then try library(car) again.
# Note: the package car is available in RStudio via Tools --> Install packages
# but some versions do not recognise the commands.
# You can download and run using the basic free R package though.

# Request the Durbin Watson test.
dwt(reg1)
# If there is no autocorrelation (where subsequent observations are related),
# the p-value will be above 0.05.

######## Influential observations ################
# If you want to investigate if there are any influential observations,
# produce the following charts.
# First tell R you want two charts in one window (one above the other).
par(mfrow = c(2, 1))

# To produce a bar chart of Cook's distance for each individual.
plot(reg1, which = 4)
# R labels the observations with the largest Cook's distances (three by
# default). A common rule of thumb treats observations with Cook's distance
# above 4/n as influential, where n = number of observations.

# To produce a scatterplot of leverage values against standardised residuals.
plot(reg1, which = 5)
# Leverage values greater than 3 * (k + 1) / n are considered large, where
# k = number of independent variables and n = number of observations.

# Return to R's default single-plot layout so that later plots are not drawn
# into the two-panel window.
par(mfrow = c(1, 1))
#--------------------------------------------------------------------------