#SIMPLE LINEAR REGRESSION IN R SCRIPT FILE.
#Associated files: Birthweight.csv and Simple linear regression in R worksheet.
##########################################################.

#STATSTUTOR COMMUNITY PROJECT.
#Sofia Maria Karadimitriou and Ellen Marshall, University of Sheffield.
#Reviewer: Jim Bull, University of Swansea.
#######################################################.


#Open the birthweight reduced dataset, which is saved as a csv file, and call it birthweightR.
#If your file is saved as a standard Excel file, save it as a csv file first.
#You will need to change the path in the command depending on where you have saved the file.
#header=TRUE tells R that the first row of the file holds the column names.
#TRUE is written in full because the shorthand T is an ordinary variable that can be overwritten.
birthweightR <- read.csv("D:\\Birthweight reduced.csv", header = TRUE)

#Tell R we are using the birthweight dataset until further notice using attach.
#This puts the columns of birthweightR on R's search path, which means that
#'Gestation' can be used instead of birthweightR$Gestation.
attach(birthweightR)

#Draw a scatterplot to see how two continuous variables relate to each other.
#Gestational age goes on the horizontal axis and birthweight on the vertical axis.
plot(
  x = birthweightR$Gestation,
  y = birthweightR$Birthweight,
  main = "Scatterplot of gestational age and birthweight",
  xlab = "Gestational age at birth (weeks)",
  ylab = "Weight of baby at birth(lbs)"
)
#Overlay the fitted straight line from lm(Dependent~Independent) in red, drawn at double width.
abline(
  reg = lm(Birthweight ~ Gestation, data = birthweightR),
  col = "red",
  lwd = 2
)


#Pearson's correlation coefficient measures the strength of the linear relationship
#between gestational age and birthweight.
cor(
  x = birthweightR$Gestation,
  y = birthweightR$Birthweight,
  method = "pearson"
)

#Fit the regression model using the lm(Dependent~Independent) command and give it a name (reg1).
#data=birthweightR makes lm take both variables from the data frame itself, so the fit
#does not depend on which objects happen to be on R's search path.
reg1 <- lm(Birthweight ~ Gestation, data = birthweightR)
#Request the regression output.
summary(reg1)


#To check the assumptions using plots, first tell R you want 2 plots next to each other.
#par() hands back the previous settings; they are kept so the layout can be restored afterwards.
old_par <- par(mfrow = c(1, 2))
#First produce a histogram of standardised residuals to check the assumption of normality.
#rstandard() gives the standardised residuals; resid() would give the raw residuals,
#which would not match the axis label.
hist(
  rstandard(reg1),
  main = "Histogram of residuals",
  xlab = "Standardised residuals",
  ylab = "Frequency"
)
#plot(reg1) gives a set of plots to check assumptions.
#Pick individual plots using plot(reg1, which= ... ).
#which=1 gives the predicted (fitted) values vs residuals.
#Fitted values and residuals plot to check the assumption of homoscedasticity.
plot(reg1, which = 1)
#Put the plotting layout back to how it was before this section.
par(old_par)

#------------------------------------------------------------------------------------.
#################### Extra checks ############.
#########  see Further regression resource #####.
##### Durbin Watson ############.

#The Durbin Watson statistic for autocorrelation is provided by the package car, so load it first.
library(car)
#If library(car) does not work, go to Packages --> Install package(s) and select the UK (London) CRAN mirror.
#Then look for the package 'car' and click.  A lot of extra menus will download. Then try library(car) again.
#Note: The package car is available in Rstudio via Tools --> Install packages but some versions do not recognise the commands.
#You can download and run using the basic free R package though.
#Run the Durbin Watson test on the fitted model (dwt is car's short name for durbinWatsonTest).
durbinWatsonTest(reg1)
#A p-value above 0.05 gives no evidence of autocorrelation (where subsequent observations are related).

######## Influential observations ################.
#If you want to investigate whether there are any influential observations, produce the following charts.
#First tell R you want two charts in one window, one above the other.
#par() hands back the previous settings; they are kept so the layout can be restored afterwards.
old_par <- par(mfrow = c(2, 1))

#To produce a bar chart of Cook's distance for each individual.
plot(reg1, which = 4)
#By default R labels the three observations with the largest Cook's distance.
#A common rule of thumb treats Cook's distance > 4/n as large, where n = number of observations.

#To produce a scatterplot of Leverage Values against standardised residuals.
plot(reg1, which = 5)
#Leverage values greater than 3 times (k + 1)/n are considered large, where k = number of independent variables.

#Put the plotting layout back to how it was before this section.
par(old_par)

#--------------------------------------------------------------------------.