library(Stat2Data)
library(ggplot2)

#------------------------------------------------------------------------
# Example 1: Can age and height predict sales price in horses?

# Get data
data(HorsePrices)

# Test Assumptions

# Independence
# depends on sample design and taken to be valid here

# Normality
# NOTE(review): these histograms inspect the response variable; the
# regression assumption formally concerns the model residuals, so treat
# this as a rough screen only.
hist(HorsePrices$Price, col = "brown")        # not great
hist(log(HorsePrices$Price), col = "brown")   # not much better
hist(sqrt(HorsePrices$Price), col = "brown")  # good enough
HorsePrices$sPrice <- sqrt(HorsePrices$Price)

# Multicollinearity
# The predictors are selected by name rather than by position (was [, 3:4])
# so the check keeps working if the column order changes.
# Assumes columns 3 and 4 of HorsePrices are Age and Height, i.e. the
# predictors used in the model below -- TODO confirm against the dataset.
# na.omit() drops rows with a missing value in either predictor.
round(cor(na.omit(HorsePrices[, c("Age", "Height")])), 2)  # not correlated

# Linear Relationship and Homoscedasticity
plot(sPrice ~ Age, data = HorsePrices)     # Non-linearity present
plot(sPrice ~ Height, data = HorsePrices)  # Heteroscedasticity present

# Fit the model once; it is used for both the residual diagnostic plot
# and the regression summary.
lm1 <- lm(sPrice ~ Age + Height, data = HorsePrices)

# Residuals vs fitted values, with a dashed reference line at zero
plot(fitted(lm1), residuals(lm1))
abline(h = 0, lty = 2)
# problematic

# Multiple Linear Regression
summary(lm1)
# Variable   Beta   t-value   p-value
# Age        -6.0    -4.6     <0.0001
# Height     43.4     8.8     <0.0001
# do not trust because assumptions are not met

#------------------------------------------------------------------------
# Example 2: Can skull length, bill depth, and bill length predict overall body mass in blue jays?
# Get data
data(BlueJays)

# Test Assumptions

# Independence
# depends on sample design and taken to be valid here

# Normality
# NOTE(review): this histogram inspects the response variable; the
# regression assumption formally concerns the model residuals.
hist(BlueJays$Mass, col = "blue")  # good

# Multicollinearity
# Subset the predictors by name so the correlation matrix is labelled
# Skull / BillDepth / BillLength instead of BlueJays.Skull etc.
Jay_Xvars <- BlueJays[, c("Skull", "BillDepth", "BillLength")]
round(cor(Jay_Xvars), 2)  # not highly correlated

# Linear Relationship and Homoscedasticity
plot(Mass ~ Skull, data = BlueJays)       # looks good
plot(Mass ~ BillDepth, data = BlueJays)   # looks good
plot(Mass ~ BillLength, data = BlueJays)  # looks good

# Fit the model once; it is used for both the residual diagnostic plot
# and the regression summary.
lm2 <- lm(Mass ~ Skull + BillDepth + BillLength, data = BlueJays)

# Residuals vs fitted values, with a dashed reference line at zero
plot(fitted(lm2), residuals(lm2))
abline(h = 0, lty = 2)
# looks good

# Multiple Linear Regression
summary(lm2)
# Variable     Beta   t-value   p-value
# Skull         2.4     6.8     <0.0001
# BillDepth     2.8     2.8      0.0063
# BillLength    0.9     2.9      0.0051

#------------------------------------------------------------------------
# Example 3: Can SAT score, ACT score, adjusted GPA, and HS class size predict math placement exam scores in college students?
# Get data
data(MathPlacement)

# Test Assumptions

# Independence
# depends on sample design and taken to be valid here

# Normality
# NOTE(review): this histogram inspects the response variable; the
# regression assumption formally concerns the model residuals.
hist(MathPlacement$PlcmtScore, col = "green")  # looks good

# Multicollinearity
# Subset columns by name so the correlation matrices carry the plain
# column names; na.omit() drops rows with any missing value.
Math_Xvars <- MathPlacement[, c("SATM", "ACTM", "GPAadj", "Size")]
round(cor(na.omit(Math_Xvars)), 2)  # SAT and ACT highly correlated

# Compare each of the two collinear predictors against the response to
# decide which one to keep.
Math_vars2 <- MathPlacement[, c("PlcmtScore", "SATM", "ACTM")]
round(cor(na.omit(Math_vars2)), 2)  # ACT more highly correlated with Placement Score
# Keep ACT
# Drop SAT

# Linear Relationship and Homoscedasticity
plot(PlcmtScore ~ ACTM, data = MathPlacement)    # good
plot(PlcmtScore ~ GPAadj, data = MathPlacement)  # good enough
plot(PlcmtScore ~ Size, data = MathPlacement)    # good

# Fit the model once; it is used for both the residual diagnostic plot
# and the regression summary.
lm3 <- lm(PlcmtScore ~ ACTM + GPAadj + Size, data = MathPlacement)

# Residuals vs fitted values, with a dashed reference line at zero
plot(fitted(lm3), residuals(lm3))
abline(h = 0, lty = 2)
# some heteroscedasticity

# Multiple Linear Regression
summary(lm3)
# Variable   Beta   t-value   p-value
# ACT         1.6    67.7     <0.0001
# GPA         1.1    56.1     <0.0001
# Size       <0.1    <0.1      0.9950
# take these results with a grain of salt because the assumptions are
# only approximately met