library(Stat2Data) library(ggplot2) library(MASS) #------------------------------------------------------------------------ #Example 1: Can carapace length predict egg clutch size in turtles? #Get data X <-c(284,290,290,290,298,299,302,306,306,309,310,311,317,317,320,323,334,334) Y <-c(3,2,7,7,11,12,10,8,8,9,10,13,7,9,6,13,2,8) Turtle <- data.frame("Length"=X, "Clutch"=Y) #Check linearity/relationship plot(Clutch~Length, data=Turtle) #Non-linearity appears present lm1 <- lm(Clutch~Length, data=Turtle) plot(fitted(lm1), residuals(lm1)); abline(h=0, lty=2, col="green") #Test Assumptions #Independence #depends on sample design and taken to be valid here #Normality hist(Turtle$Clutch, col="green") #good enough #Linear Regression lm1.1 <- lm(Clutch~Length, data=Turtle) summary(lm1.1) #Quadratic Regression Turtle$Length2 <-(Turtle$Length)^2 lm1.2 <- lm(Clutch~Length + Length2, data=Turtle) summary(lm1.2) #Cubic Regression Turtle$Length3 <-(Turtle$Length)^3 lm1.3 <- lm(Clutch~Length + Length2 + Length3, data=Turtle) summary(lm1.3) #Model Comparison AIC(lm1.1); AIC(lm1.2); AIC(lm1.3) #Plots ggplot(data=Turtle, aes(x=Length, y=Clutch))+ geom_point()+ stat_smooth(method='lm', formula= y ~ x, size=2, col="green") ggplot(data=Turtle, aes(x=Length, y=Clutch))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,2), size=2, col="green") #best fit ggplot(data=Turtle, aes(x=Length, y=Clutch))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,3), size=2, col="green") #------------------------------------------------------------------------ #Example 2: Can body weight predict backpack weight in in college students? data(Backpack) #Check linearity/relationship plot(BackpackWeight~BodyWeight, data=Backpack) #Non-linearity unclear lm2 <- lm(BackpackWeight~BodyWeight, data=Backpack) plot(fitted(lm2), residuals(lm2)); abline(h=0, lty=2, col="orange") #Test Assumptions #Independence #depends on sample design and taken to be valid here #Normality hist(Backpack$BackpackWeight, col="orange") #good enough #Linear Regression lm2.1 <- lm(BackpackWeight~BodyWeight, data=Backpack) summary(lm2.1) #Quadratic Regression Backpack$BodyWeight2 <-(Backpack$BodyWeight)^2 lm2.2 <- lm(BackpackWeight~BodyWeight + BodyWeight2, data=Backpack) summary(lm2.2) #Cubic Regression Backpack$BodyWeight3 <-(Backpack$BodyWeight)^3 lm2.3 <- lm(BackpackWeight~BodyWeight + BodyWeight2 + BodyWeight3, data=Backpack) summary(lm2.3) #Model Comparison AIC(lm2.1); AIC(lm2.2); AIC(lm2.3) #Plots ggplot(data=Backpack, aes(x=BodyWeight, y=BackpackWeight))+ geom_point()+ stat_smooth(method='lm', formula= y ~ x, size=2, col="orange") ggplot(data=Backpack, aes(x=BodyWeight, y=BackpackWeight))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,2), size=2, col="orange") #best fit ggplot(data=Backpack, aes(x=BodyWeight, y=BackpackWeight))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,3), size=2, col="orange") #------------------------------------------------------------------------ #Example 3: Can the percent of lower status population predict median value of homes in Boston? #Get data data(Boston) #Check linearity/relationship plot(medv~lstat, data=Boston) #Non-linearity seems clear lm3 <- lm(medv~lstat, data=Boston) plot(fitted(lm3), residuals(lm3)); abline(h=0, lty=2, col="blue") #Test Assumptions #Independence #depends on sample design and taken to be valid here #Normality hist(Boston$medv, col="blue") #good enough #Linear Regression lm3.1 <- lm(medv~lstat, data=Boston) summary(lm3.1) #Quadratic Regression Boston$lstat2 <-(Boston$lstat)^2 lm3.2 <- lm(medv~lstat + lstat2, data=Boston) summary(lm3.2) #Cubic Regression Boston$lstat3 <-(Boston$lstat)^3 lm3.3 <- lm(medv~lstat + lstat2 + lstat3, data=Boston) summary(lm3.3) #Model Comparison AIC(lm3.1); AIC(lm3.2); AIC(lm3.3) #Plots ggplot(data=Boston, aes(x=lstat, y=medv))+ geom_point()+ stat_smooth(method='lm', formula= y ~ x, size=2, col="blue") ggplot(data=Boston, aes(x=lstat, y=medv))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,2), size=2, col="blue") ggplot(data=Boston, aes(x=lstat, y=medv))+ geom_point()+ stat_smooth(method='lm', formula= y ~ poly(x,3), size=2, col="blue") #best fit, justify?