#Model Gauntlet in R Code: #Packages: you'll need to install.packages("package.name") if you do not already have a package library(MASS) library(gmodels) library(PairedData) library(dplyr) library(ggplot2) library(lme4) library(nlme) library(agricolae) library(tidyverse) library(multcomp) data() #shows available datasets #************************************************************************************************************ #One-sample t-tests #1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles less than 25? t.test(x=mtcars$mpg, mu=25, alternative="less") #sig. #2) Is the average quarter mile time in seconds (qsec) of 1975 Motor Trend vehicles greater than 17.5? t.test(x=mtcars$qsec, mu=17.5, alternative="greater") #not sig #3) Is the average weight in 1000 lbs (wt) of 1975 Motor Trend vehicles different than 3? t.test(x=mtcars$wt, mu=3, alternative="two.sided") #not sig. #4) Is the log baseline number of seizures (lbase) for the placebo group of eptilepticss different than 0? t.test(x=epil$lbase[epil$trt=='placebo'], mu=0, alternative="two.sided") #not sig. #5) Is the average yield of barley for 1931 greater than 100? t.test(x=immer$Y1, mu=100, alternative = "greater") # sig. #6) Is the average yield of barely for 1932 different than 1931's (109)? t.test(x=immer$Y2, mu=109, alternative = "two.sided") # sig. #************************************************************************************************************ #Two-sample t-tests #Function to create two-sample barcharts #numset & catset must be in dataset$variable format; cat1, cat2 must be format they come in in the dataset; #numlab, & catlab must be strings; barcols must be list of cols barchart.2sample <- function(numset, catset, cat1, cat2, numlab, catlab, barcols) { c1.cat <-numset[catset==cat1] c2.cat <-numset[catset==cat2] c1.mean <-mean(c1.cat) c2.mean <-mean(c2.cat) ll<-c(ci(c1.cat)[2], ci(c2.cat)[2]) ul<-c(ci(c1.cat)[3], ci(c2.cat)[3]) x<-c(1,2) y<-c(c1.mean,c2.mean) x<-barplot(y) ymax<-ul[2] if (ul[1]>ul[2]){ ymax<-ul[1] } ymin<-ll[2] if (ll[1]0){ ymin<-0 } barplot(y, xlab=catlab, ylab=numlab, ylim=c(ymin*1.10,ymax*1.10), col=barcols, axes=F) arrows(x,ul,x,ll, length=0.05, angle=90, code=3) axis(1, at=x, labels=c(cat1,cat2)) axis(2) box() clip(0.1,2.49,-100, 100) abline(h=0) } #1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles different between v-shaped (0) and # straight (1) engines (vs)? t.test(mtcars$mpg~mtcars$vs, alternative="two.sided") #sig. barchart.2sample(mtcars$mpg, mtcars$vs, 0, 1, "MPG", "engine type", c("red","blue")) #2) Is the average quarter mile time in seconds (qsec) of 1975 Motor Trend vehicles greater in automatic (0) compared to # manual (1) transmissions (am)? t.test(mtcars$qsec~mtcars$am, alternative="greater") #not sig barchart.2sample(mtcars$qsec, mtcars$am, 0, 1, "Quarter mile time (seconds)", "transmission type", c("purple","orange")) #3) Is the average tooth length of guinea pigs (len) different between orange juice (OJ) and ascorbic acid (VC) supplement of # vitamin C (supp)? t.test(ToothGrowth$len~ToothGrowth$supp, alternative="two.sided") #not sig barchart.2sample(ToothGrowth$len, ToothGrowth$supp, "OJ", "VC", "Tooth length", "supplement type", c("orange","green")) #4) Is the average weight (weight) lower in linseed fed chicks compared to sunflower fed ones (feed)? chickwts2 <- chickwts[which(chickwts$feed=='sunflower'|chickwts$feed=='linseed'),] t.test(chickwts2$weight~chickwts2$feed, alternative='less') #sig. barchart.2sample(chickwts2$weight,chickwts2$feed,"sunflower","linseed", "weight", "feed type", c('yellow','brown')) #5) Is the change in hours of sleep (extra) different between two groups of students (group)? t.test(sleep$extra~sleep$group, alterative='two.sided') #not sig. barchart.2sample(sleep$extra,sleep$group, 1, 2, "change in sleep (hours)", "group", c("blue","green")) #6) Is the number of warp breaks per loom (breaks) different between two types of wool (wool)? t.test(warpbreaks$breaks~warpbreaks$wool, alternative='two.sided') #not sig. barchart.2sample(warpbreaks$breaks,warpbreaks$wool, "A", "B", "warp breaks per loom", "wool type", c("grey","white")) #************************************************************************************************************ #Paired t-tests #1) Is the change in hours of sleep (extra) different between students before (1) and after (2) a drug (group)? #NOTE: this is the same dataset used as for the two-sample t-test, but correctly should be paired t.test(sleep$extra~sleep$group, alterative='two.sided', paired=T) #sig. interaction.plot(sleep$group, sleep$ID, sleep$extra, ylab="change in sleep (hours)", xlab="Drug (1=before, 2=after)", col=c(1:10), legend=F) #2) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre) # for the Control group? data(anorexia) anorexia.control <- data.frame("ID"=c(1:26, 1:26), "Period"=c(rep("Pre",times=26), rep("Post",times=26)), "Period_code"=c(rep(1,times=26),rep(2,times=26)), "Weight"=c(anorexia$Prewt[anorexia$Treat=="Cont"],anorexia$Postwt[anorexia$Treat=="Cont"])) t.test(anorexia.control$Weight~anorexia.control$Period, alternative="greater", paired=T) #not sign. interaction.plot(anorexia.control$Period_code, anorexia.control$ID, anorexia.control$Weight, ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:26), legend=F) #3) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre) # for the Cognitive Behavioral Treatment group? anorexia.CBT <- data.frame("ID"=c(1:29, 1:29), "Period"=c(rep("Pre",times=29), rep("Post",times=29)), "Period_code"=c(rep(1,times=29),rep(2,times=29)), "Weight"=c(anorexia$Prewt[anorexia$Treat=="CBT"],anorexia$Postwt[anorexia$Treat=="CBT"])) t.test(anorexia.CBT$Weight~anorexia.CBT$Period, alternative="greater", paired=T) #sign. interaction.plot(anorexia.CBT$Period_code, anorexia.CBT$ID, anorexia.CBT$Weight, ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:29), legend=F) #4) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre) # for the Family Treatment group? anorexia.FT <- data.frame("ID"=c(1:17, 1:17), "Period"=c(rep("Pre",times=17), rep("Post",times=17)), "Period_code"=c(rep(1,times=17),rep(2,times=17)), "Weight"=c(anorexia$Prewt[anorexia$Treat=="FT"],anorexia$Postwt[anorexia$Treat=="FT"])) t.test(anorexia.FT$Weight~anorexia.FT$Period, alternative="greater", paired=T) #sign. interaction.plot(anorexia.FT$Period_code, anorexia.FT$ID, anorexia.FT$Weight, ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:17), legend=F) #5) Is the average blink rate (Rate) different between participants steering a pencil along a straight or oscillating # track (Track)? data(Blink) Blink.mod <- data.frame("ID"=c(1:12), "Track"=c(rep("Straight",times=12),rep("Oscillating",times=12)), "Rate"=c(Blink$Straight,Blink$Oscillating)) t.test(Blink.mod$Rate~Blink.mod$Track, alternative="two.sided",paired=T) #sign. interaction.plot(Blink.mod$Track, Blink.mod$ID, Blink.mod$Rate, ylab="Blink rate", xlab="Track type", col=c(1:12), legend=F) #6) Is the average blood lead level (Levels) lower in control children compared to exposed children (Status)? data(BloodLead) BloodLead.mod <-data.frame("ID"=c(1:33), "Status"=c(rep("Control",times=33),rep("Exposed",times=33)), "Levels"=c(BloodLead$Control, BloodLead$Exposed)) t.test(BloodLead.mod$Levels~BloodLead.mod$Status, alternative="less", paired=T) #sign. interaction.plot(BloodLead.mod$Status, BloodLead.mod$ID, BloodLead.mod$Levels, ylab="Blood Lead Levels (mg/dl)", xlab="Exposure Status", col=c(1:33), legend=F) #************************************************************************************************************ #One-way ANOVA #Function for creating 1-way ANOVA barcharts #df is the dataset, cat_var is the categorical variable, num_var is the numerical variable barchart.anova <- function(df, cat_var, num_var) { cat_var <- enquo(cat_var) num_var <- enquo(num_var) df_sum<-df %>% group_by(!!cat_var) %>% summarise(mean=mean(!!num_var), sd = sd(!!num_var), error = qt(0.975,df=n()-1)*sd/sqrt(n()), ul = mean + error, ll = mean - error) #print(df_sum) df_plot <<-ggplot(df_sum, aes(!!cat_var, mean, fill=!!cat_var))+ geom_bar(stat="identity", color="black")+ geom_hline(yintercept = 0) + geom_errorbar(aes(ymin=ll, ymax=ul), width=0.1) + theme_classic() + theme(legend.position="none") } #1) Is the average sepal length different across three species of iris (Species)? aov1<-aov(iris$Sepal.Length~iris$Species) summary(aov1) #sign. aov1.mcp <-TukeyHSD(aov1) plot(aov1.mcp, xlim=c(0,2)) #all comparisons sigm. barchart.anova(iris, Species, Sepal.Length) df_plot + labs(y="Sepal length (cm", x="Species") + scale_y_continuous(limits=c(0,7),breaks=c(0:7))+ geom_text(aes(label=c('*','*','*')), vjust=-0.8, size=6) #2) Is the average sepal width different across three species of iris (Species)? aov2<-aov(iris$Sepal.Width~iris$Species) summary(aov2) #sign. aov2.mcp <-TukeyHSD(aov2) plot(aov2.mcp) #all comparisons sign. barchart.anova(iris, Species, Sepal.Width) df_plot + labs(y="Sepal width (cm", x="Species") + scale_y_continuous(limits=c(0,4),breaks=c(0:4))+ geom_text(aes(label=c('*','*','*')), vjust=-0.8, size=6) #3) Is the average plant growth (weight) different across treatments (group)? boxplot(PlantGrowth$weight~PlantGrowth$group) aov3<-aov(PlantGrowth$weight~PlantGrowth$group) summary(aov3) #sign. aov3.mcp <-TukeyHSD(aov3) plot(aov3.mcp) #trt1 vs. trt2 significant. barchart.anova(PlantGrowth, group, weight) df_plot + scale_fill_manual(values=c("white","blue","purple")) + geom_text(aes(label=c('*'), x=2.5, y=6.7, size=8)) + geom_segment(aes(x=2,xend=3,y=6.5,yend=6.5)) + labs(y="mean plant weight", x="treatment") + scale_y_continuous(limits=c(0,7),breaks=c(0:7)) #4) Is the average highway MPG (MPG.highway) different across three car manufacturers (Manufacturer)? Cars93_mod <-na.exclude(Cars93[,c(1,3,8,10,11,18)]) Cars93_mod[20,]$Manufacturer <-"Chrysler" Cars93_mod2 <-Cars93_mod[which(Cars93_mod$Manufacturer=='Chevrolet'|Cars93_mod$Manufacturer=='Chrysler'|Cars93_mod$Manufacturer=='Dodge'),] aov4<-aov(Cars93_mod2$MPG.highway~Cars93_mod2$Manufacturer) summary(aov4) #not sign. aov4.mcp <-TukeyHSD(aov4) plot(aov4.mcp) barchart.anova(Cars93_mod2, Manufacturer, MPG.highway) df_plot + scale_fill_manual(values=c("grey","grey","grey")) + labs(y="mean highway MPG", x="Manufacturer") + scale_y_continuous(limits=c(0,35),breaks=c(0,5,10,15,20,25,30,35)) #5) Is the average highway MPG (MPG.highway) different across four types of cars (Type)? Cars93_mod3 <-Cars93_mod[which(Cars93_mod$Type=='Compact'|Cars93_mod$Type=='Midsize'|Cars93_mod$Type=='Small'|Cars93_mod$Type=='Sporty'),] aov5<-aov(Cars93_mod3$MPG.highway~Cars93_mod3$Type) summary(aov5) #sign. aov5.mcp <-TukeyHSD(aov5) par(mar = c(5.1, 6.1, 4.1, 2.1)) plot(aov5.mcp, cex.axis=0.7, las=1) #Small-Compact, Small-Midsize, and Sporty-Small different par(mar = c(5.1, 4.1, 4.1, 2.1)) barchart.anova(Cars93_mod3, Type, MPG.highway) df_plot + scale_fill_manual(values=c("red","blue","white","orange")) + geom_text(aes(label=c('A','A','B','A')), vjust=-2, size=6) + labs(y="mean highway MPG", x="Type") + scale_y_continuous(limits=c(0,40),breaks=c(0,5,10,15,20,25,30,35,40)) #6) Is the average number of insects (count) after spraying different across six types of spray (spray)? aov6 <-aov(InsectSprays$count~InsectSprays$spray) summary(aov6) #sign. aov6.mcp <-TukeyHSD(aov6) plot(aov6.mcp, cex.axis=0.6, las=1) #C-A, D-A, E-A, C-B, D-B, E-B, F-C, F-D, and F-E sign. barchart.anova(InsectSprays, spray, count) df_plot + geom_text(aes(label=c('&','&','#','#','#','&')), vjust=-4, size=6) + labs(y="mean insect count", x="spray type") + scale_y_continuous(limits=c(0,25),breaks=c(0,5,10,15,20,25)) #************************************************************************************************************ #Two-way ANOVA #Function for creating 2-way ANOVA barcharts #df is the dataset, cat1 and cat2 are the categorical variables, num_var is the numerical variable #let_list is a list of letters/symbols to indicate different groups; needs to be in c("A","B",...) format #dodg_n is a number to set how much the barcharts are dodged by (try 0.9 as your default and adjust from there) #v_just is the vertical adjustment of the text from the let_list (try -4 as your default and adjust from there) barchart.2anova <- function(df, cat1, cat2, num_var,let_list,dodg_n,v_just) { cat1 <- enquo(cat1) cat2 <- enquo(cat2) num_var <- enquo(num_var) df_sum<-df %>% group_by(!!cat1,!!cat2) %>% summarise(mean=mean(!!num_var), sd = sd(!!num_var), error = qt(0.975,df=n()-1)*sd/sqrt(n()), ul = mean + error, ll = mean - error) #print(df_sum) df_plot <<-ggplot(df_sum, aes(x=as.factor(!!cat1), y=mean))+ geom_bar(aes(fill=as.factor(!!cat2)),stat="identity", color="black", position=position_dodge()) + geom_hline(yintercept = 0) + geom_text(aes(label=let_list,group=as.factor(!!cat2)), vjust=v_just, size=4, position=position_dodge(dodg_n))+ geom_errorbar(aes(ymin=ll, ymax=ul, group=as.factor(!!cat2)), width=0.1, position=position_dodge(dodg_n)) + theme_classic() } #1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles different across the engine (vs) and # transmission (am) type? tw.aov1<-aov(mtcars$mpg~mtcars$vs*mtcars$am) summary(tw.aov1) #vs and am sign.; interaction not barchart.2sample(mtcars$mpg, mtcars$vs, 0, 1, "MPG", "engine type", c("red","blue")) barchart.2sample(mtcars$mpg, mtcars$am, 0, 1, "MPG", "transmission type", c("purple","orange")) #2) Is the average gross horsepower (hp)) of 1975 Motor Trend vehicles different across the engine (vs) and # transmission (am) type? tw.aov2<-aov(mtcars$hp~mtcars$vs*mtcars$am) summary(tw.aov2) #only vs sign. barchart.2sample(mtcars$hp, mtcars$vs, 0, 1, "horsepower", "engine type", c("lightcoral","lightblue")) #3) Is the average tooth length of guinea pigs (len) different across supplement (supp) and dose (dose)? tw.aov3<-aov(ToothGrowth$len~ToothGrowth$supp*as.factor(ToothGrowth$dose)) summary(tw.aov3) #all sign. tw.aov3.mcp <-TukeyHSD(tw.aov3) par(mar = c(5.1, 6.1, 4.1, 2.1)) plot(tw.aov3.mcp,cex.axis=0.7, las=1) #not sig. VC:1-OJ;0.5 ; OJ:2-OJ:1 ; VC:2-OJ:1 ; VC:2-OJ:2 par(mar = c(5.1, 4.1, 4.1, 2.1)) barchart.2anova(ToothGrowth, supp, dose, len, c("A","B","B","C","A","B"), 0.9,-4) df_plot + labs(y="mean tooth length", x="supplement type", fill="doses") + scale_fill_manual(values=c("slateblue1","slateblue", "slateblue4")) + scale_y_continuous(limits=c(0,40),breaks=c(0,5,10,15,20,25,30,35,40)) #4) Is the number of warp breaks per loom (breaks) different across wool (wool) and tension (tension)? tw.aov4<-aov(warpbreaks$breaks~warpbreaks$wool*warpbreaks$tension) summary(tw.aov4) #tension and interaction significant tw.aov4.mcp <-TukeyHSD(tw.aov4) par(mar = c(5.1, 6.1, 4.1, 2.1)) plot(tw.aov4.mcp,cex.axis=0.7, las=1) #sig. B:L-A:L ; A:M-A:L ; B:M-A:L, A:H-A:L; B:H-A:L par(mar = c(5.1, 4.1, 4.1, 2.1)) barchart.2anova(warpbreaks, tension, wool, breaks, c("&","#","#","#","#","#"), 0.9,-7) df_plot + labs(y="mean break number", x="tension", fill="wool type") + scale_fill_manual(values=c("grey","brown")) + scale_y_continuous(limits=c(0,70),breaks=c(0,10,20,30,40,50,60,70)) #5) Is the average yield for peas different across nitrogen (N) and phosphorus (P) application? tw.aov5<-aov(npk$yield~npk$N*npk$P) summary(tw.aov5) #only nitrogen significant barchart.2sample(npk$yield,npk$N, 0, 1, "mean yield", "nitrogen application", c("white","blue")) #6) Is the average price of diamonds different across cut (cut) and color (color)? diamonds2<-diamonds[,c(2,3,7)] #Only included top 3 cut and color categories diamonds2 <-diamonds2[which(diamonds2$color=="D"|diamonds2$color=="E"|diamonds2$color=="F"),] diamonds2 <-diamonds2[which(diamonds2$cut=="Very Good"|diamonds2$cut=="Premium"|diamonds2$cut=="Ideal"),] diamonds2$color <-factor(diamonds2$color) diamonds2$cut <-factor(diamonds2$cut) tw.aov6<-aov(diamonds2$price~diamonds2$cut*diamonds2$color) summary(tw.aov6) #all significant tw.aov6.mcp <-TukeyHSD(tw.aov6) par(mar = c(5.1, 8.5, 4.1, 2.1)) plot(tw.aov6.mcp,cex.axis=0.5, las=1) #us HSD.test to get groups par(mar = c(5.1, 4.1, 4.1, 2.1)) interact2 <-with(diamonds2,interaction(cut,color)) out.interact2 <-aov(price~interact2, data=diamonds2) HSD.test(out.interact2, "interact2", group=T, console=T) #set B->a, C->b, D->c, A->d, E->e for clarity barchart.2anova(diamonds2, cut, color, price, c("abc","c","a","ab","ab","d","e","e","bc"), 0.9,-2) df_plot + labs(y="Mean Price", x="Cut ($)", fill="Color") + scale_fill_manual(values=c("grey100","gray80", 'gray60')) + scale_y_continuous(limits=c(0,5000),breaks=c(0,1000,2000,3000,4000,5000)) #************************************************************************************************************ #Blocked/Nested ANOVA #1) Is the average yield (Y) of oats per sub-plot different across 3 varieties (V), while controlling for block (B)? data(oats) bl.aov1 <-lmer(Y~V + Error(B),data=oats) summary(bl.aov1) #not sign. barchart.anova(oats, V, Y) df_plot + labs(y="mean yield", x="oat variety") + scale_y_continuous(limits=c(0,150),breaks=c(0,25,50,75,100,125,150)) + scale_fill_manual(values=c("grey","grey", "grey")) #2 Is the average yield for peas different across nitrogen (N) and potassium (K) application, while controlling for block? bl.aov2 <-aov(yield~N*K + Error(block), data=npk) summary(bl.aov2) #N and K significant, but interaction not barchart.2sample(npk$yield,npk$N, 0, 1, "mean yield", "nitrogen application", c("white","blue")) barchart.2sample(npk$yield,npk$K, 0, 1, "mean yield", "potassium application", c("white","red")) #3) Is the decrease in honeybees (decrease) different across spray type (treatments), while controlling for row (rowpos) # and column (colpos)? bl.aov3 <-aov(decrease~treatment + Error(colpos/rowpos), data=OrchardSprays) summary(bl.aov3) #sign. bl.aov3b <-lme(decrease~treatment, random=~1| colpos/rowpos, data=OrchardSprays) bl.ph3 <-glht(bl.aov3b, linfct=mcp(treatment="Tukey")) plot(bl.ph3) barchart.anova(OrchardSprays, treatment, decrease) df_plot + geom_text(aes(label=c('&','&','&#','&#','#','#','#','#')), vjust=-4.5, size=6) + labs(y="mean honeybee decrease", x="spray type") + scale_y_continuous(limits=c(0,125),breaks=c(0,25,50,75,100,125)) #4) Is the log-optical density (logDens) different across samples (samples) when nested in block (Block)? ne.aov1 <-aov(logDens~sample + Error(Block/sample), data=Assay) summary(ne.aov1) #looks significant ne.aov1b <-lme(logDens~sample, random=~1| Block/sample, data=Assay) summary(ne.aov1b) #not significant ne.ph1 <-glht(ne.aov1b, linfct=mcp(sample="Tukey")) plot(ne.ph1) #5) Is the number of words recollected (Recall) different across word type (Valance) when nested in subject? #based from https://personality-project.org/r/r.guide/r.anova.html datafilename <-"http://personality-project.org/r/datasets/R.appendix3.data" data.ex1 <-read.table(datafilename,header=T) ne.aov2 <-aov(Recall~Valence + Error(Subject/Valence), data=data.ex1) summary(ne.aov2) #sign. ne.aov2b <-lme(Recall~Valence, random=~1| Subject/Valence, data=data.ex1) ne.ph2 <-glht(ne.aov2b, linfct=mcp(Valence="Tukey")) plot(ne.ph2, cex.axis=0.7) barchart.anova(data.ex1, Valence, Recall) df_plot + geom_text(aes(label=c('A','B','C')), vjust=-3, size=6) + labs(y="mean recalls", x="word type (Valence)") + scale_y_continuous(limits=c(0,50),breaks=c(0,10,20,30,40,50))+ scale_fill_manual(values=c("red","yellow","blue")) #6) Is the number of words recollected (Recall) different across word type (Valance) and (Task) when nested in subject? #based from https://personality-project.org/r/r.guide/r.anova.html datafilename2<-"http://personality-project.org/r/datasets/R.appendix4.data" data.ex2=read.table(datafilename2,header=T) #read the data into a table ne.aov3 <-aov(Recall~(Task*Valence)+Error(Subject/(Task*Valence)),data.ex2) summary(ne.aov3) #not sign. barchart.2anova(data.ex2,Valence,Task,Recall,c("","","","","",""),0.9,0) df_plot + labs(y="mean recalls", x="word type (Valence)", fill="Task") + scale_fill_manual(values=c("grey80","gray50")) + scale_y_continuous(limits=c(0,22),breaks=c(0,5,10,15,20)) #************************************************************************************************************ #Simple Linear Regression #1) Is there a relationship between car speed (speed) and stopping distance (dist)? reg1 <-lm(speed~dist, data=cars) summary(reg1) #sign. summary(cars$dist) pred.frame<-data.frame(dist=2:120) pp<-predict(reg1, newdata=pred.frame,int="p") plot(speed~dist, data=cars, ylim=range(speed,pp), ylab="speed (MPH)", xlab="stopping distance (ft)") legend("bottomright", inset=0.02, legend=c("R-squared= 0.65")) matlines(pred.frame$dist, pp, lty=c(1,2,2), col=c("black")) #2) Is there a relationship between log animal brain weight (brain) and log body weight (body)? data(Animals) Animals$log_brain <-log(Animals$brain) Animals$log_body <-log(Animals$body) reg2 <- lm(log_brain~log_body, data=Animals) summary(reg2) #sign. summary(Animals$log_body) pred.frame<-data.frame(log_body=-3.722:11.374) pp<-predict(reg2, newdata=pred.frame,int="p") plot(log_brain~log_body, data=Animals, ylim=range(log_brain,pp), ylab="log brain weight", xlab="log body weight") legend("bottomright", inset=0.02, legend=c("R-squared= 0.61")) matlines(pred.frame$log_body, pp, lty=c(1,2,2), col=c("black")) #3) Is there a relationship between the area of pore space (area) and perimeter (peri) in petroleum rock samples? data(rock) reg3 <-lm(area~peri, data=rock) summary(reg3) #sign. summary(rock$peri) pred.frame<-data.frame(peri=308.6:4864.2) pp<-predict(reg3, newdata=pred.frame,int="p") plot(area~peri, data=rock, ylim=range(area,pp), ylab="pore space area (pixels)", xlab="perimeter (pixels)") legend("bottomright", inset=0.02, legend=c("R-squared= 0.68")) matlines(pred.frame$peri, pp, lty=c(1,2,2), col=c("black")) #4) Is there a relationship between the area of pore space (area) and permeability (perm) in petroleum rock samples? reg4 <-lm(area~perm, data=rock) summary(reg4) #sign. summary(rock$perm) pred.frame<-data.frame(perm=6.30:1300.00) pp<-predict(reg4, newdata=pred.frame,int="p") plot(area~perm, data=rock, ylim=range(area,pp), ylab="pore space area (pixels)", xlab="permeability (milli-Darcies)") legend("topright", inset=0.02, legend=c("R-squared= 0.16")) matlines(pred.frame$perm, pp, lty=c(1,2,2), col=c("black")) #5) Is there a relationship between fertility rates (Fertility) and the percent of education beyond # primary school for draftees (Education) in Swiss provinces? reg5 <-lm(Fertility~Education, data=swiss) summary(reg5) #sign. summary(swiss$Education) pred.frame<-data.frame(Education=1:53) pp<-predict(reg5, newdata=pred.frame,int="p") plot(Fertility~Education, data=swiss, ylim=range(Fertility,pp), ylab="common standardized fertility", xlab="% education beyond primary school for draftees") legend("topright", inset=0.02, legend=c("R-squared= 0.44")) matlines(pred.frame$Education, pp, lty=c(1,2,2), col=c("black")) #6) Is there a relationship between infant mortality rates (Infant.Mortality) and the percent of males # in agriculture (Agriculture) in Swiss provinces? reg6 <-lm(Infant.Mortality~Agriculture ,data=swiss) summary(reg6) #not sign. summary(swiss$Agriculture) plot(Infant.Mortality~Agriculture, data=swiss, ylab="live births who live less than year", xlab="% of males involved in agriculture as occupation") abline(20.337955,-0.007805, col="red", lty=3)#from summary of regression #************************************************************************************************************ #Multiple Linear Regression #1) Can the area of pore space (area) in petroleum rock samples be predicted by perimeter (peri) and permeability (perm)? mreg1a<-lm(area~peri,data=rock) mreg1b<-lm(area~perm,data=rock) mreg1c<-lm(area~peri+perm, data=rock) #full model summary(mreg1a) #sign. summary(mreg1b) #sign. summary(mreg1c) #both sign. plot(peri~perm,data=rock) #somewhat correlated AIC(mreg1a,mreg1b,mreg1c) #full model is best fit #2) Can frontal lobe size (FL) in crabs be predicted by rear width (RW) and body depth (BD)? data(crabs) mreg2a<-lm(FL~RW, data=crabs) mreg2b<-lm(FL~BD, data=crabs) mreg2c<-lm(FL~RW+BD, data=crabs) summary(mreg2a) #sign. summary(mreg2b) #sign. summary(mreg2c) #both sign. plot(RW~BD, data=crabs) #highly correlated AIC(mreg2a,mreg2b,mreg2c) #full model is best fit but RW and BD highly correlated, so just use BD as best single model #3) Can frontal lobe size (FL) in crabs be predicted by carapace length (CL), width (CW), and their interaction? mreg3a<-lm(FL~CL, data=crabs) mreg3b<-lm(FL~CW, data=crabs) mreg3c<-lm(FL~CL*CW, data=crabs) summary(mreg3a) summary(mreg3b) summary(mreg3c) plot(CL~CW, data=crabs) #highly correlated AIC(mreg3a,mreg3b,mreg3c) #full model is best fit but CL and CW highly correlated, so just use CL as best single model #4) Can infant mortality rates (Infant.Mortality) in Swiss provinces be predicted by fertility rates (Fertility), # % of males in agriculture (Agriculture), % Catholic (Catholic), and % education beyond primary school (Education)? mreg4a<-lm(Infant.Mortality~Fertility+Agriculture+Catholic+Education, data=swiss) summary(mreg4a) #only Fertility sign. pairs(~Fertility+Agriculture+Catholic+Education, data=swiss) #no variables appear too correlated mreg4b <-lm(Infant.Mortality~Fertility, data=swiss) #single model summary(mreg4b) AIC(mreg4a,mreg4b) #5) Can calories per portion (calories) in US cereals be predicted by protein (protein), fat (fat), # carbohydrates (carbo), or their interactions (sqrt-transformed data)? data(UScereal) UScereal$sqrt_calories<-sqrt(UScereal$calories) UScereal$sqrt_protein<-sqrt(UScereal$protein) UScereal$sqrt_fat<-sqrt(UScereal$fat) UScereal$sqrt_carbo<-sqrt(UScereal$carbo) mreg5a<-lm(sqrt_calories~sqrt_protein*sqrt_fat*sqrt_carbo, data=UScereal) sreg1 <-step(mreg5a, direction="both") #retained protein, fat, carb, and interaction of protein:carb pairs(~sqrt_protein+sqrt_fat+sqrt_carbo,data=UScereal) #variables somewhat correlated mreg5b<-lm(sqrt_calories~sqrt_protein+sqrt_fat+sqrt_carbo+sqrt_protein:sqrt_carbo, data=UScereal) summary(mreg5b) #protein, fat, and interaction between protein and carbs was sign. #6) Can miles per gallon (mpg) be predicted by gross horsepower (hp), weight (wt), quarter mile time (qsec), # or their interactions? mreg6a<- lm(mpg~hp*wt*qsec,data=mtcars) sreg2 <-step(mreg6a, direction="both") #hp, wt, qsec, and interaction of hp:wt mreg6b <-lm(mpg ~ hp + wt + qsec + hp:wt, data=mtcars) summary(mreg6b) #only weight is sign. pairs(~hp+wt+qsec,data=mtcars) #hp and wt are highly correlated plot(hp~wt,data=mtcars) mreg6c <-lm(mpg~wt+qsec,data=mtcars) summary(mreg6c) AIC(mreg6a, mreg6b, mreg6c) #************************************************************************************************************ #Logistic Regression #1) Can midwest counties' metropolitan status (inmetro=Yes vs. No) be predicted by the log population # density(log_popdensity)? midwest$log_popdensity <-log(midwest$popdensity) lreg1 <- glm(inmetro ~ log_popdensity, data=midwest, family="binomial") summary(lreg1) #sign. plot(midwest$log_popdensity, midwest$inmetro, ylab="Metro status (1=yes, 0=no)", xlab="log population density") curve(predict(lreg1, data.frame(log_popdensity=x), type="resp"), add=T) #2) Can midwest counties' metropolitan status (inmetro=Yes vs. No) be predicted by the percentage of # college educated adults (percollege)? lreg2 <- glm(inmetro ~ percollege, data=midwest, family="binomial") summary(lreg2) #sign. plot(midwest$percollege, midwest$inmetro, ylab="Metro status (1=yes, 0=no)", xlab="percentage of college educated adults") curve(predict(lreg2, data.frame(percollege=x), type="resp"), add=T) #3) Can a car's origin (Origin2: USA vs. non-USA) be predicted by city miles per gallon (MPG.city)? Cars93_lg <- mutate(Cars93, Origin2=ifelse(Origin=="USA",1,0)) lreg3 <- glm(Origin2 ~ MPG.city, data=Cars93_lg, family="binomial") summary(lreg3) #sign. plot(Cars93_lg$MPG.city,Cars93_lg$Origin2, ylab="Origin (1=USA, 0=non-USA", xlab="city MPG") curve(predict(lreg3, data.frame(MPG.city=x), type="resp"), add=T) points(Cars93_lg$MPG.city, fitted(lreg3), pch=20) #4) Can a car's origin (Origin2: USA vs. non-USA) be predicted by midrange price (Price)? lreg4 <- glm(Origin2 ~ Price, data=Cars93_lg, family="binomial") summary(lreg4) #not sign. plot(Cars93_lg$Price,Cars93_lg$Origin2, ylab="Origin (1=USA, 0=non-USA", xlab="midrange price ($/thousands") curve(predict(lreg4, data.frame(Price=x), type="resp"), add=T, col="red") points(Cars93_lg$Price, fitted(lreg4), pch=20, col="red") #5) Can the presence of a bacteria species (y: yes vs. no) be predicted by drug status (ap: active/placebo) and # compliance (hilo: low/high)? lreg5 <- glm(y~ap+hilo, data=bacteria, family="binomial") summary(lreg5) exp(cbind(OR=coef(lreg7),confint(lreg7))) #6) Can the presence of a tumor (status: tumor vs. none) be predicted by sex (sex: male/female) and # treatment (rex: drug/control)? lreg6 <-glm(status~rx+sex, data=rats, family="binomial") summary(lreg6) exp(cbind(OR=coef(lreg6),confint(lreg6)))