#Model Gauntlet in R Code:

#Packages: you'll need to install.packages("package.name") if you do not already have a package
library(MASS)
library(gmodels)
library(PairedData)
library(dplyr)
library(ggplot2)
library(lme4)
library(nlme)
library(agricolae)
library(tidyverse)
library(multcomp)

data() #shows available datasets
#************************************************************************************************************

#One-sample t-tests

#1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles less than 25?
t.test(x=mtcars$mpg, mu=25, alternative="less") #sig.

#2) Is the average quarter mile time in seconds (qsec) of 1975 Motor Trend vehicles greater than 17.5?
t.test(x=mtcars$qsec, mu=17.5, alternative="greater") #not sig

#3) Is the average weight in 1000 lbs (wt) of 1975 Motor Trend vehicles different than 3? 
t.test(x=mtcars$wt, mu=3, alternative="two.sided") #not sig.

#4) Is the log baseline number of seizures (lbase) for the placebo group of eptilepticss different than 0?
t.test(x=epil$lbase[epil$trt=='placebo'], mu=0, alternative="two.sided") #not sig.

#5) Is the average yield of barley for 1931 greater than 100?
t.test(x=immer$Y1, mu=100, alternative = "greater") # sig.

#6) Is the average yield of barely for 1932 different than 1931's (109)?
t.test(x=immer$Y2, mu=109, alternative = "two.sided") # sig.
#************************************************************************************************************


#Two-sample t-tests

#Function to create two-sample barcharts 
  #numset & catset must be in dataset$variable format; cat1, cat2 must be format they come in in the dataset; 
  #numlab, & catlab must be strings; barcols must be list of cols
barchart.2sample <- function(numset, catset, cat1, cat2, numlab, catlab, barcols) {
  c1.cat <-numset[catset==cat1]
  c2.cat <-numset[catset==cat2]
  c1.mean <-mean(c1.cat)
  c2.mean <-mean(c2.cat)
  ll<-c(ci(c1.cat)[2], ci(c2.cat)[2])
  ul<-c(ci(c1.cat)[3], ci(c2.cat)[3])
  x<-c(1,2)
  y<-c(c1.mean,c2.mean)
  x<-barplot(y)
  ymax<-ul[2]
  if (ul[1]>ul[2]){
    ymax<-ul[1]
  }
  ymin<-ll[2]
  if (ll[1]<ll[2]){
    ymin<-ll[1]
  }
  if (ymin>0){
    ymin<-0
  }
  barplot(y, xlab=catlab, ylab=numlab, ylim=c(ymin*1.10,ymax*1.10), col=barcols, axes=F)
  arrows(x,ul,x,ll, length=0.05, angle=90, code=3)
  axis(1, at=x, labels=c(cat1,cat2))
  axis(2) 
  box()
  clip(0.1,2.49,-100, 100)
  abline(h=0)
}

#1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles different between v-shaped (0) and 
#   straight (1) engines (vs)?
t.test(mtcars$mpg~mtcars$vs, alternative="two.sided") #sig.
barchart.2sample(mtcars$mpg, mtcars$vs, 0, 1, "MPG", "engine type", c("red","blue"))

#2) Is the average quarter mile time in seconds (qsec) of 1975 Motor Trend vehicles greater in automatic (0) compared to
#   manual (1) transmissions (am)?
t.test(mtcars$qsec~mtcars$am, alternative="greater") #not sig
barchart.2sample(mtcars$qsec, mtcars$am, 0, 1, "Quarter mile time (seconds)", "transmission type", c("purple","orange"))

#3) Is the average tooth length of guinea pigs (len) different between orange juice (OJ) and ascorbic acid (VC) supplement of
#   vitamin C (supp)?
t.test(ToothGrowth$len~ToothGrowth$supp, alternative="two.sided") #not sig
barchart.2sample(ToothGrowth$len, ToothGrowth$supp, "OJ", "VC", "Tooth length", "supplement type", c("orange","green"))

#4) Is the average weight (weight) lower in linseed fed chicks compared to sunflower fed ones (feed)?
chickwts2 <- chickwts[which(chickwts$feed=='sunflower'|chickwts$feed=='linseed'),]
t.test(chickwts2$weight~chickwts2$feed, alternative='less') #sig.
barchart.2sample(chickwts2$weight,chickwts2$feed,"sunflower","linseed", "weight", "feed type", c('yellow','brown'))

#5) Is the change in hours of sleep (extra) different between two groups of students (group)?
t.test(sleep$extra~sleep$group, alterative='two.sided') #not sig.
barchart.2sample(sleep$extra,sleep$group, 1, 2, "change in sleep (hours)", "group", c("blue","green"))

#6) Is the number of warp breaks per loom (breaks) different between two types of wool (wool)?
t.test(warpbreaks$breaks~warpbreaks$wool, alternative='two.sided') #not sig.
barchart.2sample(warpbreaks$breaks,warpbreaks$wool, "A", "B", "warp breaks per loom", "wool type", c("grey","white"))
#************************************************************************************************************
 

#Paired t-tests

#1) Is the change in hours of sleep (extra) different between students before (1) and after (2) a drug (group)?
      #NOTE: this is the same dataset used as for the two-sample t-test, but correctly should be paired
t.test(sleep$extra~sleep$group, alterative='two.sided', paired=T) #sig.
interaction.plot(sleep$group, sleep$ID, sleep$extra, ylab="change in sleep (hours)", xlab="Drug (1=before, 2=after)",
                 col=c(1:10), legend=F)
  
#2) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre)
#   for the Control group?
data(anorexia)
anorexia.control <- data.frame("ID"=c(1:26, 1:26), "Period"=c(rep("Pre",times=26), rep("Post",times=26)), 
                               "Period_code"=c(rep(1,times=26),rep(2,times=26)),
                              "Weight"=c(anorexia$Prewt[anorexia$Treat=="Cont"],anorexia$Postwt[anorexia$Treat=="Cont"]))
t.test(anorexia.control$Weight~anorexia.control$Period, alternative="greater", paired=T) #not sign.
interaction.plot(anorexia.control$Period_code, anorexia.control$ID, anorexia.control$Weight, 
                 ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:26), legend=F)

#3) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre)
#   for the Cognitive Behavioral Treatment group?
anorexia.CBT <- data.frame("ID"=c(1:29, 1:29), "Period"=c(rep("Pre",times=29), rep("Post",times=29)), 
                               "Period_code"=c(rep(1,times=29),rep(2,times=29)),
                               "Weight"=c(anorexia$Prewt[anorexia$Treat=="CBT"],anorexia$Postwt[anorexia$Treat=="CBT"]))
t.test(anorexia.CBT$Weight~anorexia.CBT$Period, alternative="greater", paired=T) #sign.
interaction.plot(anorexia.CBT$Period_code, anorexia.CBT$ID, anorexia.CBT$Weight, 
                 ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:29), legend=F)

#4) Is the average weight (Weight) of anorexia females greater in the post study period (Post) compared to the pre study (Pre)
#   for the Family Treatment group?
anorexia.FT <- data.frame("ID"=c(1:17, 1:17), "Period"=c(rep("Pre",times=17), rep("Post",times=17)), 
                           "Period_code"=c(rep(1,times=17),rep(2,times=17)),
                           "Weight"=c(anorexia$Prewt[anorexia$Treat=="FT"],anorexia$Postwt[anorexia$Treat=="FT"]))
t.test(anorexia.FT$Weight~anorexia.FT$Period, alternative="greater", paired=T) #sign.

interaction.plot(anorexia.FT$Period_code, anorexia.FT$ID, anorexia.FT$Weight, 
                 ylab="Weight", xlab="Study period: 1=Pre, 2=Post", col=c(1:17), legend=F)

#5) Is the average blink rate (Rate) different between participants steering a pencil along a straight or oscillating 
#   track (Track)? 
data(Blink)
Blink.mod <- data.frame("ID"=c(1:12), "Track"=c(rep("Straight",times=12),rep("Oscillating",times=12)),
                       "Rate"=c(Blink$Straight,Blink$Oscillating))
t.test(Blink.mod$Rate~Blink.mod$Track, alternative="two.sided",paired=T) #sign.
interaction.plot(Blink.mod$Track, Blink.mod$ID, Blink.mod$Rate, 
                 ylab="Blink rate", xlab="Track type", col=c(1:12), legend=F)

#6) Is the average blood lead level (Levels) lower in control children compared to exposed children (Status)?
data(BloodLead)
BloodLead.mod <-data.frame("ID"=c(1:33), "Status"=c(rep("Control",times=33),rep("Exposed",times=33)),
                           "Levels"=c(BloodLead$Control, BloodLead$Exposed))
t.test(BloodLead.mod$Levels~BloodLead.mod$Status, alternative="less", paired=T) #sign.
interaction.plot(BloodLead.mod$Status, BloodLead.mod$ID, BloodLead.mod$Levels, 
                 ylab="Blood Lead Levels (mg/dl)", xlab="Exposure Status", col=c(1:33), legend=F)
#************************************************************************************************************


#One-way ANOVA

#Function for creating 1-way ANOVA barcharts
  #df is the dataset, cat_var is the categorical variable, num_var is the numerical variable
barchart.anova <- function(df, cat_var, num_var) {
  cat_var <- enquo(cat_var) 
  num_var <- enquo(num_var)
  df_sum<-df %>% 
    group_by(!!cat_var) %>%       
    summarise(mean=mean(!!num_var),
              sd = sd(!!num_var),
              error = qt(0.975,df=n()-1)*sd/sqrt(n()),
              ul = mean + error,
              ll = mean - error)
  #print(df_sum)
  df_plot <<-ggplot(df_sum, aes(!!cat_var, mean, fill=!!cat_var))+
    geom_bar(stat="identity", color="black")+
    geom_hline(yintercept = 0) +
    geom_errorbar(aes(ymin=ll, ymax=ul), width=0.1) +
    theme_classic() + theme(legend.position="none") 
}

#1) Is the average sepal length different across three species of iris (Species)?
aov1<-aov(iris$Sepal.Length~iris$Species)
summary(aov1) #sign.
aov1.mcp <-TukeyHSD(aov1)
plot(aov1.mcp, xlim=c(0,2)) #all comparisons sigm.

barchart.anova(iris, Species, Sepal.Length)
df_plot + labs(y="Sepal length (cm", x="Species") + scale_y_continuous(limits=c(0,7),breaks=c(0:7))+
  geom_text(aes(label=c('*','*','*')), vjust=-0.8, size=6)

#2) Is the average sepal width different across three species of iris (Species)?
aov2<-aov(iris$Sepal.Width~iris$Species)
summary(aov2) #sign.
aov2.mcp <-TukeyHSD(aov2)
plot(aov2.mcp) #all comparisons sign.

barchart.anova(iris, Species, Sepal.Width)
df_plot + labs(y="Sepal width (cm", x="Species") + scale_y_continuous(limits=c(0,4),breaks=c(0:4))+
  geom_text(aes(label=c('*','*','*')), vjust=-0.8, size=6)

#3) Is the average plant growth (weight) different across treatments (group)?
boxplot(PlantGrowth$weight~PlantGrowth$group)
aov3<-aov(PlantGrowth$weight~PlantGrowth$group)
summary(aov3) #sign.
aov3.mcp <-TukeyHSD(aov3)
plot(aov3.mcp) #trt1 vs. trt2 significant.

barchart.anova(PlantGrowth, group, weight)
df_plot + scale_fill_manual(values=c("white","blue","purple")) +
  geom_text(aes(label=c('*'), x=2.5, y=6.7, size=8)) +
  geom_segment(aes(x=2,xend=3,y=6.5,yend=6.5)) +
  labs(y="mean plant weight", x="treatment") + scale_y_continuous(limits=c(0,7),breaks=c(0:7))

#4) Is the average highway MPG (MPG.highway) different across three car manufacturers (Manufacturer)?
Cars93_mod <-na.exclude(Cars93[,c(1,3,8,10,11,18)])
Cars93_mod[20,]$Manufacturer <-"Chrysler"
Cars93_mod2 <-Cars93_mod[which(Cars93_mod$Manufacturer=='Chevrolet'|Cars93_mod$Manufacturer=='Chrysler'|Cars93_mod$Manufacturer=='Dodge'),]
aov4<-aov(Cars93_mod2$MPG.highway~Cars93_mod2$Manufacturer)
summary(aov4) #not sign.
aov4.mcp <-TukeyHSD(aov4)
plot(aov4.mcp)

barchart.anova(Cars93_mod2, Manufacturer, MPG.highway)
df_plot + scale_fill_manual(values=c("grey","grey","grey")) +
  labs(y="mean highway MPG", x="Manufacturer") + scale_y_continuous(limits=c(0,35),breaks=c(0,5,10,15,20,25,30,35))

#5) Is the average highway MPG (MPG.highway) different across four types of cars (Type)?
Cars93_mod3 <-Cars93_mod[which(Cars93_mod$Type=='Compact'|Cars93_mod$Type=='Midsize'|Cars93_mod$Type=='Small'|Cars93_mod$Type=='Sporty'),]
aov5<-aov(Cars93_mod3$MPG.highway~Cars93_mod3$Type)
summary(aov5) #sign.
aov5.mcp <-TukeyHSD(aov5)
par(mar = c(5.1, 6.1, 4.1, 2.1)) 
plot(aov5.mcp, cex.axis=0.7, las=1) #Small-Compact, Small-Midsize, and Sporty-Small different
par(mar = c(5.1, 4.1, 4.1, 2.1)) 

barchart.anova(Cars93_mod3, Type, MPG.highway)
df_plot + scale_fill_manual(values=c("red","blue","white","orange")) +
  geom_text(aes(label=c('A','A','B','A')), vjust=-2, size=6) +
  labs(y="mean highway MPG", x="Type") + scale_y_continuous(limits=c(0,40),breaks=c(0,5,10,15,20,25,30,35,40))


#6) Is the average number of insects (count) after spraying different across six types of spray (spray)?
aov6 <-aov(InsectSprays$count~InsectSprays$spray)
summary(aov6) #sign.
aov6.mcp <-TukeyHSD(aov6)
plot(aov6.mcp, cex.axis=0.6, las=1) #C-A, D-A, E-A, C-B, D-B, E-B, F-C, F-D, and F-E sign.

barchart.anova(InsectSprays, spray, count)
df_plot + geom_text(aes(label=c('&','&','#','#','#','&')), vjust=-4, size=6) +
  labs(y="mean insect count", x="spray type") + scale_y_continuous(limits=c(0,25),breaks=c(0,5,10,15,20,25))

#************************************************************************************************************


#Two-way ANOVA

#Function for creating 2-way ANOVA barcharts
  #df is the dataset, cat1 and cat2 are the categorical variables, num_var is the numerical variable
  #let_list is a list of letters/symbols to indicate different groups; needs to be in c("A","B",...) format
  #dodg_n is a number to set how much the barcharts are dodged by (try 0.9 as your default and adjust from there)
  #v_just is the vertical adjustment of the text from the let_list (try -4 as your default and adjust from there)
barchart.2anova <- function(df, cat1, cat2, num_var,let_list,dodg_n,v_just) {
  cat1 <- enquo(cat1) 
  cat2 <- enquo(cat2)
  num_var <- enquo(num_var)
  df_sum<-df %>% 
    group_by(!!cat1,!!cat2) %>%       
    summarise(mean=mean(!!num_var),
              sd = sd(!!num_var),
              error = qt(0.975,df=n()-1)*sd/sqrt(n()),
              ul = mean + error,
              ll = mean - error)
  #print(df_sum)
  df_plot <<-ggplot(df_sum, aes(x=as.factor(!!cat1), y=mean))+
    geom_bar(aes(fill=as.factor(!!cat2)),stat="identity", color="black", position=position_dodge()) +
    geom_hline(yintercept = 0) + 
    geom_text(aes(label=let_list,group=as.factor(!!cat2)), vjust=v_just, size=4, position=position_dodge(dodg_n))+
    geom_errorbar(aes(ymin=ll, ymax=ul, group=as.factor(!!cat2)), width=0.1, position=position_dodge(dodg_n)) +
    theme_classic() 
}


#1) Is the average miles per gallon (mpg) of 1975 Motor Trend vehicles different across the engine (vs) and
#   transmission (am) type?
tw.aov1<-aov(mtcars$mpg~mtcars$vs*mtcars$am)
summary(tw.aov1) #vs and am sign.; interaction not

barchart.2sample(mtcars$mpg, mtcars$vs, 0, 1, "MPG", "engine type", c("red","blue"))
barchart.2sample(mtcars$mpg, mtcars$am, 0, 1, "MPG", "transmission type", c("purple","orange"))

#2) Is the average gross horsepower (hp)) of 1975 Motor Trend vehicles different across the engine (vs) and
#   transmission (am) type?
tw.aov2<-aov(mtcars$hp~mtcars$vs*mtcars$am)
summary(tw.aov2) #only vs sign.

barchart.2sample(mtcars$hp, mtcars$vs, 0, 1, "horsepower", "engine type", c("lightcoral","lightblue"))

#3) Is the average tooth length of guinea pigs (len) different across supplement (supp) and dose (dose)?
tw.aov3<-aov(ToothGrowth$len~ToothGrowth$supp*as.factor(ToothGrowth$dose))
summary(tw.aov3) #all sign.
tw.aov3.mcp <-TukeyHSD(tw.aov3)
par(mar = c(5.1, 6.1, 4.1, 2.1)) 
plot(tw.aov3.mcp,cex.axis=0.7, las=1) #not sig. VC:1-OJ;0.5 ; OJ:2-OJ:1 ; VC:2-OJ:1 ; VC:2-OJ:2
par(mar = c(5.1, 4.1, 4.1, 2.1)) 

barchart.2anova(ToothGrowth, supp, dose, len, c("A","B","B","C","A","B"), 0.9,-4)
df_plot + labs(y="mean tooth length", x="supplement type", fill="doses") + 
  scale_fill_manual(values=c("slateblue1","slateblue", "slateblue4")) +
  scale_y_continuous(limits=c(0,40),breaks=c(0,5,10,15,20,25,30,35,40))

#4) Is the number of warp breaks per loom (breaks) different across wool (wool) and tension (tension)?
tw.aov4<-aov(warpbreaks$breaks~warpbreaks$wool*warpbreaks$tension)
summary(tw.aov4) #tension and interaction significant
tw.aov4.mcp <-TukeyHSD(tw.aov4)
par(mar = c(5.1, 6.1, 4.1, 2.1)) 
plot(tw.aov4.mcp,cex.axis=0.7, las=1) #sig. B:L-A:L ; A:M-A:L ; B:M-A:L, A:H-A:L; B:H-A:L
par(mar = c(5.1, 4.1, 4.1, 2.1)) 

barchart.2anova(warpbreaks, tension, wool, breaks, c("&","#","#","#","#","#"), 0.9,-7)
df_plot + labs(y="mean break number", x="tension", fill="wool type") +
  scale_fill_manual(values=c("grey","brown")) +
  scale_y_continuous(limits=c(0,70),breaks=c(0,10,20,30,40,50,60,70))

#5) Is the average yield for peas different across nitrogen (N) and phosphorus (P) application?
tw.aov5<-aov(npk$yield~npk$N*npk$P)
summary(tw.aov5) #only nitrogen significant
barchart.2sample(npk$yield,npk$N, 0, 1, "mean yield", "nitrogen application", c("white","blue"))

#6) Is the average price of diamonds different across cut (cut) and color (color)?
diamonds2<-diamonds[,c(2,3,7)]
  #Only included top 3 cut and color categories
diamonds2 <-diamonds2[which(diamonds2$color=="D"|diamonds2$color=="E"|diamonds2$color=="F"),] 
diamonds2 <-diamonds2[which(diamonds2$cut=="Very Good"|diamonds2$cut=="Premium"|diamonds2$cut=="Ideal"),]
diamonds2$color <-factor(diamonds2$color)
diamonds2$cut <-factor(diamonds2$cut)

tw.aov6<-aov(diamonds2$price~diamonds2$cut*diamonds2$color)
summary(tw.aov6) #all significant
tw.aov6.mcp <-TukeyHSD(tw.aov6)
par(mar = c(5.1, 8.5, 4.1, 2.1)) 
plot(tw.aov6.mcp,cex.axis=0.5, las=1) #us HSD.test to get groups
par(mar = c(5.1, 4.1, 4.1, 2.1)) 
interact2 <-with(diamonds2,interaction(cut,color))
out.interact2 <-aov(price~interact2, data=diamonds2)
HSD.test(out.interact2, "interact2", group=T, console=T) #set B->a, C->b, D->c, A->d, E->e for clarity

barchart.2anova(diamonds2, cut, color, price, c("abc","c","a","ab","ab","d","e","e","bc"), 0.9,-2)
df_plot + labs(y="Mean Price", x="Cut ($)", fill="Color") +
  scale_fill_manual(values=c("grey100","gray80", 'gray60')) +
  scale_y_continuous(limits=c(0,5000),breaks=c(0,1000,2000,3000,4000,5000))
#************************************************************************************************************


#Blocked/Nested ANOVA

#1) Is the average yield (Y) of oats per sub-plot different across 3 varieties (V), while controlling for block (B)?
data(oats)
bl.aov1 <-lmer(Y~V + Error(B),data=oats)
summary(bl.aov1) #not sign.

barchart.anova(oats, V, Y)
df_plot + labs(y="mean yield", x="oat variety") + scale_y_continuous(limits=c(0,150),breaks=c(0,25,50,75,100,125,150)) +
  scale_fill_manual(values=c("grey","grey", "grey")) 

#2 Is the average yield for peas different across nitrogen (N) and potassium (K) application, while controlling for block?
bl.aov2 <-aov(yield~N*K + Error(block), data=npk)
summary(bl.aov2) #N and K significant, but interaction not

barchart.2sample(npk$yield,npk$N, 0, 1, "mean yield", "nitrogen application", c("white","blue"))
barchart.2sample(npk$yield,npk$K, 0, 1, "mean yield", "potassium application", c("white","red"))

#3) Is the decrease in honeybees (decrease) different across spray type (treatments), while controlling for row (rowpos) 
#   and column (colpos)?
bl.aov3 <-aov(decrease~treatment + Error(colpos/rowpos), data=OrchardSprays)
summary(bl.aov3) #sign.

bl.aov3b <-lme(decrease~treatment, random=~1| colpos/rowpos, data=OrchardSprays)
bl.ph3 <-glht(bl.aov3b, linfct=mcp(treatment="Tukey"))
plot(bl.ph3)

barchart.anova(OrchardSprays, treatment, decrease)
df_plot + geom_text(aes(label=c('&','&','&#','&#','#','#','#','#')), vjust=-4.5, size=6) +
  labs(y="mean honeybee decrease", x="spray type") + scale_y_continuous(limits=c(0,125),breaks=c(0,25,50,75,100,125))

#4) Is the log-optical density (logDens) different across samples (samples) when nested in block (Block)?
ne.aov1 <-aov(logDens~sample + Error(Block/sample), data=Assay)
summary(ne.aov1) #looks significant
ne.aov1b <-lme(logDens~sample, random=~1| Block/sample, data=Assay)
summary(ne.aov1b) #not significant
ne.ph1 <-glht(ne.aov1b, linfct=mcp(sample="Tukey"))
plot(ne.ph1)

#5) Is the number of words recollected (Recall) different across word type (Valance) when nested in subject?
    #based from https://personality-project.org/r/r.guide/r.anova.html 
datafilename <-"http://personality-project.org/r/datasets/R.appendix3.data"
data.ex1 <-read.table(datafilename,header=T)

ne.aov2 <-aov(Recall~Valence + Error(Subject/Valence), data=data.ex1)
summary(ne.aov2) #sign. 
ne.aov2b <-lme(Recall~Valence, random=~1| Subject/Valence, data=data.ex1)
ne.ph2 <-glht(ne.aov2b, linfct=mcp(Valence="Tukey"))
plot(ne.ph2, cex.axis=0.7)

barchart.anova(data.ex1, Valence, Recall)
df_plot + geom_text(aes(label=c('A','B','C')), vjust=-3, size=6) +
  labs(y="mean recalls", x="word type (Valence)") + scale_y_continuous(limits=c(0,50),breaks=c(0,10,20,30,40,50))+
  scale_fill_manual(values=c("red","yellow","blue"))

#6) Is the number of words recollected (Recall) different across word type (Valance) and (Task) when nested in subject?
    #based from https://personality-project.org/r/r.guide/r.anova.html
datafilename2<-"http://personality-project.org/r/datasets/R.appendix4.data"
data.ex2=read.table(datafilename2,header=T)   #read the data into a table

ne.aov3 <-aov(Recall~(Task*Valence)+Error(Subject/(Task*Valence)),data.ex2)
summary(ne.aov3) #not sign.

barchart.2anova(data.ex2,Valence,Task,Recall,c("","","","","",""),0.9,0)
df_plot + labs(y="mean recalls", x="word type (Valence)", fill="Task") +
  scale_fill_manual(values=c("grey80","gray50")) +
  scale_y_continuous(limits=c(0,22),breaks=c(0,5,10,15,20))
#************************************************************************************************************


#Simple Linear Regression

#1) Is there a relationship between car speed (speed) and stopping distance (dist)?
reg1 <-lm(speed~dist, data=cars)
summary(reg1) #sign.
summary(cars$dist)

pred.frame<-data.frame(dist=2:120)
pp<-predict(reg1, newdata=pred.frame,int="p")
plot(speed~dist, data=cars, ylim=range(speed,pp), 
     ylab="speed (MPH)", xlab="stopping distance (ft)")
legend("bottomright", inset=0.02, legend=c("R-squared= 0.65"))
matlines(pred.frame$dist, pp, lty=c(1,2,2), col=c("black"))

#2) Is there a relationship between log animal brain weight (brain) and log body weight (body)?
data(Animals)
Animals$log_brain <-log(Animals$brain)
Animals$log_body <-log(Animals$body)
reg2 <- lm(log_brain~log_body, data=Animals)
summary(reg2) #sign.
summary(Animals$log_body)

pred.frame<-data.frame(log_body=-3.722:11.374)
pp<-predict(reg2, newdata=pred.frame,int="p")
plot(log_brain~log_body, data=Animals, ylim=range(log_brain,pp),
     ylab="log brain weight", xlab="log body weight")
legend("bottomright", inset=0.02, legend=c("R-squared= 0.61"))
matlines(pred.frame$log_body, pp, lty=c(1,2,2), col=c("black"))

#3) Is there a relationship between the area of pore space (area) and perimeter (peri) in petroleum rock samples?
data(rock)
reg3 <-lm(area~peri, data=rock)
summary(reg3) #sign.
summary(rock$peri)

pred.frame<-data.frame(peri=308.6:4864.2)
pp<-predict(reg3, newdata=pred.frame,int="p")
plot(area~peri, data=rock, ylim=range(area,pp),
     ylab="pore space area (pixels)", xlab="perimeter (pixels)")
legend("bottomright", inset=0.02, legend=c("R-squared= 0.68"))
matlines(pred.frame$peri, pp, lty=c(1,2,2), col=c("black"))

#4) Is there a relationship between the area of pore space (area) and permeability (perm) in petroleum rock samples?
reg4 <-lm(area~perm, data=rock)
summary(reg4) #sign.
summary(rock$perm)

pred.frame<-data.frame(perm=6.30:1300.00)
pp<-predict(reg4, newdata=pred.frame,int="p")
plot(area~perm, data=rock, ylim=range(area,pp),
     ylab="pore space area (pixels)", xlab="permeability (milli-Darcies)")
legend("topright", inset=0.02, legend=c("R-squared= 0.16"))
matlines(pred.frame$perm, pp, lty=c(1,2,2), col=c("black"))

#5) Is there a relationship between fertility rates (Fertility) and the percent of education beyond
#   primary school for draftees (Education) in Swiss provinces?
reg5 <-lm(Fertility~Education, data=swiss)
summary(reg5) #sign.
summary(swiss$Education)

pred.frame<-data.frame(Education=1:53)
pp<-predict(reg5, newdata=pred.frame,int="p")
plot(Fertility~Education, data=swiss, ylim=range(Fertility,pp),
     ylab="common standardized fertility", xlab="% education beyond primary school for draftees")
legend("topright", inset=0.02, legend=c("R-squared= 0.44"))
matlines(pred.frame$Education, pp, lty=c(1,2,2), col=c("black"))
       
#6) Is there a relationship between infant mortality rates (Infant.Mortality) and the percent of males
#   in agriculture (Agriculture) in Swiss provinces?
reg6 <-lm(Infant.Mortality~Agriculture ,data=swiss) 
summary(reg6) #not sign.
summary(swiss$Agriculture)

plot(Infant.Mortality~Agriculture, data=swiss,
     ylab="live births who live less than year", xlab="% of males involved in agriculture as occupation")
abline(20.337955,-0.007805, col="red", lty=3)#from summary of regression
#************************************************************************************************************


#Multiple Linear Regression

#1) Can the area of pore space (area) in petroleum rock samples be predicted by perimeter (peri) and permeability (perm)?
mreg1a<-lm(area~peri,data=rock)
mreg1b<-lm(area~perm,data=rock)
mreg1c<-lm(area~peri+perm, data=rock) #full model
summary(mreg1a) #sign.
summary(mreg1b) #sign.
summary(mreg1c) #both sign.

plot(peri~perm,data=rock) #somewhat correlated
AIC(mreg1a,mreg1b,mreg1c) #full model is best fit

#2) Can frontal lobe size (FL) in crabs be predicted by rear width (RW) and body depth (BD)?
data(crabs)
mreg2a<-lm(FL~RW, data=crabs)
mreg2b<-lm(FL~BD, data=crabs)
mreg2c<-lm(FL~RW+BD, data=crabs)
summary(mreg2a) #sign.
summary(mreg2b) #sign.
summary(mreg2c) #both sign.

plot(RW~BD, data=crabs) #highly correlated
AIC(mreg2a,mreg2b,mreg2c) #full model is best fit but RW and BD highly correlated, so just use BD as best single model

#3) Can frontal lobe size (FL) in crabs be predicted by carapace length (CL), width (CW), and their interaction?
mreg3a<-lm(FL~CL, data=crabs)
mreg3b<-lm(FL~CW, data=crabs)
mreg3c<-lm(FL~CL*CW, data=crabs)
summary(mreg3a)
summary(mreg3b)
summary(mreg3c)

plot(CL~CW, data=crabs) #highly correlated
AIC(mreg3a,mreg3b,mreg3c) #full model is best fit but CL and CW highly correlated, so just use CL as best single model

#4) Can infant mortality rates (Infant.Mortality) in Swiss provinces be predicted by fertility rates (Fertility),
#   % of males in agriculture (Agriculture), % Catholic (Catholic), and % education beyond primary school (Education)?
mreg4a<-lm(Infant.Mortality~Fertility+Agriculture+Catholic+Education, data=swiss)
summary(mreg4a) #only Fertility sign.
pairs(~Fertility+Agriculture+Catholic+Education, data=swiss) #no variables appear too correlated
mreg4b <-lm(Infant.Mortality~Fertility, data=swiss) #single model
summary(mreg4b)
AIC(mreg4a,mreg4b)

#5) Can calories per portion (calories) in US cereals be predicted by protein (protein), fat (fat), 
#   carbohydrates (carbo), or their interactions (sqrt-transformed data)?
data(UScereal)
UScereal$sqrt_calories<-sqrt(UScereal$calories)
UScereal$sqrt_protein<-sqrt(UScereal$protein)
UScereal$sqrt_fat<-sqrt(UScereal$fat)
UScereal$sqrt_carbo<-sqrt(UScereal$carbo)
mreg5a<-lm(sqrt_calories~sqrt_protein*sqrt_fat*sqrt_carbo, data=UScereal)
sreg1 <-step(mreg5a, direction="both") #retained protein, fat, carb, and interaction of protein:carb
pairs(~sqrt_protein+sqrt_fat+sqrt_carbo,data=UScereal) #variables somewhat correlated

mreg5b<-lm(sqrt_calories~sqrt_protein+sqrt_fat+sqrt_carbo+sqrt_protein:sqrt_carbo, data=UScereal)
summary(mreg5b) #protein, fat, and interaction between protein and carbs was sign.

#6) Can miles per gallon (mpg) be predicted by gross horsepower (hp), weight (wt), quarter mile time (qsec), 
#   or their interactions?
mreg6a<- lm(mpg~hp*wt*qsec,data=mtcars)
sreg2 <-step(mreg6a, direction="both") #hp, wt, qsec, and interaction of hp:wt
mreg6b <-lm(mpg ~ hp + wt + qsec + hp:wt, data=mtcars)
summary(mreg6b) #only weight is sign.
pairs(~hp+wt+qsec,data=mtcars) #hp and wt are highly correlated

plot(hp~wt,data=mtcars)
mreg6c <-lm(mpg~wt+qsec,data=mtcars)
summary(mreg6c)
AIC(mreg6a, mreg6b, mreg6c)
#************************************************************************************************************


#Logistic Regression

#1) Can midwest counties' metropolitan status (inmetro=Yes vs. No) be predicted by the log population
#   density(log_popdensity)?
midwest$log_popdensity <-log(midwest$popdensity)
lreg1 <- glm(inmetro ~ log_popdensity, data=midwest, family="binomial")
summary(lreg1) #sign.

plot(midwest$log_popdensity, midwest$inmetro, 
     ylab="Metro status (1=yes, 0=no)", xlab="log population density")
curve(predict(lreg1, data.frame(log_popdensity=x), type="resp"), add=T)

#2) Can midwest counties' metropolitan status (inmetro=Yes vs. No) be predicted by the percentage of
#   college educated adults (percollege)?
lreg2 <- glm(inmetro ~ percollege, data=midwest, family="binomial")
summary(lreg2) #sign.

plot(midwest$percollege, midwest$inmetro,
     ylab="Metro status (1=yes, 0=no)", xlab="percentage of college educated adults")
curve(predict(lreg2, data.frame(percollege=x), type="resp"), add=T)

#3) Can a car's origin (Origin2: USA vs. non-USA) be predicted by city miles per gallon (MPG.city)?
Cars93_lg <- mutate(Cars93, Origin2=ifelse(Origin=="USA",1,0))
lreg3 <- glm(Origin2 ~ MPG.city, data=Cars93_lg, family="binomial")
summary(lreg3) #sign.

plot(Cars93_lg$MPG.city,Cars93_lg$Origin2,
     ylab="Origin (1=USA, 0=non-USA", xlab="city MPG")
curve(predict(lreg3, data.frame(MPG.city=x), type="resp"), add=T)
points(Cars93_lg$MPG.city, fitted(lreg3), pch=20)

#4) Can a car's origin (Origin2: USA vs. non-USA) be predicted by midrange price (Price)?
lreg4 <- glm(Origin2 ~ Price, data=Cars93_lg, family="binomial")
summary(lreg4) #not sign. 

plot(Cars93_lg$Price,Cars93_lg$Origin2,
     ylab="Origin (1=USA, 0=non-USA", xlab="midrange price ($/thousands")
curve(predict(lreg4, data.frame(Price=x), type="resp"), add=T, col="red")
points(Cars93_lg$Price, fitted(lreg4), pch=20, col="red")

#5) Can the presence of a bacteria species (y: yes vs. no) be predicted by drug status (ap: active/placebo) and
#   compliance (hilo: low/high)?
lreg5 <- glm(y~ap+hilo, data=bacteria, family="binomial")
summary(lreg5)
exp(cbind(OR=coef(lreg7),confint(lreg7)))

#6) Can the presence of a tumor (status: tumor vs. none) be predicted by sex (sex: male/female) and
#   treatment (rex: drug/control)?
lreg6 <-glm(status~rx+sex, data=rats, family="binomial")
summary(lreg6)
exp(cbind(OR=coef(lreg6),confint(lreg6)))