#Resampling Magic R-code: #References: # https://rcompanion.org/handbook/K_01.html # https://www.statology.org/bootstrap-standard-error-in-r/ # https://stackoverflow.com/questions/58393608/bootstrapped-correlation-in-r # https://www.statology.org/k-fold-cross-validation-in-r/ # #https://search.r-project.org/CRAN/refmans/pls/html/jack.test.html #Get data and packages library(lmPerm) library(dplyr) library(ggplot2) library(boot) library(caret) library(pls) #magic cards: setwd("C:/Users/Mark.Williamson.2/OneDrive - North Dakota University System/Desktop/Williamson Data/Magic") magic<-read.csv("cards_condensed_10e_clean.csv") head(magic) boxplot(magic$convertedManaCost~magic$colors, col=c("grey", "green", "red", "blue", "white")) boxplot(magic$convertedManaCost[magic$types=="Creature"]~magic$colors[magic$types=="Creature"], col=c("grey", "green", "red", "blue", "white")) boxplot(magic$toughness[magic$types=="Creature"]~magic$colors[magic$types=="Creature"], col=c("grey", "green", "red", "blue", "white")) boxplot(magic$power[magic$types=="Creature"]~magic$colors[magic$types=="Creature"], col=c("grey", "green", "red", "blue", "white")) creatures <-magic[magic$types=="Creature",] par(mfrow=c(2,3)) hist(creatures$convertedManaCost[creatures$colors=="R"], col="red", main="Mana Cost", xlab="Cost") hist(creatures$convertedManaCost[creatures$colors=="U"], col="blue", main="", xlab="Cost") hist(creatures$convertedManaCost[creatures$colors=="B"], col="grey", main="", xlab="Cost") hist(creatures$convertedManaCost[creatures$colors=="W"], col="white", main="", xlab="Cost") hist(creatures$convertedManaCost[creatures$colors=="G"], col="green", main="", xlab="Cost") par(mfrow=c(1,1)) par(mfrow=c(2,3)) hist(creatures$power[creatures$colors=="R"], col="red", main="Power", xlab="Power") hist(creatures$power[creatures$colors=="U"], col="blue", main="", xlab="Power") hist(creatures$power[creatures$colors=="B"], col="grey", main="", xlab="Power") hist(creatures$power[creatures$colors=="W"], col="white", main="", xlab="Power") hist(creatures$power[creatures$colors=="G"], col="green", main="", xlab="Power") par(mfrow=c(1,1)) par(mfrow=c(2,3)) hist(creatures$toughness[creatures$colors=="R"], col="red", main="Toughness", xlab="Toughness") hist(creatures$toughness[creatures$colors=="U"], col="blue", main="", xlab="Toughness") hist(creatures$toughness[creatures$colors=="B"], col="grey", main="", xlab="Toughness") hist(creatures$toughness[creatures$colors=="W"], col="white", main="", xlab="Toughness") hist(creatures$toughness[creatures$colors=="G"], col="green", main="", xlab="Toughness") par(mfrow=c(1,1)) creatures %>% group_by(colors) %>% summarize(mean_Mana = mean(convertedManaCost), mean_power= mean(power), mean_toughness=mean(toughness)) #------------------------------------------------------------------------------; #Example 1: Permutation test #Question: Is average power different between Green and White creatures? creatures_GW <-subset(creatures, colors =="G" | colors == "W") head(creatures_GW) glm1 <- glm(power ~ colors, data=creatures_GW, family="poisson") summary(glm1) lm1 <-lm(power ~ colors, data=creatures_GW) summary(lm1) anova(lm1) barplot(c(2.74,1.38)~c("G","W"), xlab="Power", ylab="mean", col=c("Green", "White"), ylim=c(0,3)) box() lmp1 <-lmp(power ~ colors, data=creatures_GW, perm="Prob", seqs=FALSE) summary(lmp1) anova(lmp1) #Question: Is average toughness different between Black and Green creatures? creatures_BG <-subset(creatures, colors =="B" | colors == "G") head(creatures_BG) glm2 <- glm(toughness ~ colors, data=creatures_BG, family="poisson") summary(glm2) lm2 <-lm(toughness ~ colors, data=creatures_BG) summary(lm2) anova(lm2) barplot(c(2.12,2.90)~c("B","G"), xlab="Toughness", ylab="mean", col=c("Grey", "Green"), ylim=c(0,3)) box() lmp2 <-lmp(toughness ~ colors, data=creatures_BG, perm="Prob", seqs=FALSE) summary(lmp2) anova(lmp2) #------------------------------------------------------------------------------; #visualization: t_coeff<-unname(lm1$coefficients[2]) #estimate of difference between Green and White creature power set.seed(10) coeff_list <-c() for (i in seq(1:1000)) { perm_colors<-sample(creatures_GW$colors,replace=FALSE) perm_power <-creatures_GW$power perm_df <-data.frame(colors=perm_colors, power=perm_power) plm <-lm(power ~ colors, data=perm_df) perm_coeff <-unname(plm$coefficients[2]) coeff_list <<-c(coeff_list, perm_coeff) } coeff_df <-data.frame(coeff=coeff_list) plot1 <-ggplot(data=coeff_df, aes(x=coeff))+ geom_histogram(, bins=12, fill="blue", col="black") plot1 t_coeff #actual coefficient we got 1000-sum(coeff_list>t_coeff) plot1+ geom_segment(aes(x=t_coeff, y=0, xend=t_coeff, yend=250), colour="red", linewidth=1)+ geom_text(aes(label=c("4/1000, p=0.004")), x=t_coeff, y=257, size=3) #------------------------------------------------------------------------------; #Example 2: Bootstrapping #Question: What is the bootstrap confidence in mean cost for Instants? head(magic[magic$types=="Instant",]) Instant_Cost <-magic$convertedManaCost[magic$types=="Instant"] mean(Instant_Cost) #Bootstrap standard error set.seed(10) meanFunc <- function(x,i){mean(x[i])} reps1 <-boot(Instant_Cost, meanFunc, 50) reps2 <-boot(Instant_Cost, meanFunc, 10000) #Plot Bootstrap distribution plot(reps1) plot(reps2) #Calculate confidence intervals boot.ci(reps1, type="bca") boot.ci(reps2, type="bca") #Question: How confident can we be in a correlation between Mana cost and Creature power? Power_cost <-data.frame(power=creatures$power, cost=creatures$convertedManaCost) head(Power_cost) cor1<-cor(Power_cost$power, Power_cost$cost, method="spearman") cor1 plot(Power_cost$power, Power_cost$cost) corFunc <- function(data,i){cor(data[i,"power"], data[i,"cost"], method="spearman")} reps3 <-boot(Power_cost, corFunc, 10000) reps3 plot(reps3) boot.ci(reps3, type="bca") #density plot plot(density(reps3$t)) abline(v=cor1, lty="dashed") #------------------------------------------------------------------------------; #Example 3: K-fold validation/ Jackknife #Question: How accurate is using power and toughness to predict Mana cost for creatures, using K-fold? lm3 <-lm(convertedManaCost ~ power * toughness, data=creatures) summary(lm3) ctrl <- trainControl(method="cv", number =5) modelA <-train(convertedManaCost ~ power, data=creatures, method="lm", trControl=ctrl) modelB <-train(convertedManaCost ~ toughness, data=creatures, method="lm", trControl=ctrl) modelC <-train(convertedManaCost ~ power + toughness, data=creatures, method="lm", trControl=ctrl) modelD <-train(convertedManaCost ~ power * toughness, data=creatures, method="lm", trControl=ctrl) #RSME=root mean squared error: lower the better; #Rsquared=correlation: the higher the better; #MAE=mean absolute error: lower the better; modelA$results; modelB$results; modelC$results; modelD$results #Model 4 had lowest RSME, highest Rsquared, and lowest MAE modelD$finalModel #Question: what about jackknife? jlm <-pcr(convertedManaCost ~ power * toughness, data=creatures, validation="LOO", jackknife=TRUE) jack.test(jlm) #visualization example: lm3$coefficients power_c <-unname(lm3$coefficients[2]) tough_c <-unname(lm3$coefficients[3]) lc<-length(creatures$convertedManaCost) power_coeff_list <-c() tough_coeff_list <-c() for (i in seq(1:lc)){ new_df <-creatures[-c(i), ] lm_loo <-lm(convertedManaCost ~ power * toughness, data=new_df) power_coeff <-unname(lm_loo$coefficients[2]) tough_coeff <-unname(lm_loo$coefficients[3]) power_coeff_list <<-c(power_coeff_list, power_coeff) tough_coeff_list <<-c(tough_coeff_list, tough_coeff) } power_jack_df <-data.frame(coeff=power_coeff_list) tough_jack_df <-data.frame(coeff=tough_coeff_list) plot2 <-ggplot(data=power_jack_df, aes(x=coeff))+ geom_histogram(bins=12, fill="red", col="black")+ geom_segment(aes(x=power_c, y=0, xend=power_c, yend=120), colour="black", linewidth=1.2)+ labs(x="Power coefficient") plot2 plot3 <-ggplot(data=tough_jack_df, aes(x=coeff))+ geom_histogram(bins=12, fill="green", col="black")+ geom_segment(aes(x=tough_c, y=0, xend=tough_c, yend=95), colour="black", linewidth=1.2)+ labs(x="Toughness coefficient") plot3