#---------------------------------------------------------------------------------------------------------------------------
## MANOVA, MANCOVA, and Multivariate Regression ##
#---------------------------------------------------------------------------------------------------------------------------
# Intro stuff:
library(rstatix)
library(plyr)
library(tidyverse)

head(mtcars)

# Outcomes (mpg and qsec); categorical predictors (am, gear); covariate (wt);
# numerical predictors (disp, hp, wt)
mtcars$am2 <- as.factor(mtcars$am)
mtcars$gear2 <- as.factor(mtcars$gear)

#---------------------------------------------------------------------------------------------------------------------------
# Data visualization for MANOVA/MANCOVA
par(mfrow = c(2, 2))
plot(mtcars$mpg ~ mtcars$am2, col = "orange")
plot(mtcars$mpg ~ mtcars$gear2, col = "orange")
plot(mtcars$qsec ~ mtcars$am2, col = "blue")
plot(mtcars$qsec ~ mtcars$gear2, col = "blue")
plot(mtcars$mpg ~ mtcars$wt, col = mtcars$am2, pch = 16)
plot(mtcars$mpg ~ mtcars$wt, col = mtcars$gear2, pch = 16)
plot(mtcars$qsec ~ mtcars$wt, col = mtcars$am2, pch = 17)
plot(mtcars$qsec ~ mtcars$wt, col = mtcars$gear2, pch = 17)
par(mfrow = c(1, 1))

#---------------------------------------------------------------------------------------------------------------------------
# Testing assumptions for MANOVA/MANCOVA

# Normality
hist(mtcars$mpg)
hist(mtcars$qsec) # good enough
mtcars %>%
  select(mpg, qsec) %>%
  mshapiro_test() # good

# Multicollinearity (between the two outcomes)
cor.test(mtcars$mpg, mtcars$qsec) # good

# Linearity
plot(mtcars$mpg, mtcars$qsec) # good

# Homogeneity of variance and covariance
# Frequency + percentage table for the grouping variables (columns 9:10 = am, gear)
ldply(mtcars[, 9:10], function(x) {
  t(rbind(names(table(x)), table(x), paste0(prop.table(table(x)) * 100, "%")))
})
box_m(mtcars[, c("mpg", "qsec")], mtcars$am2)   # good
box_m(mtcars[, c("mpg", "qsec")], mtcars$gear2) # significant, so an issue (use Pillai's)
# gather() is superseded; pivot_longer() is the current tidyr reshaping verb
mtcars %>%
  pivot_longer(cols = c(mpg, qsec), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  levene_test(value ~ am2) # good for mpg, not good for qsec
mtcars %>%
  pivot_longer(cols = c(mpg, qsec), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  levene_test(value ~ gear2) # not good for mpg, not good for qsec

#---------------------------------------------------------------------------------------------------------------------------
# MANOVA (http://www.sthda.com/english/wiki/manova-test-in-r-multivariate-analysis-of-variance)
manova1 <- manova(cbind(mpg, qsec) ~ am2 + gear2, data = mtcars)
summary(manova1)
summary.aov(manova1)

#---------------------------------------------------------------------------------------------------------------------------
# MANCOVA (https://www.rdocumentation.org/packages/jmv/versions/1.2.23/topics/mancova)
mancova1 <- manova(cbind(mpg, qsec) ~ am2 + gear2 + wt, data = mtcars)
summary(mancova1)
summary.aov(mancova1)

#---------------------------------------------------------------------------------------------------------------------------
# Data visualization for multivariate regression
#---------------------------------------------------------------------------------------------------------------------------
# mpg
mtcars_mpg <- data.frame(mpg = mtcars$mpg, disp = mtcars$disp,
                         hp = mtcars$hp, wt = mtcars$wt)
pairs(mtcars_mpg, col = "blue")
# qsec
mtcars_qsec <- data.frame(qsec = mtcars$qsec, disp = mtcars$disp,
                          hp = mtcars$hp, wt = mtcars$wt)
pairs(mtcars_qsec, col = "red")

#---------------------------------------------------------------------------------------------------------------------------
# Testing assumptions for multivariate regression
# Normality and linearity: good, based on the checks previously done for MANOVA/MANCOVA

# Multicollinearity among the X-variables actually used below (disp, hp, wt).
# FIX: the original code ran cor(mtcars[, 4:6]), i.e. hp/drat/wt — but drat is not
# a predictor in the regressions; disp (column 3) is. Check the real predictors:
cor(mtcars[, c("disp", "hp", "wt")])
# disp and wt correlate at ~0.89, above the 0.80 rule of thumb, so collinearity
# between disp and wt is a genuine concern for the models below

#---------------------------------------------------------------------------------------------------------------------------
# Multivariate regression (https://www.datascievo.com/multivariate-regression-in-r/)
mvreg1 <- lm(cbind(mpg, qsec) ~ disp + hp + wt, data = mtcars)
summary(mvreg1)
# Check against the equivalent single-outcome multiple regression models
mvreg2 <- lm(mpg ~ disp + hp + wt, data = mtcars)
mvreg3 <- lm(qsec ~ disp + hp + wt, data = mtcars)
summary(mvreg2)
summary(mvreg3)

#---------------------------------------------------------------------------------------------------------------------------
## Clustering and Recursive Partitioning ##
#---------------------------------------------------------------------------------------------------------------------------
# Intro stuff:
library(cluster)
library(factoextra)
library(dbscan)
library(mclust)

head(mtcars)
mtcars2 <- mtcars[, 1:7] # numeric columns mpg..qsec only
head(mtcars2)

#---------------------------------------------------------------------------------------------------------------------------
# K-means clustering: https://data-flair.training/blogs/clustering-in-r-tutorial/
kmeans1 <- kmeans(mtcars2, centers = 2, nstart = 100)
str(kmeans1)
fviz_cluster(kmeans1, data = mtcars2)
kmeans2 <- kmeans(mtcars2, centers = 3, nstart = 100)
fviz_cluster(kmeans2, data = mtcars2)

# Hierarchical clustering: https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
dist_mat <- dist(mtcars2, method = "euclidean")
hclust1 <- hclust(dist_mat, method = "average")
plot(hclust1)
# Re-plot the dendrogram before each annotation pass
plot(hclust1)
rect.hclust(hclust1, k = 2, border = 2:6)
abline(h = 200, col = "red")
plot(hclust1)
rect.hclust(hclust1, k = 3, border = 2:6)
abline(h = 160, col = "red")

# Density-based clustering: https://en.proft.me/2017/02/3/density-based-clustering-r/
kNNdistplot(mtcars2, k = 2)
abline(h = 60, col = "red") # elbow in the kNN-distance plot suggests eps ~ 60
dbclust1 <- dbscan(mtcars2, eps = 60, minPts = 2)
hullplot(mtcars2, dbclust1$cluster)
kNNdistplot(mtcars2, k = 3)
abline(h = 43, col = "red")
dbclust2 <- dbscan(mtcars2, eps = 43, minPts = 3)
hullplot(mtcars2, dbclust2$cluster)

# Distribution-based (model-based) clustering: https://en.proft.me/2017/02/1/model-based-clustering-r/
mclust1 <- Mclust(mtcars2) # model and G chosen by BIC
mclust1$modelName
mclust1$G
plot(mclust1, what = "classification")
plot(mclust1, "density")
mtcars3 <- mtcars2[, 1:3]
mclust2 <- Mclust(mtcars3)
mclust2$modelName
mclust2$G
plot(mclust2, what = "classification")
plot(mclust2, "density")
mclust3 <- Mclust(mtcars3, G = 2) # force two components
mclust3$modelName
mclust3$G
plot(mclust3, what = "classification")
plot(mclust3, "density")

#---------------------------------------------------------------------------------------------------------------------------
# Recursive partitioning: https://www.statology.org/classification-and-regression-trees-in-r/
library(rpart)
library(rpart.plot)
head(mtcars)

# Regression tree:
tree1 <- rpart(mpg ~ disp + hp + drat + wt + qsec, data = mtcars,
               control = rpart.control(cp = 0.0001))
printcp(tree1)
prp(tree1)
# Prune at the cp with the lowest cross-validated error
best <- tree1$cptable[which.min(tree1$cptable[, "xerror"]), "CP"]
pruned_tree1 <- prune(tree1, cp = best)
prp(pruned_tree1, faclen = 0, extra = 1, roundint = FALSE, digits = 4)

# Predict for a new observation
tree2 <- rpart(mpg ~ disp + wt, data = mtcars,
               control = rpart.control(cp = 0.0001))
prp(tree2)
new_car <- data.frame(wt = 3, disp = 300)
predict(tree2, newdata = new_car)

# Classification tree
mtcars$vs2 <- as.factor(mtcars$vs)
mtcars$am2 <- as.factor(mtcars$am)     # re-created from above; harmless
mtcars$gear2 <- as.factor(mtcars$gear) # re-created from above; harmless
mtcars$carb2 <- as.factor(mtcars$carb)
str(mtcars)
tree3 <- rpart(vs2 ~ mpg + cyl + disp + hp + drat + wt + qsec + am2 + gear2 + carb2,
               data = mtcars, control = rpart.control(cp = 0.0001))
printcp(tree3)
prp(tree3, faclen = 0, extra = 1, roundint = FALSE, digits = 4)
# Less predictive variant
tree4 <- rpart(vs2 ~ drat + am2 + carb2, data = mtcars,
               control = rpart.control(cp = 0.0001))
printcp(tree4)
prp(tree4, faclen = 0, extra = 1, roundint = FALSE, digits = 4)