#--------------------------------------------------------------------------------------------------------------------------- ## CANONICAL ANALYSIS ## #--------------------------------------------------------------------------------------------------------------------------- #Intro stuff: #librairies library(dplyr) library(ggplot2) library(GGally) library(CCA) library(CCP) library(MASS) library(heplots) library(candisc) library(vegan) library(vars) #dataset names(Cars93) head(Cars93) #--------------------------------------------------------------------------------------------------------------------------- ######Canonical Correlation Analysis###### #REFERENCE: https://stats.idre.ucla.edu/r/dae/canonical-correlation-analysis/ #set up variable sets fuel <- data.frame(MPG.highway=Cars93$MPG.highway, Fuel.tank.capacity=Cars93$Fuel.tank.capacity, Weight=Cars93$Weight) engine <-data.frame(EngineSize=Cars93$EngineSize, RPM=Cars93$RPM, Rev.per.mile=Cars93$Rev.per.mile) #test normality par(mfrow=c(3,2)) hist(fuel$MPG.highway, col='blue'); hist(fuel$Fuel.tank.capacity, col='blue'); hist(fuel$Weight, col='blue') hist(engine$EngineSize, col="red"); hist(engine$RPM, col="red"); hist(engine$Rev.per.mile, col="red") par(mfrow=c(1,1)) #looking at variable sets ggpairs(fuel) ggpairs(engine) #correlations at and between variable sets matcor(fuel, engine) #canonical correlation analysis cc1<-cc(fuel, engine) #canonical correlations cc1$cor #raw canonical coefficients cc1[3:4] #canonical loadings (type of latent variable) cc2 <-comput(fuel, engine, cc1) cc2[3:6] #significance test for canonical dimensions rho <-cc1$cor n <-dim(fuel)[1] p <-length(fuel) q <-length(engine) p.asym(rho, n, p, q, tstat = "Wilks") #1st significant only p.asym(rho, n, p, q, tstat = "Hotelling") #1st significant only p.asym(rho, n, p, q, tstat = "Pillai") #1-3 significant #standardized canonical coefficients s1 <- diag(sqrt(diag(cov(fuel)))) #for fuel s2 <- diag(sqrt(diag(cov(engine)))) #for engine s1 %*% cc1$xcoef s2 %*% cc1$ycoef #Tests of Canonical Dimensions # Canonical Mult. #Dimension Corr. F df1 df2 p #1 0.85 18.27 9 211.9 0.0000 #2 0.28 2.01 4 176.0 0.0955 #3 0.07 0.41 1 89.0 0.5220 #Table 2: Standardized Canonical Coefficients #Dimension #1 #Fuel Variables #MPG 0.20 #Tank Capacity 0.07 #Weight 1.09 #Engine Variables #Engine size 0.95 #RPM 0.07 #Rev per mile -0.11 #--------------------------------------------------------------------------------------------------------------------------- ######Linear Discriminant Analysis###### #REFERENCE: https://www.statology.org/linear-discriminant-analysis-in-r/ #set up dataset CarsLD <-data.frame(DriveTrain=Cars93$DriveTrain, MPG.highway=Cars93$MPG.highway, Fuel.tank.capacity=Cars93$Fuel.tank.capacity, Weight=Cars93$Weight, EngineSize=Cars93$EngineSize, RPM=Cars93$RPM, Rev.per.mile=Cars93$Rev.per.mile) #scale data CarsLD[2:7]<-scale(CarsLD[2:7]) apply(CarsLD[2:7], 2, mean) apply(CarsLD[2:7], 2, sd) #training and test samples set.seed(1) sample <-sample(c(TRUE, FALSE), nrow(CarsLD), replace=TRUE, prob=c(0.7,0.3)) train <-CarsLD[sample, ] #70% as training set test <-CarsLD[!sample, ] #30% as testing set #fit LDA model LDA1 <-lda(DriveTrain~., data=CarsLD) LDA1 #make predictions predicted <-predict(LDA1, test) names(predicted) head(predicted$class) head(predicted$posterior) head(predicted$x) #find accuracy of model mean(predicted$class==test$DriveTrain) #80% accuracy #visualize the results DriveTrain2 <-data.frame(Drivetrain=CarsLD$DriveTrain) LDA_plot <-cbind(DriveTrain2, predict(LDA1)$x) ggplot(LDA_plot, aes(LD1, LD2)) + geom_point(aes(color=Drivetrain)) #--------------------------------------------------------------------------------------------------------------------------- ######Canonical Correspondence and Redundancy Analysis###### #REFERENCE:https://mran.microsoft.com/snapshot/2015-11-17/web/packages/vegan/vegan.pdf #(X, Y, Z matrices) where Z is conditioning matrix #X, Y, Z -> partial correspondence/redundancy analysis #X, Y -> canonical correspondence/redundency analysis #X -> principale components analysis CarCounts <-data.frame(Cylinders=Cars93$Cylinders, Passengers=Cars93$Passengers, Fuel.tank.capacity=Cars93$Fuel.tank.capacity) CarMetrics<-data.frame(EngineSize=Cars93$EngineSize, Horsepower=Cars93$Horsepower, RPM=Cars93$RPM, Rev.per.mile=Cars93$Rev.per.mile, Price=Cars93$Price, MPG.highway=Cars93$MPG.highway) CarCounts$Cylinders <-as.integer(CarCounts$Cylinders) str(CarCounts);str(CarMetrics) #basic cca with all variables Car_CCA1 <-cca(CarCounts, CarMetrics) Car_CCA1 plot(Car_CCA1) #more nuanced formula Car_CCA2 <-cca(CarCounts ~ EngineSize*(RPM + Rev.per.mile) + Price, data=CarMetrics) Car_CCA2 plot(Car_CCA2) #redundancy analysis Car_RDA <-rda(CarCounts, CarMetrics) Car_RDA plot(Car_RDA) #--------------------------------------------------------------------------------------------------------------------------- #--------------------------------------------------------------------------------------------------------------------------- ## TIME BASED ## #--------------------------------------------------------------------------------------------------------------------------- ######Vector Autoregression###### #REFERENCE: https://www.r-econometrics.com/timeseries/varintro/ set.seed(123) # Reset random number generator for reasons of reproducibility # Generate sample t <- 100 # Number of time series observations k <- 2 # Number of endogenous variables p <- 2 # Number of lags # Generate coefficient matrices A.1 <- matrix(c(.1, .2, .3, .4), k) # Coefficient matrix of lag 1 A.2 <- matrix(c(-.4, -.3, -.2, -.1), k) # Coefficient matrix of lag 2 A <- cbind(A.1, A.2) # Companion form of the coefficient matrices # Generate series series <- matrix(0, k, t + 2*p) # Raw series with zeros for (i in (p + 1):(t + 2*p)){ # Generate series with e ~ N(0,0.5) series[, i] <- A.1%*%series[, i-1] + A.2%*%series[, i-2] + rnorm(k, 0, .5) } series <- ts(t(series[, -(1:p)])) # Convert to time series format names <- c("V1", "V2") # Rename variables plot.ts(series) # Plot the series var.1 <- VAR(series, 2, type = "none") # Estimate the model var.aic <- VAR(series, type = "none", lag.max = 5, ic = "AIC") summary(var.aic) #true values A # Extract coefficients, standard errors etc. from the object # produced by the VAR function est_coefs <- coef(var.aic) # Extract only the coefficients for both dependend variables # and combine them to a single matrix est_coefs <- rbind(est_coefs[[1]][, 1], est_coefs[[2]][, 1]) # Print the rounded estimates in the console round(est_coefs, 2) #Impulse response # Calculate the IRF ir.1 <- irf(var.1, impulse = "Series.1", response = "Series.2", n.ahead = 20, ortho = FALSE) # Plot the IRF plot(ir.1) # Calculate impulse response ir.2 <- irf(var.1,impulse="Series.1",response="Series.2",n.ahead = 20,ortho = FALSE, cumulative = TRUE) # Plot plot(ir.2) #--------------------------------------------------------------------------------------------------------------------------- ##### Principal Response Curves###### #REFERENCE: https://www.rdocumentation.org/packages/vegan/versions/2.4-2/topics/prc ## Chlorpyrifos experiment and experimental design: Pesticide ## treatment in ditches (replicated) and followed over from 4 weeks ## before to 24 weeks after exposure data(pyrifos) week <- gl(11, 12, labels=c(-4, -1, 0.1, 1, 2, 4, 8, 12, 15, 19, 24)) dose <- factor(rep(c(0.1, 0, 0, 0.9, 0, 44, 6, 0.1, 44, 0.9, 0, 6), 11)) ditch <- gl(12, 1, length=132) # PRC mod <- prc(pyrifos, dose, week) #response, treatment, time mod # RDA summary(mod) # PRC logabu <- colSums(pyrifos) plot(mod, select = logabu > 100) ## Ditches are randomized, we have a time series, and are only ## interested in the first axis ctrl <- how(plots = Plots(strata = ditch,type = "free"), within = Within(type = "series"), nperm = 99) anova(mod, permutations = ctrl, first=TRUE) #reduced pyrifos_red <-pyrifos[,1:10] mod2 <- prc(pyrifos_red, dose, week) #response, treatment, time mod2 # RDA summary(mod2) # PRC plot(mod2) #individual abundance pyrifos_combo <-cbind(pyrifos, dose, week) pyrifos_combo$week <- as.integer(pyrifos_combo$week) Slyla_m <-aggregate(Slyla~ dose*week, FUN=mean, data=pyrifos_combo) Aloco_m <-aggregate(Aloco~ dose*week, FUN=mean, data=pyrifos_combo) Copsp_m <-aggregate(Copsp~ dose*week, FUN=mean, data=pyrifos_combo) Simve_m <-aggregate(Simve~ dose*week, FUN=mean, data=pyrifos_combo) g1 <- ggplot()+geom_line(data=Slyla_m, aes(x=week, y=Slyla, col=dose)) g2 <- ggplot()+geom_line(data=Aloco_m, aes(x=week, y=Aloco, col=dose)) g3 <- ggplot()+geom_line(data=Copsp_m, aes(x=week, y=Copsp, col=dose)) g4 <- ggplot()+geom_line(data=Simve_m, aes(x=week, y=Simve, col=dose)) g1;g2;g3;g4