#---------------------------------------------------------------------------------------------------------------------------
           ##         CANONICAL ANALYSIS ##
#---------------------------------------------------------------------------------------------------------------------------

      #Intro stuff:

#librairies
library(dplyr)
library(ggplot2)
library(GGally)
library(CCA)
library(CCP)
library(MASS)
library(heplots)
library(candisc)
library(vegan)
library(vars)

#dataset
names(Cars93)
head(Cars93)

#---------------------------------------------------------------------------------------------------------------------------

  ######Canonical Correlation Analysis######

#REFERENCE: https://stats.idre.ucla.edu/r/dae/canonical-correlation-analysis/

#set up variable sets
fuel <- data.frame(MPG.highway=Cars93$MPG.highway, Fuel.tank.capacity=Cars93$Fuel.tank.capacity, Weight=Cars93$Weight)
engine <-data.frame(EngineSize=Cars93$EngineSize, RPM=Cars93$RPM, Rev.per.mile=Cars93$Rev.per.mile)

#test normality
par(mfrow=c(3,2))
hist(fuel$MPG.highway, col='blue'); hist(fuel$Fuel.tank.capacity, col='blue'); hist(fuel$Weight, col='blue')
hist(engine$EngineSize, col="red"); hist(engine$RPM, col="red"); hist(engine$Rev.per.mile, col="red")
par(mfrow=c(1,1))

#looking at variable sets
ggpairs(fuel)
ggpairs(engine)

#correlations at and between variable sets
matcor(fuel, engine)

#canonical correlation analysis
cc1<-cc(fuel, engine)

#canonical correlations
cc1$cor

#raw canonical coefficients
cc1[3:4]

#canonical loadings (type of latent variable)
cc2 <-comput(fuel, engine, cc1)
cc2[3:6]

#significance test for canonical dimensions
rho <-cc1$cor
n <-dim(fuel)[1]
p <-length(fuel)
q <-length(engine)

p.asym(rho, n, p, q, tstat = "Wilks") #1st significant only
p.asym(rho, n, p, q, tstat = "Hotelling") #1st significant only
p.asym(rho, n, p, q, tstat = "Pillai") #1-3 significant

#standardized canonical coefficients
s1 <- diag(sqrt(diag(cov(fuel)))) #for fuel
s2 <- diag(sqrt(diag(cov(engine)))) #for engine

s1 %*% cc1$xcoef
s2 %*% cc1$ycoef

#Tests of Canonical Dimensions
#         Canonical  Mult.
#Dimension Corr.      F    df1    df2      p
#1         0.85     18.27    9   211.9    0.0000
#2         0.28      2.01    4   176.0    0.0955
#3         0.07      0.41    1    89.0    0.5220

#Table 2: Standardized Canonical Coefficients
                    #Dimension
                      #1
#Fuel Variables
  #MPG                0.20
  #Tank Capacity      0.07  
  #Weight             1.09
#Engine Variables 
  #Engine size        0.95    
  #RPM                0.07
  #Rev per mile      -0.11           
#---------------------------------------------------------------------------------------------------------------------------


  ######Linear Discriminant Analysis######

#REFERENCE: https://www.statology.org/linear-discriminant-analysis-in-r/

#set up dataset
CarsLD <-data.frame(DriveTrain=Cars93$DriveTrain, MPG.highway=Cars93$MPG.highway, Fuel.tank.capacity=Cars93$Fuel.tank.capacity,
                    Weight=Cars93$Weight, EngineSize=Cars93$EngineSize, RPM=Cars93$RPM, Rev.per.mile=Cars93$Rev.per.mile)

#scale data
CarsLD[2:7]<-scale(CarsLD[2:7])
apply(CarsLD[2:7], 2, mean)
apply(CarsLD[2:7], 2, sd)

#training and test samples
set.seed(1)

sample <-sample(c(TRUE, FALSE), nrow(CarsLD), replace=TRUE, prob=c(0.7,0.3))
train <-CarsLD[sample, ] #70% as training set
test <-CarsLD[!sample, ] #30% as testing set

#fit LDA model
LDA1 <-lda(DriveTrain~., data=CarsLD)
LDA1

#make predictions
predicted <-predict(LDA1, test)

names(predicted)
head(predicted$class)
head(predicted$posterior)
head(predicted$x)

#find accuracy of model
mean(predicted$class==test$DriveTrain) #80% accuracy

#visualize the results

DriveTrain2 <-data.frame(Drivetrain=CarsLD$DriveTrain)
LDA_plot <-cbind(DriveTrain2, predict(LDA1)$x)
ggplot(LDA_plot, aes(LD1, LD2)) +
  geom_point(aes(color=Drivetrain))
#---------------------------------------------------------------------------------------------------------------------------


  ######Canonical Correspondence and Redundancy Analysis######

#REFERENCE:https://mran.microsoft.com/snapshot/2015-11-17/web/packages/vegan/vegan.pdf

#(X, Y, Z matrices) where Z is conditioning matrix
#X, Y, Z -> partial correspondence/redundancy analysis
#X, Y    -> canonical correspondence/redundency analysis
#X       -> principale components analysis

CarCounts <-data.frame(Cylinders=Cars93$Cylinders, Passengers=Cars93$Passengers, Fuel.tank.capacity=Cars93$Fuel.tank.capacity)
CarMetrics<-data.frame(EngineSize=Cars93$EngineSize, Horsepower=Cars93$Horsepower, RPM=Cars93$RPM, 
                   Rev.per.mile=Cars93$Rev.per.mile, Price=Cars93$Price, MPG.highway=Cars93$MPG.highway)

CarCounts$Cylinders <-as.integer(CarCounts$Cylinders)
str(CarCounts);str(CarMetrics)

#basic cca with all variables
Car_CCA1 <-cca(CarCounts, CarMetrics)
Car_CCA1
plot(Car_CCA1)

#more nuanced formula
Car_CCA2 <-cca(CarCounts ~ EngineSize*(RPM + Rev.per.mile) + Price, data=CarMetrics)
Car_CCA2
plot(Car_CCA2)

#redundancy analysis
Car_RDA <-rda(CarCounts, CarMetrics)
Car_RDA
plot(Car_RDA)
#---------------------------------------------------------------------------------------------------------------------------
 

#---------------------------------------------------------------------------------------------------------------------------
##         TIME BASED ##
#---------------------------------------------------------------------------------------------------------------------------


  ######Vector Autoregression######

#REFERENCE: https://www.r-econometrics.com/timeseries/varintro/

set.seed(123) # Reset random number generator for reasons of reproducibility

# Generate sample
t <- 100 # Number of time series observations
k <- 2 # Number of endogenous variables
p <- 2 # Number of lags

# Generate coefficient matrices
A.1 <- matrix(c(.1, .2, .3, .4), k) # Coefficient matrix of lag 1
A.2 <- matrix(c(-.4, -.3, -.2, -.1), k) # Coefficient matrix of lag 2
A <- cbind(A.1, A.2) # Companion form of the coefficient matrices

# Generate series
series <- matrix(0, k, t + 2*p) # Raw series with zeros
for (i in (p + 1):(t + 2*p)){ # Generate series with e ~ N(0,0.5)
  series[, i] <- A.1%*%series[, i-1] + A.2%*%series[, i-2] + rnorm(k, 0, .5)
}

series <- ts(t(series[, -(1:p)])) # Convert to time series format
names <- c("V1", "V2") # Rename variables

plot.ts(series) # Plot the series

var.1 <- VAR(series, 2, type = "none") # Estimate the model

var.aic <- VAR(series, type = "none", lag.max = 5, ic = "AIC")

summary(var.aic)

#true values
A

# Extract coefficients, standard errors etc. from the object
# produced by the VAR function
est_coefs <- coef(var.aic)

# Extract only the coefficients for both dependend variables
# and combine them to a single matrix
est_coefs <- rbind(est_coefs[[1]][, 1], est_coefs[[2]][, 1]) 

# Print the rounded estimates in the console
round(est_coefs, 2)

#Impulse response
# Calculate the IRF
ir.1 <- irf(var.1, impulse = "Series.1", response = "Series.2", n.ahead = 20, ortho = FALSE)

# Plot the IRF
plot(ir.1)

# Calculate impulse response
ir.2 <- irf(var.1,impulse="Series.1",response="Series.2",n.ahead = 20,ortho = FALSE,
            cumulative = TRUE)

# Plot
plot(ir.2)
#---------------------------------------------------------------------------------------------------------------------------

  ##### Principal Response Curves######

#REFERENCE: https://www.rdocumentation.org/packages/vegan/versions/2.4-2/topics/prc

## Chlorpyrifos experiment and experimental design: Pesticide
## treatment in ditches (replicated) and followed over from 4 weeks
## before to 24 weeks after exposure 

data(pyrifos)
week <- gl(11, 12, labels=c(-4, -1, 0.1, 1, 2, 4, 8, 12, 15, 19, 24))
dose <- factor(rep(c(0.1, 0, 0, 0.9, 0, 44, 6, 0.1, 44, 0.9, 0, 6), 11))
ditch <- gl(12, 1, length=132)

# PRC
mod <- prc(pyrifos, dose, week) #response, treatment, time
mod            # RDA

summary(mod)   # PRC

logabu <- colSums(pyrifos)
plot(mod, select = logabu > 100)
## Ditches are randomized, we have a time series, and are only
## interested in the first axis
ctrl <- how(plots = Plots(strata = ditch,type = "free"),
            within = Within(type = "series"), nperm = 99)
anova(mod, permutations = ctrl, first=TRUE)


#reduced
pyrifos_red <-pyrifos[,1:10]

mod2 <- prc(pyrifos_red, dose, week) #response, treatment, time
mod2            # RDA

summary(mod2)   # PRC

plot(mod2)

#individual abundance

pyrifos_combo <-cbind(pyrifos, dose, week)
pyrifos_combo$week <- as.integer(pyrifos_combo$week)

Slyla_m <-aggregate(Slyla~ dose*week, FUN=mean, data=pyrifos_combo)
Aloco_m <-aggregate(Aloco~ dose*week, FUN=mean, data=pyrifos_combo)
Copsp_m <-aggregate(Copsp~ dose*week, FUN=mean, data=pyrifos_combo)
Simve_m <-aggregate(Simve~ dose*week, FUN=mean, data=pyrifos_combo)

g1 <- ggplot()+geom_line(data=Slyla_m, aes(x=week, y=Slyla, col=dose))
g2 <- ggplot()+geom_line(data=Aloco_m, aes(x=week, y=Aloco, col=dose))
g3 <- ggplot()+geom_line(data=Copsp_m, aes(x=week, y=Copsp, col=dose))
g4 <- ggplot()+geom_line(data=Simve_m, aes(x=week, y=Simve, col=dose))

g1;g2;g3;g4