library(MASS)
library(caret)
library(randomForest)
library(rpart)
library(rpart.plot)
library(rattle)
library(cluster)
library(factoextra)

#-----------------------------------------------------------------------------------------------
# Example 1: Classification of tumors (Supervised Learning)
# Ref: https://machinelearningmastery.com/machine-learning-in-r-step-by-step/

# Get and check the data
data(biopsy)
head(biopsy)
sapply(biopsy, class)  # list the type of each attribute

# New dataset, excluding rows with missing data and converting the predictors to numeric
biopsy1 <- na.exclude(biopsy[, 2:11])
biopsy1[, 1:9] <- sapply(biopsy1[, 1:9], as.numeric)
sapply(biopsy1, class)  # predictors are now numeric

# Validation dataset
validation_index <- createDataPartition(biopsy1$class, p = 0.80, list = FALSE)
# hold out 20% of the data for validation
validation <- biopsy1[-validation_index, ]
# use the remaining 80% of the data for training and testing the models
dataset <- biopsy1[validation_index, ]

# Evaluate algorithms using 10-fold cross-validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Test five classification models
# a) linear algorithm: LDA
set.seed(1)
fit.lda <- train(class ~ ., data = dataset, method = "lda", metric = metric, trControl = control)
# b) nonlinear algorithms: CART
set.seed(1)
fit.cart <- train(class ~ ., data = dataset, method = "rpart", metric = metric, trControl = control)
# kNN
set.seed(1)
fit.knn <- train(class ~ ., data = dataset, method = "knn", metric = metric, trControl = control)
# c) advanced algorithms: SVM with a radial kernel
set.seed(1)
fit.svm <- train(class ~ ., data = dataset, method = "svmRadial", metric = metric, trControl = control)
# Random forest
set.seed(1)
fit.rf <- train(class ~ ., data = dataset, method = "rf", metric = metric, trControl = control)

# Summarize the accuracy of the models
results <- resamples(list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf))
summary(results)
dotplot(results)

# The most accurate model was rf (random forest)
print(fit.rf)

# Make predictions on the held-out validation set
predictions <- predict(fit.rf, validation)
confusionMatrix(predictions, validation$class)

#-----------------------------------------------------------------------------------------------
# Example 2: Classification decision tree for tumors (Supervised Learning)
# (rpart with method = "class" builds a classification tree, not a regression tree)
# Ref: https://appsilon.com/r-decision-treees/
#      https://www.gormanalysis.com/blog/decision-trees-in-r-using-rpart/

# Get and check the data
biopsy2 <- na.exclude(biopsy[, 2:11])
biopsy2[, 1:9] <- sapply(biopsy2[, 1:9], as.numeric)
head(biopsy2)

# Split into training and testing sets (roughly 75/25)
set.seed(1)  # for a reproducible split
sample <- sample(c(TRUE, FALSE), nrow(biopsy2), replace = TRUE, prob = c(0.75, 0.25))
train <- biopsy2[sample, ]
test <- biopsy2[!sample, ]

# Modeling
fit.dt <- rpart(class ~ ., data = train, method = "class")
rpart.plot(fit.dt)
fancyRpartPlot(fit.dt, caption = NULL)

# Feature importance
varImp(fit.dt)

# Make predictions
predictions2 <- predict(fit.dt, newdata = test, type = "class")
head(predictions2)
confusionMatrix(predictions2, test$class)

#-----------------------------------------------------------------------------------------------
# Example 3: Clustering tumors (Unsupervised Learning)
# Ref: https://www.r-bloggers.com/2021/04/cluster-analysis-in-r/
#      https://data-flair.training/blogs/clustering-in-r-tutorial/

# Get the data: keep the nine numeric predictors, dropping the class column
biopsy3 <- na.exclude(biopsy[, 2:11])
biopsy3 <- sapply(biopsy3[, 1:9], as.numeric)
head(biopsy3)

# Clustering with k-means, k = 2 (benign vs. malignant)
set.seed(1)
clust.km <- kmeans(biopsy3, centers = 2)
clust.km

# Graphing
fviz_cluster(clust.km, data = biopsy3)

# Optimal number of clusters via average silhouette width
fviz_nbclust(biopsy3, kmeans, method = "silhouette")
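
#-----------------------------------------------------------------------------------------------
# Extension (an addition, not in the referenced tutorials): because biopsy carries the
# true benign/malignant labels, we can cross-tabulate them against the k-means cluster
# assignments to see how well the unsupervised grouping recovers the diagnosis. A minimal
# sketch; it assumes biopsy3 preserves the row order of the na.exclude()-filtered biopsy.
labels <- na.exclude(biopsy[, 2:11])$class  # class labels aligned with the rows of biopsy3
table(cluster = clust.km$cluster, class = labels)

# Scaling is arguably optional here because all nine predictors share the same 1-10 scale,
# but standardizing first (and using several random starts) is a common safeguard:
set.seed(1)
clust.km.scaled <- kmeans(scale(biopsy3), centers = 2, nstart = 25)
table(cluster = clust.km.scaled$cluster, class = labels)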
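
# Optional cross-check (also an addition): the elbow method plots the total
# within-cluster sum of squares against k; the bend in the curve should agree
# with the silhouette result above (k = 2 is the expectation here, not a guarantee).
fviz_nbclust(biopsy3, kmeans, method = "wss")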