library(MASS)
library(caret)
library(randomForest)
library(rpart)
library(rpart.plot)
library(rattle)
library(cluster)
library(factoextra)

#-----------------------------------------------------------------------------------------------
# Example 1: Classification of tumors (Supervised Learning)
# Ref: https://machinelearningmastery.com/machine-learning-in-r-step-by-step/

# Get and check the data
data(biopsy)
head(biopsy)
sapply(biopsy, class)  # list the type of each attribute

# New dataset, excluding rows with missing data and converting the predictors to numeric
biopsy1 <- na.exclude(biopsy[, 2:11])
biopsy1[, 1:9] <- sapply(biopsy1[, 1:9], as.numeric)
sapply(biopsy1, class)  # predictors are now numeric

# Validation dataset
validation_index <- createDataPartition(biopsy1$class, p = 0.80, list = FALSE)
# hold out 20% of the data for validation
validation <- biopsy1[-validation_index, ]
# use the remaining 80% of the data for training and testing the models
dataset <- biopsy1[validation_index, ]

# Evaluate algorithms using 10-fold cross-validation
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"

# Test five classification models
# a) linear algorithm: LDA
set.seed(1)
fit.lda <- train(class ~ ., data = dataset, method = "lda", metric = metric, trControl = control)
# b) nonlinear algorithms: CART
set.seed(1)
fit.cart <- train(class ~ ., data = dataset, method = "rpart", metric = metric, trControl = control)
# kNN
set.seed(1)
fit.knn <- train(class ~ ., data = dataset, method = "knn", metric = metric, trControl = control)
# c) advanced algorithms: SVM with a radial kernel
set.seed(1)
fit.svm <- train(class ~ ., data = dataset, method = "svmRadial", metric = metric, trControl = control)
# Random forest
set.seed(1)
fit.rf <- train(class ~ ., data = dataset, method = "rf", metric = metric, trControl = control)

# Summarize the accuracy of the models
results <- resamples(list(lda = fit.lda, cart = fit.cart, knn = fit.knn, svm = fit.svm, rf = fit.rf))
summary(results)
dotplot(results)

# The most accurate model was rf (random forest)
print(fit.rf)

# Make predictions on the held-out validation set
predictions <- predict(fit.rf, validation)
confusionMatrix(predictions, validation$class)

#-----------------------------------------------------------------------------------------------
# Example 2: Classification decision tree for tumors (Supervised Learning)
# (rpart with method = "class" builds a classification tree, not a regression tree)
# Ref: https://appsilon.com/r-decision-treees/
#      https://www.gormanalysis.com/blog/decision-trees-in-r-using-rpart/

# Get and check the data
biopsy2 <- na.exclude(biopsy[, 2:11])
biopsy2[, 1:9] <- sapply(biopsy2[, 1:9], as.numeric)
head(biopsy2)

# Split into training and testing sets (roughly 75/25)
set.seed(1)  # for a reproducible split
sample <- sample(c(TRUE, FALSE), nrow(biopsy2), replace = TRUE, prob = c(0.75, 0.25))
train <- biopsy2[sample, ]
test <- biopsy2[!sample, ]

# Modeling
fit.dt <- rpart(class ~ ., data = train, method = "class")
rpart.plot(fit.dt)
fancyRpartPlot(fit.dt, caption = NULL)

# Feature importance
varImp(fit.dt)

# Make predictions
predictions2 <- predict(fit.dt, newdata = test, type = "class")
head(predictions2)
confusionMatrix(predictions2, test$class)

#-----------------------------------------------------------------------------------------------
# Example 3: Clustering tumors (Unsupervised Learning)
# Ref: https://www.r-bloggers.com/2021/04/cluster-analysis-in-r/
#      https://data-flair.training/blogs/clustering-in-r-tutorial/

# Get the data: keep the nine numeric predictors, dropping the class column
biopsy3 <- na.exclude(biopsy[, 2:11])
biopsy3 <- sapply(biopsy3[, 1:9], as.numeric)
head(biopsy3)

# Clustering with k-means, k = 2 (benign vs. malignant)
set.seed(1)
clust.km <- kmeans(biopsy3, centers = 2)
clust.km

# Graphing
fviz_cluster(clust.km, data = biopsy3)

# Optimal number of clusters via average silhouette width
fviz_nbclust(biopsy3, kmeans, method = "silhouette")
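
#-----------------------------------------------------------------------------------------------
# Extension (an addition, not in the referenced tutorials): because biopsy carries the
# true benign/malignant labels, we can cross-tabulate them against the k-means cluster
# assignments to see how well the unsupervised grouping recovers the diagnosis. A minimal
# sketch; it assumes biopsy3 preserves the row order of the na.exclude()-filtered biopsy.
labels <- na.exclude(biopsy[, 2:11])$class  # class labels aligned with the rows of biopsy3
table(cluster = clust.km$cluster, class = labels)

# Scaling is arguably optional here because all nine predictors share the same 1-10 scale,
# but standardizing first (and using several random starts) is a common safeguard:
set.seed(1)
clust.km.scaled <- kmeans(scale(biopsy3), centers = 2, nstart = 25)
table(cluster = clust.km.scaled$cluster, class = labels)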
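
# Optional cross-check (also an addition): the elbow method plots the total
# within-cluster sum of squares against k; the bend in the curve should agree
# with the silhouette result above (k = 2 is the expectation here, not a guarantee).
fviz_nbclust(biopsy3, kmeans, method = "wss")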