diff --git a/Lab 5/Rplots.pdf b/Lab 5/Rplots.pdf new file mode 100644 index 0000000..bf95203 Binary files /dev/null and b/Lab 5/Rplots.pdf differ diff --git a/Lab 5/lab5.r b/Lab 5/lab5.r index a1d5067..91b20a6 100644 --- a/Lab 5/lab5.r +++ b/Lab 5/lab5.r @@ -1,22 +1,28 @@ -install.packages(c("e1071","caret","randomForest","ggplot2","pROC"), dependencies = TRUE) +install.packages( + c("e1071", "caret", "randomForest", "ggplot2", "pROC"), + repos = c("https://cloud.r-project.org/"), + dependencies = TRUE +) suppressPackageStartupMessages({ - library(e1071) # for svm/tune.svm - library(caret) # for metrics - library(randomForest) # alternative classifier - library(ggplot2) + library(e1071) # for svm/tune.svm + library(caret) # for metrics + library(randomForest) # alternative classifier + library(ggplot2) }) set.seed(42) read_wine <- function() { - df <- read.csv("wine.data", header = FALSE) - colnames(df) <- c("Class", - "Alcohol","Malic.acid","Ash","Alcalinity.of.ash","Magnesium", - "Total.phenols","Flavanoids","Nonflavanoid.phenols","Proanthocyanins", - "Color.intensity","Hue","OD280.OD315","Proline") - df$Class <- factor(df$Class) - df + df <- read.csv("wine.data", header = FALSE) + colnames(df) <- c( + "Class", + "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium", + "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins", + "Color.intensity", "Hue", "OD280.OD315", "Proline" + ) + df$Class <- factor(df$Class) + df } df <- read_wine() @@ -24,56 +30,62 @@ df <- read_wine() # split into train/test idx <- createDataPartition(df$Class, p = 0.8, list = FALSE) train <- df[idx, ] -test <- df[-idx, ] +test <- df[-idx, ] # choose a subset of features based on ANOVA F-test # I picked this sbuset before the runs: # alcohol, flavanoids, color intensity, od280/od315, proline, total phenols -features <- c("Alcohol","Flavanoids","Color.intensity","OD280.OD315","Proline","Total.phenols") +features <- c("Alcohol", "Flavanoids", "Color.intensity", "OD280.OD315", "Proline", "Total.phenols") x_train <- train[, features] y_train <- train$Class -x_test <- test[, features] -y_test <- test$Class +x_test <- test[, features] +y_test <- test$Class # scale features -pp <- preProcess(x_train, method = c("center","scale")) +pp <- preProcess(x_train, method = c("center", "scale")) x_train_s <- predict(pp, x_train) -x_test_s <- predict(pp, x_test) +x_test_s <- predict(pp, x_test) -# 1) linear kernel svm with hyperparameter tuning (C) +# linear kernel svm with hyperparameter tuning (C) set.seed(42) lin_grid <- data.frame(cost = c(0.1, 1, 10, 100)) -tune_lin <- tune.svm(x = x_train_s, y = y_train, - kernel = "linear", - cost = lin_grid$cost, - tunecontrol = tune.control(cross = 5)) +tune_lin <- tune.svm( + x = x_train_s, y = y_train, + kernel = "linear", + cost = lin_grid$cost, + tunecontrol = tune.control(cross = 5) +) lin_best <- tune_lin$best.model -# 2) rbf kernel svm with tuning (C, gamma) +# rbf kernel svm with tuning (C, gamma) set.seed(42) rbf_grid_cost <- c(0.1, 1, 10, 100, 1000) rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1) -tune_rbf <- tune.svm(x = x_train_s, y = y_train, - kernel = "radial", - cost = rbf_grid_cost, - gamma = rbf_grid_gamma, - tunecontrol = tune.control(cross = 5)) +tune_rbf <- tune.svm( + x = x_train_s, y = y_train, + kernel = "radial", + cost = rbf_grid_cost, + gamma = rbf_grid_gamma, + tunecontrol = tune.control(cross = 5) +) rbf_best <- tune_rbf$best.model -# 3) alternative classifier: random forest (same features) +# alt classifier: random forest (same features) set.seed(42) rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE) # evaluation helper eval_model <- function(model, x_test_s, y_test, name) { - pred <- predict(model, x_test_s) - cm <- confusionMatrix(pred, y_test) - pr <- data.frame(model = name, - accuracy = cm$overall["Accuracy"], - precision_macro = mean(cm$byClass[,"Precision"], na.rm=TRUE), - recall_macro = mean(cm$byClass[,"Recall"], na.rm=TRUE), - f1_macro = mean(cm$byClass[,"F1"], na.rm=TRUE)) - list(cm = cm, pr = pr) + pred <- predict(model, x_test_s) + cm <- confusionMatrix(pred, y_test) + pr <- data.frame( + model = name, + accuracy = cm$overall["Accuracy"], + precision_macro = mean(cm$byClass[, "Precision"], na.rm = TRUE), + recall_macro = mean(cm$byClass[, "Recall"], na.rm = TRUE), + f1_macro = mean(cm$byClass[, "F1"], na.rm = TRUE) + ) + list(cm = cm, pr = pr) } # eval svm models (use scaled features) @@ -84,11 +96,13 @@ rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf") rf_pred <- predict(rf_fit, x_test) rf_cm <- confusionMatrix(rf_pred, y_test) -rf_pr <- data.frame(model = "random_forest", - accuracy = rf_cm$overall["Accuracy"], - precision_macro = mean(rf_cm$byClass[,"Precision"], na.rm=TRUE), - recall_macro = mean(rf_cm$byClass[,"Recall"], na.rm=TRUE), - f1_macro = mean(rf_cm$byClass[,"F1"], na.rm=TRUE)) +rf_pr <- data.frame( + model = "random_forest", + accuracy = rf_cm$overall["Accuracy"], + precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE), + recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE), + f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE) +) perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr) @@ -98,14 +112,17 @@ cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n print(perf) # macro-f1 comparison -ggplot(perf, aes(x = model, y = f1_macro)) + - geom_col() + - labs(title = "macro-F1 by model (wine test set)") +ggplot(perf, aes(x = model, y = f1_macro)) + + geom_col() + + labs(title = "macro-F1 by model (wine test set)") # save outputs write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE) sink("lab5_confusion_matrices.txt") -cat("=== svm linear ===\n"); print(lin_eval$cm) -cat("\n=== svm rbf ===\n"); print(rbf_eval$cm) -cat("\n=== random forest ===\n"); print(rf_cm) -sink() \ No newline at end of file +cat("=== svm linear ===\n") +print(lin_eval$cm) +cat("\n=== svm rbf ===\n") +print(rbf_eval$cm) +cat("\n=== random forest ===\n") +print(rf_cm) +sink() diff --git a/Lab 5/lab5_confusion_matrices.txt b/Lab 5/lab5_confusion_matrices.txt new file mode 100644 index 0000000..065a670 --- /dev/null +++ b/Lab 5/lab5_confusion_matrices.txt @@ -0,0 +1,95 @@ +=== svm linear === +Confusion Matrix and Statistics + + Reference +Prediction 1 2 3 + 1 11 1 0 + 2 0 13 0 + 3 0 0 9 + +Overall Statistics + + Accuracy : 0.9706 + 95% CI : (0.8467, 0.9993) + No Information Rate : 0.4118 + P-Value [Acc > NIR] : 3.92e-12 + + Kappa : 0.9553 + + Mcnemar's Test P-Value : NA + +Statistics by Class: + + Class: 1 Class: 2 Class: 3 +Sensitivity 1.0000 0.9286 1.0000 +Specificity 0.9565 1.0000 1.0000 +Pos Pred Value 0.9167 1.0000 1.0000 +Neg Pred Value 1.0000 0.9524 1.0000 +Prevalence 0.3235 0.4118 0.2647 +Detection Rate 0.3235 0.3824 0.2647 +Detection Prevalence 0.3529 0.3824 0.2647 +Balanced Accuracy 0.9783 0.9643 1.0000 + +=== svm rbf === +Confusion Matrix and Statistics + + Reference +Prediction 1 2 3 + 1 11 1 0 + 2 0 13 0 + 3 0 0 9 + +Overall Statistics + + Accuracy : 0.9706 + 95% CI : (0.8467, 0.9993) + No Information Rate : 0.4118 + P-Value [Acc > NIR] : 3.92e-12 + + Kappa : 0.9553 + + Mcnemar's Test P-Value : NA + +Statistics by Class: + + Class: 1 Class: 2 Class: 3 +Sensitivity 1.0000 0.9286 1.0000 +Specificity 0.9565 1.0000 1.0000 +Pos Pred Value 0.9167 1.0000 1.0000 +Neg Pred Value 1.0000 0.9524 1.0000 +Prevalence 0.3235 0.4118 0.2647 +Detection Rate 0.3235 0.3824 0.2647 +Detection Prevalence 0.3529 0.3824 0.2647 +Balanced Accuracy 0.9783 0.9643 1.0000 + +=== random forest === +Confusion Matrix and Statistics + + Reference +Prediction 1 2 3 + 1 11 1 0 + 2 0 13 0 + 3 0 0 9 + +Overall Statistics + + Accuracy : 0.9706 + 95% CI : (0.8467, 0.9993) + No Information Rate : 0.4118 + P-Value [Acc > NIR] : 3.92e-12 + + Kappa : 0.9553 + + Mcnemar's Test P-Value : NA + +Statistics by Class: + + Class: 1 Class: 2 Class: 3 +Sensitivity 1.0000 0.9286 1.0000 +Specificity 0.9565 1.0000 1.0000 +Pos Pred Value 0.9167 1.0000 1.0000 +Neg Pred Value 1.0000 0.9524 1.0000 +Prevalence 0.3235 0.4118 0.2647 +Detection Rate 0.3235 0.3824 0.2647 +Detection Prevalence 0.3529 0.3824 0.2647 +Balanced Accuracy 0.9783 0.9643 1.0000 diff --git a/Lab 5/lab5_performance_table.txt b/Lab 5/lab5_performance_table.txt new file mode 100644 index 0000000..9b74dfc --- /dev/null +++ b/Lab 5/lab5_performance_table.txt @@ -0,0 +1,4 @@ +model accuracy precision_macro recall_macro f1_macro +svm_linear 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466 +svm_rbf 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466 +random_forest 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466