diff --git a/Lab 5/Rplots.pdf b/Lab 5/Rplots.pdf
new file mode 100644
index 0000000..bf95203
Binary files /dev/null and b/Lab 5/Rplots.pdf differ
diff --git a/Lab 5/lab5.r b/Lab 5/lab5.r
new file mode 100644
index 0000000..91b20a6
--- /dev/null
+++ b/Lab 5/lab5.r
@@ -0,0 +1,128 @@
+install.packages(
+  c("e1071", "caret", "randomForest", "ggplot2", "pROC"),
+  repos = c("https://cloud.r-project.org/"),
+  dependencies = TRUE
+)
+
+suppressPackageStartupMessages({
+  library(e1071) # for svm/tune.svm
+  library(caret) # for metrics
+  library(randomForest) # alternative classifier
+  library(ggplot2)
+})
+
+set.seed(42)
+
+read_wine <- function() {
+  df <- read.csv("wine.data", header = FALSE)
+  colnames(df) <- c(
+    "Class",
+    "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
+    "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
+    "Color.intensity", "Hue", "OD280.OD315", "Proline"
+  )
+  df$Class <- factor(df$Class)
+  df
+}
+
+df <- read_wine()
+
+# split into train/test
+idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
+train <- df[idx, ]
+test <- df[-idx, ]
+
+# choose a subset of features based on ANOVA F-test
+# I picked this subset before the runs:
+# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
+features <- c("Alcohol", "Flavanoids", "Color.intensity", "OD280.OD315", "Proline", "Total.phenols")
+x_train <- train[, features]
+y_train <- train$Class
+x_test <- test[, features]
+y_test <- test$Class
+
+# scale features
+pp <- preProcess(x_train, method = c("center", "scale"))
+x_train_s <- predict(pp, x_train)
+x_test_s <- predict(pp, x_test)
+
+# linear kernel svm with hyperparameter tuning (C)
+set.seed(42)
+lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
+tune_lin <- tune.svm(
+  x = x_train_s, y = y_train,
+  kernel = "linear",
+  cost = lin_grid$cost,
+  tunecontrol = tune.control(cross = 5)
+)
+lin_best <- tune_lin$best.model
+
+# rbf kernel svm with tuning (C, gamma)
+set.seed(42)
+rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
+rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
+tune_rbf <- tune.svm(
+  x = x_train_s, y = y_train,
+  kernel = "radial",
+  cost = rbf_grid_cost,
+  gamma = rbf_grid_gamma,
+  tunecontrol = tune.control(cross = 5)
+)
+rbf_best <- tune_rbf$best.model
+
+# alt classifier: random forest (same features)
+set.seed(42)
+rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)
+
+# evaluation helper
+eval_model <- function(model, x_test_s, y_test, name) {
+  pred <- predict(model, x_test_s)
+  cm <- confusionMatrix(pred, y_test)
+  pr <- data.frame(
+    model = name,
+    accuracy = cm$overall["Accuracy"],
+    precision_macro = mean(cm$byClass[, "Precision"], na.rm = TRUE),
+    recall_macro = mean(cm$byClass[, "Recall"], na.rm = TRUE),
+    f1_macro = mean(cm$byClass[, "F1"], na.rm = TRUE)
+  )
+  list(cm = cm, pr = pr)
+}
+
+# eval svm models (use scaled features)
+lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
+rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
+
+# evaluate random forest (no scaling)
+rf_pred <- predict(rf_fit, x_test)
+rf_cm <- confusionMatrix(rf_pred, y_test)
+
+rf_pr <- data.frame(
+  model = "random_forest",
+  accuracy = rf_cm$overall["Accuracy"],
+  precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE),
+  recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE),
+  f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE)
+)
+
+perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
+
+# print
+cat("best params (linear svm): C =", lin_best$cost, "\n")
+cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n\n")
+print(perf)
+
+# macro-f1 comparison
+ggplot(perf, aes(x = model, y = f1_macro)) +
+  geom_col() +
+  labs(title = "macro-F1 by model (wine test set)")
+
+# save outputs
+write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
+sink("lab5_confusion_matrices.txt")
+cat("=== svm linear ===\n")
+print(lin_eval$cm)
+cat("\n=== svm rbf ===\n")
+print(rbf_eval$cm)
+cat("\n=== random forest ===\n")
+print(rf_cm)
+sink()
diff --git a/Lab 5/lab5_confusion_matrices.txt b/Lab 5/lab5_confusion_matrices.txt
new file mode 100644
index 0000000..065a670
--- /dev/null
+++ b/Lab 5/lab5_confusion_matrices.txt
@@ -0,0 +1,95 @@
+=== svm linear ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+
+               Accuracy : 0.9706
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118
+    P-Value [Acc > NIR] : 3.92e-12
+
+                  Kappa : 0.9553
+
+ Mcnemar's Test P-Value : NA
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
+
+=== svm rbf ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+
+               Accuracy : 0.9706
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118
+    P-Value [Acc > NIR] : 3.92e-12
+
+                  Kappa : 0.9553
+
+ Mcnemar's Test P-Value : NA
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
+
+=== random forest ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+
+               Accuracy : 0.9706
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118
+    P-Value [Acc > NIR] : 3.92e-12
+
+                  Kappa : 0.9553
+
+ Mcnemar's Test P-Value : NA
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
diff --git a/Lab 5/lab5_performance_table.txt b/Lab 5/lab5_performance_table.txt
new file mode 100644
index 0000000..9b74dfc
--- /dev/null
+++ b/Lab 5/lab5_performance_table.txt
@@ -0,0 +1,4 @@
+model	accuracy	precision_macro	recall_macro	f1_macro
+svm_linear	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
+svm_rbf	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
+random_forest	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
diff --git a/Lab 5/lab6.r b/Lab 5/lab6.r
deleted file mode 100644
index a1d5067..0000000
--- a/Lab 5/lab6.r
+++ /dev/null
@@ -1,111 +0,0 @@
-install.packages(c("e1071","caret","randomForest","ggplot2","pROC"), dependencies = TRUE)
-
-suppressPackageStartupMessages({
-  library(e1071) # for svm/tune.svm
-  library(caret) # for metrics
-  library(randomForest) # alternative classifier
-  library(ggplot2)
-})
-
-set.seed(42)
-
-read_wine <- function() {
-  df <- read.csv("wine.data", header = FALSE)
-  colnames(df) <- c("Class",
-    "Alcohol","Malic.acid","Ash","Alcalinity.of.ash","Magnesium",
-    "Total.phenols","Flavanoids","Nonflavanoid.phenols","Proanthocyanins",
-    "Color.intensity","Hue","OD280.OD315","Proline")
-  df$Class <- factor(df$Class)
-  df
-}
-
-df <- read_wine()
-
-# split into train/test
-idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
-train <- df[idx, ]
-test <- df[-idx, ]
-
-# choose a subset of features based on ANOVA F-test
-# I picked this sbuset before the runs:
-# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
-features <- c("Alcohol","Flavanoids","Color.intensity","OD280.OD315","Proline","Total.phenols")
-x_train <- train[, features]
-y_train <- train$Class
-x_test <- test[, features]
-y_test <- test$Class
-
-# scale features
-pp <- preProcess(x_train, method = c("center","scale"))
-x_train_s <- predict(pp, x_train)
-x_test_s <- predict(pp, x_test)
-
-# 1) linear kernel svm with hyperparameter tuning (C)
-set.seed(42)
-lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
-tune_lin <- tune.svm(x = x_train_s, y = y_train,
-                     kernel = "linear",
-                     cost = lin_grid$cost,
-                     tunecontrol = tune.control(cross = 5))
-lin_best <- tune_lin$best.model
-
-# 2) rbf kernel svm with tuning (C, gamma)
-set.seed(42)
-rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
-rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
-tune_rbf <- tune.svm(x = x_train_s, y = y_train,
-                     kernel = "radial",
-                     cost = rbf_grid_cost,
-                     gamma = rbf_grid_gamma,
-                     tunecontrol = tune.control(cross = 5))
-rbf_best <- tune_rbf$best.model
-
-# 3) alternative classifier: random forest (same features)
-set.seed(42)
-rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)
-
-# evaluation helper
-eval_model <- function(model, x_test_s, y_test, name) {
-  pred <- predict(model, x_test_s)
-  cm <- confusionMatrix(pred, y_test)
-  pr <- data.frame(model = name,
-                   accuracy = cm$overall["Accuracy"],
-                   precision_macro = mean(cm$byClass[,"Precision"], na.rm=TRUE),
-                   recall_macro = mean(cm$byClass[,"Recall"], na.rm=TRUE),
-                   f1_macro = mean(cm$byClass[,"F1"], na.rm=TRUE))
-  list(cm = cm, pr = pr)
-}
-
-# eval svm models (use scaled features)
-lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
-rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
-
-# evaluate random forest (no scaling)
-rf_pred <- predict(rf_fit, x_test)
-rf_cm <- confusionMatrix(rf_pred, y_test)
-
-rf_pr <- data.frame(model = "random_forest",
-                    accuracy = rf_cm$overall["Accuracy"],
-                    precision_macro = mean(rf_cm$byClass[,"Precision"], na.rm=TRUE),
-                    recall_macro = mean(rf_cm$byClass[,"Recall"], na.rm=TRUE),
-                    f1_macro = mean(rf_cm$byClass[,"F1"], na.rm=TRUE))
-
-perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
-
-# print
-cat("best params (linear svm): C =", lin_best$cost, "\n")
-cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n\n")
-print(perf)
-
-# macro-f1 comparison
-ggplot(perf, aes(x = model, y = f1_macro)) +
-  geom_col() +
-  labs(title = "macro-F1 by model (wine test set)")
-
-# save outputs
-write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
-sink("lab5_confusion_matrices.txt")
-cat("=== svm linear ===\n"); print(lin_eval$cm)
-cat("\n=== svm rbf ===\n"); print(rbf_eval$cm)
-cat("\n=== random forest ===\n"); print(rf_cm)
-sink()
\ No newline at end of file