# Wine classification lab: linear SVM vs. RBF SVM vs. random forest.
#
# Reads the wine data, fits the three classifiers on a fixed feature subset,
# prints a performance summary, draws a macro-F1 bar chart, and writes the
# performance table and confusion matrices to the working directory.

# Dependencies ----

# Install only the packages that are missing so that re-running the script
# does not reinstall everything each time.
# NOTE(review): pROC is installed but never attached in this script; confirm
# whether it is still needed.
required_pkgs <- c("e1071", "caret", "randomForest", "ggplot2", "pROC")
is_installed <- vapply(
  required_pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = c("https://cloud.r-project.org/"),
    dependencies = TRUE
  )
}

suppressPackageStartupMessages({
  library(e1071)        # for svm/tune.svm
  library(caret)        # for metrics
  library(randomForest) # alternative classifier
  library(ggplot2)
})

set.seed(42)

# Data ----

#' Read the wine data set into a data frame.
#'
#' @param path Path to the header-less CSV file. Defaults to "wine.data" in
#'   the current working directory.
#' @return A data frame with 14 named columns; `Class` is converted to a
#'   factor, the remaining columns are left as read by `read.csv()`.
read_wine <- function(path = "wine.data") {
  if (!file.exists(path)) {
    stop("wine data file not found: ", path, call. = FALSE)
  }

  col_names <- c(
    "Class", "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash",
    "Magnesium", "Total.phenols", "Flavanoids", "Nonflavanoid.phenols",
    "Proanthocyanins", "Color.intensity", "Hue", "OD280.OD315", "Proline"
  )

  df <- read.csv(path, header = FALSE)
  # Fail loudly instead of silently mislabelling columns.
  if (ncol(df) != length(col_names)) {
    stop(
      "expected ", length(col_names), " columns in ", path,
      " but found ", ncol(df),
      call. = FALSE
    )
  }

  colnames(df) <- col_names
  df$Class <- factor(df$Class)
  df
}

df <- read_wine()

# split into train/test (80/20, stratified on Class by createDataPartition)
idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]

# Feature subset picked before the runs (per the author's note, based on an
# ANOVA F-test; the test itself is not run in this script):
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
features <- c(
  "Alcohol", "Flavanoids", "Color.intensity",
  "OD280.OD315", "Proline", "Total.phenols"
)

x_train <- train[, features]
y_train <- train$Class
x_test <- test[, features]
y_test <- test$Class

# scale features; centering/scaling parameters come from the training set
# only and are then applied to the test set
pp <- preProcess(x_train, method = c("center", "scale"))
x_train_s <- predict(pp, x_train)
x_test_s <- predict(pp, x_test)

# Models ----

# NOTE(review): e1071::svm() is documented to scale inputs by default
# (scale = TRUE), so the SVM features are presumably standardised twice;
# redundant rather than harmful - confirm before changing.

# linear kernel svm with hyperparameter tuning (C), 5-fold cross-validation
set.seed(42)
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
tune_lin <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "linear",
  cost = lin_grid$cost,
  tunecontrol = tune.control(cross = 5)
)
lin_best <- tune_lin$best.model

# rbf kernel svm with tuning (C, gamma), 5-fold cross-validation
set.seed(42)
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
tune_rbf <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "radial",
  cost = rbf_grid_cost,
  gamma = rbf_grid_gamma,
  tunecontrol = tune.control(cross = 5)
)
rbf_best <- tune_rbf$best.model

# alt classifier: random forest (same features, unscaled)
set.seed(42)
rf_fit <- randomForest(
  x = x_train,
  y = y_train,
  ntree = 500,
  mtry = 2,
  importance = TRUE
)

# Evaluation ----

#' Evaluate a fitted classifier on held-out data.
#'
#' @param model A fitted model accepted by `predict()`.
#' @param x_test_s Test features, preprocessed the same way as the features
#'   the model was trained on.
#' @param y_test Factor of true test labels.
#' @param name Label stored in the `model` column of the summary row.
#' @return A list with `cm` (the caret confusion matrix) and `pr` (a one-row
#'   data frame holding accuracy and macro-averaged precision, recall, F1).
eval_model <- function(model, x_test_s, y_test, name) {
  pred <- predict(model, x_test_s)
  cm <- confusionMatrix(pred, y_test)
  by_class <- cm$byClass

  pr <- data.frame(
    model = name,
    # `[[` drops the element name so it does not leak into the row names.
    accuracy = cm$overall[["Accuracy"]],
    precision_macro = mean(by_class[, "Precision"], na.rm = TRUE),
    recall_macro = mean(by_class[, "Recall"], na.rm = TRUE),
    f1_macro = mean(by_class[, "F1"], na.rm = TRUE),
    row.names = NULL
  )

  list(cm = cm, pr = pr)
}

# eval svm models (use scaled features)
lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")

# evaluate random forest (no scaling) through the same helper
rf_eval <- eval_model(rf_fit, x_test, y_test, "random_forest")
rf_cm <- rf_eval$cm
rf_pr <- rf_eval$pr

perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)

# Reporting ----

# print
cat("best params (linear svm): C =", lin_best$cost, "\n")
cat(
  "best params (rbf svm): C =", rbf_best$cost,
  " gamma =", rbf_best$gamma, "\n\n"
)
print(perf)

# macro-f1 comparison; printed explicitly so the plot is also drawn when the
# script is run through source()
f1_plot <- ggplot(perf, aes(x = model, y = f1_macro)) +
  geom_col() +
  labs(title = "macro-F1 by model (wine test set)")
print(f1_plot)

# save outputs
write.table(
  perf,
  file = "lab5_performance_table.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

# Divert printed confusion matrices to a file; `finally` restores the console
# and closes the connection even if one of the print calls fails.
cm_con <- file("lab5_confusion_matrices.txt", open = "wt")
sink(cm_con)
tryCatch(
  {
    cat("=== svm linear ===\n")
    print(lin_eval$cm)
    cat("\n=== svm rbf ===\n")
    print(rbf_eval$cm)
    cat("\n=== random forest ===\n")
    print(rf_cm)
  },
  finally = {
    sink()
    close(cm_con)
  }
)