god I am DUMB

2025-11-04 17:43:39 -05:00
parent 4eff5a6378
commit 414a4ac5a3
1 changed files with 0 additions and 0 deletions
@@ -0,0 +1,111 @@
+# Install only the packages that are not already present, so re-running the
+# script does not reinstall every package and its dependencies each time.
+# pROC is kept in the list as in the original, although this script never
+# attaches it.
+required_pkgs <- c("e1071", "caret", "randomForest", "ggplot2", "pROC")
+missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
+if (length(missing_pkgs) > 0) {
+  install.packages(missing_pkgs, dependencies = TRUE)
+}
+
+suppressPackageStartupMessages({
+  library(e1071)        # for svm/tune.svm
+  library(caret)        # for metrics
+  library(randomForest) # alternative classifier
+  library(ggplot2)
+})
+
+# fixed seed so the train/test split below is reproducible
+set.seed(42)
+
+# Read the wine data set from a header-less CSV file.
+#
+# path: file to read; defaults to "wine.data" in the working directory, which
+#       is what a zero-argument call has always read.
+# Returns a data.frame whose first column `Class` is a factor, followed by the
+# 13 named measurement columns.
+# Stops with an explicit message when the file does not have exactly one
+# column per expected name (instead of failing inside `colnames<-`).
+read_wine <- function(path = "wine.data") {
+  col_names <- c(
+    "Class",
+    "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
+    "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
+    "Color.intensity", "Hue", "OD280.OD315", "Proline"
+  )
+  df <- read.csv(path, header = FALSE)
+  if (ncol(df) != length(col_names)) {
+    stop(
+      "expected ", length(col_names), " columns in ", path,
+      " but found ", ncol(df),
+      call. = FALSE
+    )
+  }
+  colnames(df) <- col_names
+  df$Class <- factor(df$Class)
+  df
+}
+
+# load the full data set
+df <- read_wine()
+
+# 80/20 train/test split, partitioned on the class label
+idx <- createDataPartition(df[["Class"]], p = 0.8, list = FALSE)
+train <- df[idx, ]
+test  <- df[-idx, ]
+
+# feature subset (author's note: based on ANOVA F-test, picked before the runs):
+# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
+features <- c(
+  "Alcohol", "Flavanoids", "Color.intensity",
+  "OD280.OD315", "Proline", "Total.phenols"
+)
+x_train <- train[features]
+y_train <- train[["Class"]]
+x_test  <- test[features]
+y_test  <- test[["Class"]]
+
+# center/scale using statistics estimated on the training predictors only,
+# then apply that same transformation to both sets
+pp <- preProcess(x_train, method = c("center", "scale"))
+x_train_s <- predict(pp, x_train)
+x_test_s  <- predict(pp, x_test)
+
+# 1) linear-kernel SVM: grid search over cost (C) with 5-fold cross-validation
+set.seed(42)
+lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
+tune_lin <- tune.svm(
+  x = x_train_s,
+  y = y_train,
+  kernel = "linear",
+  cost = lin_grid[["cost"]],
+  tunecontrol = tune.control(cross = 5)
+)
+# model refit with the best cost found by the search
+lin_best <- tune_lin[["best.model"]]
+
+# 2) RBF-kernel SVM: grid search over cost (C) and gamma, 5-fold cross-validation
+set.seed(42)
+rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
+rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
+tune_rbf <- tune.svm(
+  x = x_train_s,
+  y = y_train,
+  kernel = "radial",
+  cost = rbf_grid_cost,
+  gamma = rbf_grid_gamma,
+  tunecontrol = tune.control(cross = 5)
+)
+# model refit with the best (cost, gamma) pair found by the search
+rbf_best <- tune_rbf[["best.model"]]
+
+# 3) comparison classifier: random forest on the same features (unscaled)
+set.seed(42)
+rf_fit <- randomForest(
+  x = x_train,
+  y = y_train,
+  ntree = 500,
+  mtry = 2,
+  importance = TRUE
+)
+
+# evaluation helper
+#
+# Predicts on a held-out set and summarises the result.
+#   model    : fitted model that predict() accepts
+#   x_test_s : predictor data handed to predict() as the new data
+#   y_test   : factor of true labels for those rows
+#   name     : label stored in the `model` column of the summary
+# Returns list(cm = <caret confusionMatrix object>, pr = <one-row data.frame>)
+# where pr has columns model, accuracy, precision_macro, recall_macro,
+# f1_macro. The macro values are unweighted means over the per-class rows
+# reported by caret, ignoring NA entries.
+eval_model <- function(model, x_test_s, y_test, name) {
+  pred <- predict(model, x_test_s)
+  cm <- confusionMatrix(pred, y_test)
+
+  # For a two-class problem caret returns byClass as a named vector instead of
+  # a class-by-metric matrix; normalise to a one-row matrix so the column
+  # lookups below work in both cases.
+  by_class <- cm$byClass
+  if (is.null(dim(by_class))) {
+    by_class <- matrix(by_class, nrow = 1,
+                       dimnames = list(NULL, names(by_class)))
+  }
+  macro <- function(metric) mean(by_class[, metric], na.rm = TRUE)
+
+  pr <- data.frame(
+    model = name,
+    # [[ ]] drops the "Accuracy" name, which data.frame() would otherwise
+    # adopt as the row name ("Accuracy", "Accuracy1", ... after rbind)
+    accuracy = cm$overall[["Accuracy"]],
+    precision_macro = macro("Precision"),
+    recall_macro = macro("Recall"),
+    f1_macro = macro("F1")
+  )
+  list(cm = cm, pr = pr)
+}
+
+# eval svm models (use scaled features)
+lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
+rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
+
+# evaluate random forest (no scaling) through the same helper, so all three
+# rows of the performance table are computed by one code path
+rf_eval <- eval_model(rf_fit, x_test, y_test, "random_forest")
+rf_cm <- rf_eval$cm
+rf_pr <- rf_eval$pr
+
+# one row per model: model, accuracy, precision_macro, recall_macro, f1_macro
+perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
+
+# print the selected hyperparameters and the performance table
+cat("best params (linear svm): C =", lin_best$cost, "\n")
+cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n\n")
+print(perf)
+
+# macro-f1 comparison
+# A ggplot object is only drawn when it is printed, and top-level values are
+# not auto-printed when the script is run through source(), so print it
+# explicitly.
+f1_plot <- ggplot(perf, aes(x = model, y = f1_macro)) +
+  geom_col() +
+  labs(title = "macro-F1 by model (wine test set)")
+print(f1_plot)
+
+# save outputs
+write.table(perf, file = "lab5_performance_table.txt", sep = "\t",
+            row.names = FALSE, quote = FALSE)
+
+# Divert console output into the confusion-matrix file. The `finally` clause
+# removes the diversion even if one of the print calls fails, so an error
+# cannot leave later console output silently going into the file.
+sink("lab5_confusion_matrices.txt")
+tryCatch(
+  {
+    cat("=== svm linear ===\n")
+    print(lin_eval$cm)
+    cat("\n=== svm rbf ===\n")
+    print(rbf_eval$cm)
+    cat("\n=== random forest ===\n")
+    print(rf_cm)
+  },
+  finally = sink()
+)