added lab 5

2025-11-04 21:00:48 -05:00
parent 414a4ac5a3
commit 18a911f9d3
4 changed files with 166 additions and 50 deletions
@@ -1,22 +1,28 @@
-install.packages(c("e1071","caret","randomForest","ggplot2","pROC"), dependencies = TRUE)
+install.packages(
+    c("e1071", "caret", "randomForest", "ggplot2", "pROC"),
+    repos = c("https://cloud.r-project.org/"),
+    dependencies = TRUE
+)

 suppressPackageStartupMessages({
-	library(e1071)        # for svm/tune.svm
-	library(caret)        # for metrics
-	library(randomForest) # alternative classifier
-	library(ggplot2)
+    library(e1071) # for svm/tune.svm
+    library(caret) # for metrics
+    library(randomForest) # alternative classifier
+    library(ggplot2)
 })

 set.seed(42)

 read_wine <- function() {
-	df <- read.csv("wine.data", header = FALSE)
-	colnames(df) <- c("Class",
-						"Alcohol","Malic.acid","Ash","Alcalinity.of.ash","Magnesium",
-						"Total.phenols","Flavanoids","Nonflavanoid.phenols","Proanthocyanins",
-						"Color.intensity","Hue","OD280.OD315","Proline")
-	df$Class <- factor(df$Class)
-	df
+    df <- read.csv("wine.data", header = FALSE)
+    colnames(df) <- c(
+        "Class",
+        "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
+        "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
+        "Color.intensity", "Hue", "OD280.OD315", "Proline"
+    )
+    df$Class <- factor(df$Class)
+    df
 }

 df <- read_wine()
@@ -24,56 +30,62 @@ df <- read_wine()
 # split into train/test
 idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
 train <- df[idx, ]
-test  <- df[-idx, ]
+test <- df[-idx, ]

 # choose a subset of features based on ANOVA F-test
 # I picked this subset before the runs:
 # alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
-features <- c("Alcohol","Flavanoids","Color.intensity","OD280.OD315","Proline","Total.phenols")
+features <- c("Alcohol", "Flavanoids", "Color.intensity", "OD280.OD315", "Proline", "Total.phenols")
 x_train <- train[, features]
 y_train <- train$Class
-x_test  <- test[, features]
-y_test  <- test$Class
+x_test <- test[, features]
+y_test <- test$Class

 # scale features
-pp <- preProcess(x_train, method = c("center","scale"))
+pp <- preProcess(x_train, method = c("center", "scale"))
 x_train_s <- predict(pp, x_train)
-x_test_s  <- predict(pp, x_test)
+x_test_s <- predict(pp, x_test)

-# 1) linear kernel svm with hyperparameter tuning (C)
+# linear kernel svm with hyperparameter tuning (C)
 set.seed(42)
 lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
-tune_lin <- tune.svm(x = x_train_s, y = y_train,
-                     kernel = "linear",
-                     cost = lin_grid$cost,
-                     tunecontrol = tune.control(cross = 5))
+tune_lin <- tune.svm(
+    x = x_train_s, y = y_train,
+    kernel = "linear",
+    cost = lin_grid$cost,
+    tunecontrol = tune.control(cross = 5)
+)
 lin_best <- tune_lin$best.model

-# 2) rbf kernel svm with tuning (C, gamma)
+# rbf kernel svm with tuning (C, gamma)
 set.seed(42)
 rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
 rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
-tune_rbf <- tune.svm(x = x_train_s, y = y_train,
-                     kernel = "radial",
-                     cost = rbf_grid_cost,
-                     gamma = rbf_grid_gamma,
-                     tunecontrol = tune.control(cross = 5))
+tune_rbf <- tune.svm(
+    x = x_train_s, y = y_train,
+    kernel = "radial",
+    cost = rbf_grid_cost,
+    gamma = rbf_grid_gamma,
+    tunecontrol = tune.control(cross = 5)
+)
 rbf_best <- tune_rbf$best.model

-# 3) alternative classifier: random forest (same features)
+# alt classifier: random forest (same features)
 set.seed(42)
 rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)

 # evaluation helper
 eval_model <- function(model, x_test_s, y_test, name) {
-	pred <- predict(model, x_test_s)
-	cm <- confusionMatrix(pred, y_test)
-	pr <- data.frame(model = name,
-					accuracy = cm$overall["Accuracy"],
-					precision_macro = mean(cm$byClass[,"Precision"], na.rm=TRUE),
-					recall_macro = mean(cm$byClass[,"Recall"], na.rm=TRUE),
-					f1_macro = mean(cm$byClass[,"F1"], na.rm=TRUE))
-	list(cm = cm, pr = pr)
+    pred <- predict(model, x_test_s)
+    cm <- confusionMatrix(pred, y_test)
+    pr <- data.frame(
+        model = name,
+        accuracy = cm$overall["Accuracy"],
+        precision_macro = mean(cm$byClass[, "Precision"], na.rm = TRUE),
+        recall_macro = mean(cm$byClass[, "Recall"], na.rm = TRUE),
+        f1_macro = mean(cm$byClass[, "F1"], na.rm = TRUE)
+    )
+    list(cm = cm, pr = pr)
 }

 # eval svm models (use scaled features)
@@ -84,11 +96,13 @@ rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
 rf_pred <- predict(rf_fit, x_test)
 rf_cm <- confusionMatrix(rf_pred, y_test)

-rf_pr <- data.frame(model = "random_forest",
-                    accuracy = rf_cm$overall["Accuracy"],
-                    precision_macro = mean(rf_cm$byClass[,"Precision"], na.rm=TRUE),
-                    recall_macro = mean(rf_cm$byClass[,"Recall"], na.rm=TRUE),
-                    f1_macro = mean(rf_cm$byClass[,"F1"], na.rm=TRUE))
+rf_pr <- data.frame(
+    model = "random_forest",
+    accuracy = rf_cm$overall["Accuracy"],
+    precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE),
+    recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE),
+    f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE)
+)

 perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)

@@ -98,14 +112,17 @@ cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n
 print(perf)

 # macro-f1 comparison
-ggplot(perf, aes(x = model, y = f1_macro)) + 
-  geom_col() + 
-  labs(title = "macro-F1 by model (wine test set)")
+ggplot(perf, aes(x = model, y = f1_macro)) +
+    geom_col() +
+    labs(title = "macro-F1 by model (wine test set)")

 # save outputs
 write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
 sink("lab5_confusion_matrices.txt")
-cat("=== svm linear ===\n"); print(lin_eval$cm)
-cat("\n=== svm rbf ===\n"); print(rbf_eval$cm)
-cat("\n=== random forest ===\n"); print(rf_cm)
-sink()
+cat("=== svm linear ===\n")
+print(lin_eval$cm)
+cat("\n=== svm rbf ===\n")
+print(rbf_eval$cm)
+cat("\n=== random forest ===\n")
+print(rf_cm)
+sink()
@@ -0,0 +1,95 @@
+=== svm linear ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+                                          
+               Accuracy : 0.9706          
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118          
+    P-Value [Acc > NIR] : 3.92e-12        
+                                          
+                  Kappa : 0.9553          
+                                          
+ Mcnemar's Test P-Value : NA              
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
+
+=== svm rbf ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+                                          
+               Accuracy : 0.9706          
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118          
+    P-Value [Acc > NIR] : 3.92e-12        
+                                          
+                  Kappa : 0.9553          
+                                          
+ Mcnemar's Test P-Value : NA              
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
+
+=== random forest ===
+Confusion Matrix and Statistics
+
+          Reference
+Prediction  1  2  3
+         1 11  1  0
+         2  0 13  0
+         3  0  0  9
+
+Overall Statistics
+                                          
+               Accuracy : 0.9706          
+                 95% CI : (0.8467, 0.9993)
+    No Information Rate : 0.4118          
+    P-Value [Acc > NIR] : 3.92e-12        
+                                          
+                  Kappa : 0.9553          
+                                          
+ Mcnemar's Test P-Value : NA              
+
+Statistics by Class:
+
+                     Class: 1 Class: 2 Class: 3
+Sensitivity            1.0000   0.9286   1.0000
+Specificity            0.9565   1.0000   1.0000
+Pos Pred Value         0.9167   1.0000   1.0000
+Neg Pred Value         1.0000   0.9524   1.0000
+Prevalence             0.3235   0.4118   0.2647
+Detection Rate         0.3235   0.3824   0.2647
+Detection Prevalence   0.3529   0.3824   0.2647
+Balanced Accuracy      0.9783   0.9643   1.0000
@@ -0,0 +1,4 @@
+model	accuracy	precision_macro	recall_macro	f1_macro
+svm_linear	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
+svm_rbf	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
+random_forest	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466