added lab 5

2025-11-04 21:00:48 -05:00
parent 414a4ac5a3
commit 18a911f9d3
4 changed files with 166 additions and 50 deletions
@@ -1,22 +1,28 @@
-install.packages(c("e1071","caret","randomForest","ggplot2","pROC"), dependencies = TRUE)
+install.packages(
    c("e1071", "caret", "randomForest", "ggplot2", "pROC"),
    repos = c("https://cloud.r-project.org/"),
    dependencies = TRUE
 )
 suppressPackageStartupMessages({
-	library(e1071)        # for svm/tune.svm
+    library(e1071) # for svm/tune.svm
-	library(caret)        # for metrics
+    library(caret) # for metrics
-	library(randomForest) # alternative classifier
+    library(randomForest) # alternative classifier
-	library(ggplot2)
+    library(ggplot2)
 })
 set.seed(42)
 read_wine <- function() {
-	df <- read.csv("wine.data", header = FALSE)
+    df <- read.csv("wine.data", header = FALSE)
-	colnames(df) <- c("Class",
+    colnames(df) <- c(
-						"Alcohol","Malic.acid","Ash","Alcalinity.of.ash","Magnesium",
+        "Class",
-						"Total.phenols","Flavanoids","Nonflavanoid.phenols","Proanthocyanins",
+        "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
-						"Color.intensity","Hue","OD280.OD315","Proline")
+        "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
-	df$Class <- factor(df$Class)
+        "Color.intensity", "Hue", "OD280.OD315", "Proline"
-	df
+    )
    df$Class <- factor(df$Class)
    df
 }
 df <- read_wine()
@@ -24,56 +30,62 @@ df <- read_wine()
 # split into train/test
 idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
 train <- df[idx, ]
-test  <- df[-idx, ]
+test <- df[-idx, ]
 # choose a subset of features based on ANOVA F-test
 # I picked this subset before the runs:
 # alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
-features <- c("Alcohol","Flavanoids","Color.intensity","OD280.OD315","Proline","Total.phenols")
+features <- c("Alcohol", "Flavanoids", "Color.intensity", "OD280.OD315", "Proline", "Total.phenols")
 x_train <- train[, features]
 y_train <- train$Class
-x_test  <- test[, features]
+x_test <- test[, features]
-y_test  <- test$Class
+y_test <- test$Class
 # scale features
-pp <- preProcess(x_train, method = c("center","scale"))
+pp <- preProcess(x_train, method = c("center", "scale"))
 x_train_s <- predict(pp, x_train)
-x_test_s  <- predict(pp, x_test)
+x_test_s <- predict(pp, x_test)
-# 1) linear kernel svm with hyperparameter tuning (C)
+# linear kernel svm with hyperparameter tuning (C)
 set.seed(42)
 lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
-tune_lin <- tune.svm(x = x_train_s, y = y_train,
+tune_lin <- tune.svm(
-                     kernel = "linear",
+    x = x_train_s, y = y_train,
-                     cost = lin_grid$cost,
+    kernel = "linear",
-                     tunecontrol = tune.control(cross = 5))
+    cost = lin_grid$cost,
    tunecontrol = tune.control(cross = 5)
 )
 lin_best <- tune_lin$best.model
-# 2) rbf kernel svm with tuning (C, gamma)
+# rbf kernel svm with tuning (C, gamma)
 set.seed(42)
 rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
 rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
-tune_rbf <- tune.svm(x = x_train_s, y = y_train,
+tune_rbf <- tune.svm(
-                     kernel = "radial",
+    x = x_train_s, y = y_train,
-                     cost = rbf_grid_cost,
+    kernel = "radial",
-                     gamma = rbf_grid_gamma,
+    cost = rbf_grid_cost,
-                     tunecontrol = tune.control(cross = 5))
+    gamma = rbf_grid_gamma,
    tunecontrol = tune.control(cross = 5)
 )
 rbf_best <- tune_rbf$best.model
-# 3) alternative classifier: random forest (same features)
+# alt classifier: random forest (same features)
 set.seed(42)
 rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)
 # evaluation helper
 eval_model <- function(model, x_test_s, y_test, name) {
-	pred <- predict(model, x_test_s)
+    pred <- predict(model, x_test_s)
-	cm <- confusionMatrix(pred, y_test)
+    cm <- confusionMatrix(pred, y_test)
-	pr <- data.frame(model = name,
+    pr <- data.frame(
-					accuracy = cm$overall["Accuracy"],
+        model = name,
-					precision_macro = mean(cm$byClass[,"Precision"], na.rm=TRUE),
+        accuracy = cm$overall["Accuracy"],
-					recall_macro = mean(cm$byClass[,"Recall"], na.rm=TRUE),
+        precision_macro = mean(cm$byClass[, "Precision"], na.rm = TRUE),
-					f1_macro = mean(cm$byClass[,"F1"], na.rm=TRUE))
+        recall_macro = mean(cm$byClass[, "Recall"], na.rm = TRUE),
-	list(cm = cm, pr = pr)
+        f1_macro = mean(cm$byClass[, "F1"], na.rm = TRUE)
    )
    list(cm = cm, pr = pr)
 }
 # eval svm models (use scaled features)
@@ -84,11 +96,13 @@ rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
 rf_pred <- predict(rf_fit, x_test)
 rf_cm <- confusionMatrix(rf_pred, y_test)
-rf_pr <- data.frame(model = "random_forest",
+rf_pr <- data.frame(
-                    accuracy = rf_cm$overall["Accuracy"],
+    model = "random_forest",
-                    precision_macro = mean(rf_cm$byClass[,"Precision"], na.rm=TRUE),
+    accuracy = rf_cm$overall["Accuracy"],
-                    recall_macro = mean(rf_cm$byClass[,"Recall"], na.rm=TRUE),
+    precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE),
-                    f1_macro = mean(rf_cm$byClass[,"F1"], na.rm=TRUE))
+    recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE),
    f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE)
 )
 perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
@@ -98,14 +112,17 @@ cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n
 print(perf)
 # macro-f1 comparison
-ggplot(perf, aes(x = model, y = f1_macro)) + 
+ggplot(perf, aes(x = model, y = f1_macro)) +
-  geom_col() + 
+    geom_col() +
-  labs(title = "macro-F1 by model (wine test set)")
+    labs(title = "macro-F1 by model (wine test set)")
 # save outputs
 write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
 sink("lab5_confusion_matrices.txt")
-cat("=== svm linear ===\n"); print(lin_eval$cm)
+cat("=== svm linear ===\n")
-cat("\n=== svm rbf ===\n"); print(rbf_eval$cm)
+print(lin_eval$cm)
-cat("\n=== random forest ===\n"); print(rf_cm)
+cat("\n=== svm rbf ===\n")
-sink()
+print(rbf_eval$cm)
 cat("\n=== random forest ===\n")
 print(rf_cm)
 sink()
@@ -0,0 +1,95 @@
 === svm linear ===
 Confusion Matrix and Statistics
          Reference
 Prediction  1  2  3
         1 11  1  0
         2  0 13  0
         3  0  0  9
 Overall Statistics
               Accuracy : 0.9706          
                 95% CI : (0.8467, 0.9993)
    No Information Rate : 0.4118          
    P-Value [Acc > NIR] : 3.92e-12        
                  Kappa : 0.9553          
 Mcnemar's Test P-Value : NA              
 Statistics by Class:
                     Class: 1 Class: 2 Class: 3
 Sensitivity            1.0000   0.9286   1.0000
 Specificity            0.9565   1.0000   1.0000
 Pos Pred Value         0.9167   1.0000   1.0000
 Neg Pred Value         1.0000   0.9524   1.0000
 Prevalence             0.3235   0.4118   0.2647
 Detection Rate         0.3235   0.3824   0.2647
 Detection Prevalence   0.3529   0.3824   0.2647
 Balanced Accuracy      0.9783   0.9643   1.0000
 === svm rbf ===
 Confusion Matrix and Statistics
          Reference
 Prediction  1  2  3
         1 11  1  0
         2  0 13  0
         3  0  0  9
 Overall Statistics
               Accuracy : 0.9706          
                 95% CI : (0.8467, 0.9993)
    No Information Rate : 0.4118          
    P-Value [Acc > NIR] : 3.92e-12        
                  Kappa : 0.9553          
 Mcnemar's Test P-Value : NA              
 Statistics by Class:
                     Class: 1 Class: 2 Class: 3
 Sensitivity            1.0000   0.9286   1.0000
 Specificity            0.9565   1.0000   1.0000
 Pos Pred Value         0.9167   1.0000   1.0000
 Neg Pred Value         1.0000   0.9524   1.0000
 Prevalence             0.3235   0.4118   0.2647
 Detection Rate         0.3235   0.3824   0.2647
 Detection Prevalence   0.3529   0.3824   0.2647
 Balanced Accuracy      0.9783   0.9643   1.0000
 === random forest ===
 Confusion Matrix and Statistics
          Reference
 Prediction  1  2  3
         1 11  1  0
         2  0 13  0
         3  0  0  9
 Overall Statistics
               Accuracy : 0.9706          
                 95% CI : (0.8467, 0.9993)
    No Information Rate : 0.4118          
    P-Value [Acc > NIR] : 3.92e-12        
                  Kappa : 0.9553          
 Mcnemar's Test P-Value : NA              
 Statistics by Class:
                     Class: 1 Class: 2 Class: 3
 Sensitivity            1.0000   0.9286   1.0000
 Specificity            0.9565   1.0000   1.0000
 Pos Pred Value         0.9167   1.0000   1.0000
 Neg Pred Value         1.0000   0.9524   1.0000
 Prevalence             0.3235   0.4118   0.2647
 Detection Rate         0.3235   0.3824   0.2647
 Detection Prevalence   0.3529   0.3824   0.2647
 Balanced Accuracy      0.9783   0.9643   1.0000
@@ -0,0 +1,4 @@
 model	accuracy	precision_macro	recall_macro	f1_macro
 svm_linear	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
 svm_rbf	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466
 random_forest	0.970588235294118	0.972222222222222	0.976190476190476	0.973161567364466