added lab 5

This commit is contained in:
2025-11-04 21:00:48 -05:00
parent 414a4ac5a3
commit 18a911f9d3
4 changed files with 166 additions and 50 deletions
BIN
View File
Binary file not shown.
+67 -50
View File
@@ -1,22 +1,28 @@
# Package setup ----
# Install only the packages that are not already available, so re-running the
# script does not re-download and re-install everything on every run.
# pROC is kept in the list because the original setup installs it, even though
# nothing visible in this script attaches it.
required_pkgs <- c("e1071", "caret", "randomForest", "ggplot2", "pROC")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = c("https://cloud.r-project.org/"),
    dependencies = TRUE
  )
}

# Attach each package exactly once; startup banners are suppressed to keep the
# script output readable.
suppressPackageStartupMessages({
  library(e1071) # for svm/tune.svm
  library(caret) # for metrics
  library(randomForest) # alternative classifier
  library(ggplot2)
})
set.seed(42)
#' Load the wine data file into a data frame with named columns.
#'
#' @param path Path to the headerless, comma-separated data file. Defaults to
#'   "wine.data" in the working directory, so `read_wine()` behaves as before.
#' @return A data.frame with 14 columns: `Class` (converted to a factor)
#'   followed by the 13 measurement columns named below.
read_wine <- function(path = "wine.data") {
  wine_cols <- c(
    "Class",
    "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
    "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
    "Color.intensity", "Hue", "OD280.OD315", "Proline"
  )

  # The file has no header row, so column names are assigned by position.
  df <- read.csv(path, header = FALSE)

  # Fail with a readable message instead of the opaque error `colnames<-`
  # raises when the file does not have the expected layout.
  if (ncol(df) != length(wine_cols)) {
    stop(
      "expected ", length(wine_cols), " columns in '", path,
      "' but found ", ncol(df),
      call. = FALSE
    )
  }

  colnames(df) <- wine_cols
  df$Class <- factor(df$Class)
  df
}
df <- read_wine()
@@ -24,56 +30,62 @@ df <- read_wine()
# split into train/test (80% of the rows go to the training set)
idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]

# choose a subset of features based on ANOVA F-test
# I picked this subset before the runs:
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
features <- c(
  "Alcohol", "Flavanoids", "Color.intensity",
  "OD280.OD315", "Proline", "Total.phenols"
)

x_train <- train[, features]
y_train <- train$Class
x_test <- test[, features]
y_test <- test$Class

# scale features
# The centering/scaling parameters are estimated on the training set only and
# then applied to both sets, so no test-set information leaks into the scaling.
pp <- preProcess(x_train, method = c("center", "scale"))
x_train_s <- predict(pp, x_train)
x_test_s <- predict(pp, x_test)
# linear kernel svm with hyperparameter tuning (C)
# Each grid search runs exactly once, directly after set.seed(), so the
# 5-fold cross-validation behind the selected model is reproducible.
set.seed(42)
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
tune_lin <- tune.svm(
  x = x_train_s, y = y_train,
  kernel = "linear",
  cost = lin_grid$cost,
  tunecontrol = tune.control(cross = 5)
)
lin_best <- tune_lin$best.model

# rbf kernel svm with tuning (C, gamma)
set.seed(42)
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
tune_rbf <- tune.svm(
  x = x_train_s, y = y_train,
  kernel = "radial",
  cost = rbf_grid_cost,
  gamma = rbf_grid_gamma,
  tunecontrol = tune.control(cross = 5)
)
rbf_best <- tune_rbf$best.model

# alt classifier: random forest (same features)
# Note: the forest is fit on the unscaled features (x_train), unlike the SVMs.
set.seed(42)
rf_fit <- randomForest(
  x = x_train, y = y_train,
  ntree = 500, mtry = 2, importance = TRUE
)

# evaluation helper
#' Evaluate a fitted classifier on a held-out set.
#'
#' @param model A fitted model object that `predict()` can be called on.
#' @param x_test_s Predictor data passed to `predict()` as the new data. The
#'   SVM callers pass the scaled test features.
#' @param y_test Factor of true class labels for the rows of `x_test_s`.
#' @param name Label stored in the `model` column of the summary row.
#' @return A list with `cm` (the object returned by `confusionMatrix()`) and
#'   `pr` (a one-row data.frame with columns `model`, `accuracy`,
#'   `precision_macro`, `recall_macro`, `f1_macro`).
eval_model <- function(model, x_test_s, y_test, name) {
  pred <- predict(model, x_test_s)
  cm <- confusionMatrix(pred, y_test)

  # Macro averages are the unweighted mean over the per-class rows; classes
  # whose metric is NA are skipped rather than poisoning the mean.
  by_class <- cm$byClass
  pr <- data.frame(
    model = name,
    # [[ ]] drops the "Accuracy" name so it does not become the row name of
    # the summary (and "Accuracy1", "Accuracy2", ... after rbind()).
    accuracy = cm$overall[["Accuracy"]],
    precision_macro = mean(by_class[, "Precision"], na.rm = TRUE),
    recall_macro = mean(by_class[, "Recall"], na.rm = TRUE),
    f1_macro = mean(by_class[, "F1"], na.rm = TRUE)
  )

  list(cm = cm, pr = pr)
}
# eval svm models (use scaled features)
@@ -84,11 +96,13 @@ rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
# The random forest is evaluated on the unscaled test features, matching the
# unscaled features it was fit on.
rf_pred <- predict(rf_fit, x_test)
rf_cm <- confusionMatrix(rf_pred, y_test)

# Same summary columns as the rows produced by eval_model(), so the three
# results can be stacked with rbind().
rf_pr <- data.frame(
  model = "random_forest",
  accuracy = rf_cm$overall[["Accuracy"]],
  precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE),
  recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE),
  f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE)
)

perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
# Drop inherited row names ("Accuracy", "Accuracy1", ...) so the printed
# table is indexed 1..n; the model name already lives in its own column.
rownames(perf) <- NULL
@@ -98,14 +112,17 @@ cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n
print(perf)

# macro-f1 comparison
# print() is explicit so the plot is also drawn when the script is run via
# source(), where top-level values are not auto-printed.
print(
  ggplot(perf, aes(x = model, y = f1_macro)) +
    geom_col() +
    labs(title = "macro-F1 by model (wine test set)")
)

# save outputs
write.table(
  perf,
  file = "lab5_performance_table.txt",
  sep = "\t", row.names = FALSE, quote = FALSE
)

# Divert console output into the report file. The matching sink() sits in
# `finally`, so the console is restored even if one of the print() calls
# fails, and it is called exactly once (an extra sink() only raises a
# "no sink to remove" warning).
sink("lab5_confusion_matrices.txt")
tryCatch(
  {
    cat("=== svm linear ===\n")
    print(lin_eval$cm)
    cat("\n=== svm rbf ===\n")
    print(rbf_eval$cm)
    cat("\n=== random forest ===\n")
    print(rf_cm)
  },
  finally = sink()
)
+95
View File
@@ -0,0 +1,95 @@
=== svm linear ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
=== svm rbf ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
=== random forest ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
+4
View File
@@ -0,0 +1,4 @@
model accuracy precision_macro recall_macro f1_macro
svm_linear 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
svm_rbf 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
random_forest 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466