Merge branch 'transfer' of https://git.ion606.com/ION606/Data-Analytics into transfer

This commit is contained in:
ION606
2025-11-14 15:54:16 -05:00
5 changed files with 227 additions and 111 deletions
BIN
View File
Binary file not shown.
+128
View File
@@ -0,0 +1,128 @@
# Install only the packages that are not available yet, so that re-running the
# script does not download and reinstall everything each time.
# NOTE(review): pROC is installed here but no library(pROC) call is visible in
# this script - confirm whether it is still needed.
required_pkgs <- c("e1071", "caret", "randomForest", "ggplot2", "pROC")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = c("https://cloud.r-project.org/"),
    dependencies = TRUE
  )
}
suppressPackageStartupMessages({
library(e1071) # for svm/tune.svm
library(caret) # for metrics
library(randomForest) # alternative classifier
library(ggplot2)
})
# fix the RNG seed so the random train/test partition created further down
# gives the same split on every run
set.seed(42)
# Read the wine data set into a data frame.
#
# Args:
#   path: path to the headerless, comma-separated data file. Defaults to
#         "wine.data" in the working directory, so a bare read_wine() call
#         behaves as before.
#
# Returns:
#   A data frame whose first column `Class` is a factor and whose remaining
#   columns carry the attribute names assigned below.
#
# Stops with an explicit message when the file is missing or does not have
# one column per expected name.
read_wine <- function(path = "wine.data") {
  wine_cols <- c(
    "Class",
    "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
    "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
    "Color.intensity", "Hue", "OD280.OD315", "Proline"
  )
  if (!file.exists(path)) {
    stop("wine data file not found: ", path, call. = FALSE)
  }
  df <- read.csv(path, header = FALSE)
  # assigning a names vector of the wrong length would otherwise either fail
  # with a generic message or silently leave NA column names behind
  if (ncol(df) != length(wine_cols)) {
    stop(
      "expected ", length(wine_cols), " columns in ", path,
      " but found ", ncol(df),
      call. = FALSE
    )
  }
  colnames(df) <- wine_cols
  df$Class <- factor(df$Class)
  df
}
# load the full wine table
df <- read_wine()

# hold out 20% of the rows for testing; the partition is drawn on the class label
idx <- createDataPartition(y = df[["Class"]], p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]

# feature subset (chosen based on an ANOVA F-test)
# this subset was fixed before any of the runs:
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
features <- c(
  "Alcohol", "Flavanoids", "Color.intensity",
  "OD280.OD315", "Proline", "Total.phenols"
)

# labels and predictors for each partition
y_train <- train[["Class"]]
y_test <- test[["Class"]]
x_train <- train[features]
x_test <- test[features]

# center/scale: the transform is fitted on the training predictors only and
# then applied unchanged to both partitions
pp <- preProcess(x_train, method = c("center", "scale"))
x_train_s <- predict(pp, newdata = x_train)
x_test_s <- predict(pp, newdata = x_test)
# linear-kernel svm: 5-fold cross-validated search over the cost parameter C
set.seed(42)
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
tune_lin <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "linear",
  cost = lin_grid[["cost"]],
  tunecontrol = tune.control(cross = 5)
)
lin_best <- tune_lin[["best.model"]]

# rbf-kernel svm: 5-fold cross-validated search over the (C, gamma) grid
set.seed(42)
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
tune_rbf <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "radial",
  cost = rbf_grid_cost,
  gamma = rbf_grid_gamma,
  tunecontrol = tune.control(cross = 5)
)
rbf_best <- tune_rbf[["best.model"]]

# alternative classifier: random forest on the same (unscaled) features
set.seed(42)
rf_fit <- randomForest(
  x = x_train,
  y = y_train,
  ntree = 500,
  mtry = 2,
  importance = TRUE
)
# evaluation helper
#
# Predicts with `model` and summarises the result against the true labels.
#
# Args:
#   model:    fitted model object that has a predict() method
#   x_test_s: predictors handed to predict() as its second argument
#   y_test:   true class labels, passed to confusionMatrix() as the reference
#   name:     label stored in the `model` column of the summary row
#
# Returns:
#   list(cm = <confusionMatrix result>,
#        pr = <one-row data frame: model, accuracy, precision_macro,
#              recall_macro, f1_macro>)
eval_model <- function(model, x_test_s, y_test, name) {
  pred <- predict(model, x_test_s)
  cm <- confusionMatrix(pred, y_test)

  # With three or more classes `byClass` is a matrix (one row per class); for
  # a two-class problem caret returns a plain named vector instead, on which
  # `[, "Precision"]` fails. Promote the vector to a one-row matrix so both
  # shapes share one code path; in the two-class case the "macro" values are
  # then simply the single set of statistics caret reports.
  by_class <- cm$byClass
  if (is.null(dim(by_class))) {
    by_class <- matrix(
      by_class,
      nrow = 1,
      dimnames = list(NULL, names(by_class))
    )
  }
  # unweighted mean over classes; classes whose metric is NA are skipped
  macro_avg <- function(metric) mean(by_class[, metric], na.rm = TRUE)

  pr <- data.frame(
    model = name,
    accuracy = cm$overall["Accuracy"],
    precision_macro = macro_avg("Precision"),
    recall_macro = macro_avg("Recall"),
    f1_macro = macro_avg("F1")
  )
  list(cm = cm, pr = pr)
}
# eval svm models (use scaled features)
lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
# evaluate random forest (no scaling) through the same helper, so all three
# models are scored by exactly the same code
rf_eval <- eval_model(rf_fit, x_test, y_test, "random_forest")
rf_cm <- rf_eval$cm
rf_pr <- rf_eval$pr
# one row per model
perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
# the rows otherwise inherit the names "Accuracy", "Accuracy1", ... from the
# named accuracy value, which is misleading when the table is printed
rownames(perf) <- NULL
# print the selected hyperparameters and the per-model summary table
cat("best params (linear svm): C =", lin_best$cost, "\n")
cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n\n")
print(perf)
# macro-f1 comparison
f1_plot <- ggplot(perf, aes(x = model, y = f1_macro)) +
  geom_col() +
  labs(title = "macro-F1 by model (wine test set)")
# a bare ggplot object is only drawn when it is auto-printed; that does not
# happen when the script is run through source(), so print it explicitly
print(f1_plot)
# save outputs
write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# divert console output into the report file; `finally` guarantees that the
# diversion is removed again even if one of the print calls fails, so later
# console output is not silently swallowed by the file
sink("lab5_confusion_matrices.txt")
tryCatch(
  {
    cat("=== svm linear ===\n")
    print(lin_eval$cm)
    cat("\n=== svm rbf ===\n")
    print(rbf_eval$cm)
    cat("\n=== random forest ===\n")
    print(rf_cm)
  },
  finally = sink()
)
+95
View File
@@ -0,0 +1,95 @@
=== svm linear ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
=== svm rbf ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
=== random forest ===
Confusion Matrix and Statistics
Reference
Prediction 1 2 3
1 11 1 0
2 0 13 0
3 0 0 9
Overall Statistics
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.4118
P-Value [Acc > NIR] : 3.92e-12
Kappa : 0.9553
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 1.0000 0.9286 1.0000
Specificity 0.9565 1.0000 1.0000
Pos Pred Value 0.9167 1.0000 1.0000
Neg Pred Value 1.0000 0.9524 1.0000
Prevalence 0.3235 0.4118 0.2647
Detection Rate 0.3235 0.3824 0.2647
Detection Prevalence 0.3529 0.3824 0.2647
Balanced Accuracy 0.9783 0.9643 1.0000
+4
View File
@@ -0,0 +1,4 @@
model accuracy precision_macro recall_macro f1_macro
svm_linear 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
svm_rbf 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
random_forest 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
-111
View File
@@ -1,111 +0,0 @@
# Install only the packages that are not available yet, so that re-running the
# script does not download and reinstall everything each time. An explicit
# CRAN mirror is given because install.packages() cannot prompt for one in a
# non-interactive run.
# NOTE(review): pROC is installed here but no library(pROC) call is visible in
# this script - confirm whether it is still needed.
required_pkgs <- c("e1071", "caret", "randomForest", "ggplot2", "pROC")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = "https://cloud.r-project.org/",
    dependencies = TRUE
  )
}
suppressPackageStartupMessages({
library(e1071) # for svm/tune.svm
library(caret) # for metrics
library(randomForest) # alternative classifier
library(ggplot2)
})
# fix the RNG seed so the random train/test partition created further down
# gives the same split on every run
set.seed(42)
# Read the wine data set into a data frame.
#
# Args:
#   path: path to the headerless, comma-separated data file. Defaults to
#         "wine.data" in the working directory, so a bare read_wine() call
#         behaves as before.
#
# Returns:
#   A data frame whose first column `Class` is a factor and whose remaining
#   columns carry the attribute names assigned below.
#
# Stops with an explicit message when the file is missing or does not have
# one column per expected name.
read_wine <- function(path = "wine.data") {
  wine_cols <- c(
    "Class",
    "Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
    "Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
    "Color.intensity", "Hue", "OD280.OD315", "Proline"
  )
  if (!file.exists(path)) {
    stop("wine data file not found: ", path, call. = FALSE)
  }
  df <- read.csv(path, header = FALSE)
  # assigning a names vector of the wrong length would otherwise either fail
  # with a generic message or silently leave NA column names behind
  if (ncol(df) != length(wine_cols)) {
    stop(
      "expected ", length(wine_cols), " columns in ", path,
      " but found ", ncol(df),
      call. = FALSE
    )
  }
  colnames(df) <- wine_cols
  df$Class <- factor(df$Class)
  df
}
# load the full wine table
df <- read_wine()

# hold out 20% of the rows for testing; the partition is drawn on the class label
idx <- createDataPartition(y = df[["Class"]], p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]

# feature subset (chosen based on an ANOVA F-test)
# this subset was fixed before any of the runs:
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
features <- c(
  "Alcohol", "Flavanoids", "Color.intensity",
  "OD280.OD315", "Proline", "Total.phenols"
)

# labels and predictors for each partition
y_train <- train[["Class"]]
y_test <- test[["Class"]]
x_train <- train[features]
x_test <- test[features]

# center/scale: the transform is fitted on the training predictors only and
# then applied unchanged to both partitions
pp <- preProcess(x_train, method = c("center", "scale"))
x_train_s <- predict(pp, newdata = x_train)
x_test_s <- predict(pp, newdata = x_test)
# 1) linear-kernel svm: 5-fold cross-validated search over the cost parameter C
set.seed(42)
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
tune_lin <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "linear",
  cost = lin_grid[["cost"]],
  tunecontrol = tune.control(cross = 5)
)
lin_best <- tune_lin[["best.model"]]

# 2) rbf-kernel svm: 5-fold cross-validated search over the (C, gamma) grid
set.seed(42)
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
tune_rbf <- tune.svm(
  x = x_train_s,
  y = y_train,
  kernel = "radial",
  cost = rbf_grid_cost,
  gamma = rbf_grid_gamma,
  tunecontrol = tune.control(cross = 5)
)
rbf_best <- tune_rbf[["best.model"]]

# 3) alternative classifier: random forest on the same (unscaled) features
set.seed(42)
rf_fit <- randomForest(
  x = x_train,
  y = y_train,
  ntree = 500,
  mtry = 2,
  importance = TRUE
)
# evaluation helper
#
# Predicts with `model` and summarises the result against the true labels.
#
# Args:
#   model:    fitted model object that has a predict() method
#   x_test_s: predictors handed to predict() as its second argument
#   y_test:   true class labels, passed to confusionMatrix() as the reference
#   name:     label stored in the `model` column of the summary row
#
# Returns:
#   list(cm = <confusionMatrix result>,
#        pr = <one-row data frame: model, accuracy, precision_macro,
#              recall_macro, f1_macro>)
eval_model <- function(model, x_test_s, y_test, name) {
  pred <- predict(model, x_test_s)
  cm <- confusionMatrix(pred, y_test)

  # With three or more classes `byClass` is a matrix (one row per class); for
  # a two-class problem caret returns a plain named vector instead, on which
  # `[, "Precision"]` fails. Promote the vector to a one-row matrix so both
  # shapes share one code path; in the two-class case the "macro" values are
  # then simply the single set of statistics caret reports.
  by_class <- cm$byClass
  if (is.null(dim(by_class))) {
    by_class <- matrix(
      by_class,
      nrow = 1,
      dimnames = list(NULL, names(by_class))
    )
  }
  # unweighted mean over classes; classes whose metric is NA are skipped
  macro_avg <- function(metric) mean(by_class[, metric], na.rm = TRUE)

  pr <- data.frame(
    model = name,
    accuracy = cm$overall["Accuracy"],
    precision_macro = macro_avg("Precision"),
    recall_macro = macro_avg("Recall"),
    f1_macro = macro_avg("F1")
  )
  list(cm = cm, pr = pr)
}
# eval svm models (use scaled features)
lin_eval <- eval_model(lin_best, x_test_s, y_test, "svm_linear")
rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
# evaluate random forest (no scaling) through the same helper, so all three
# models are scored by exactly the same code
rf_eval <- eval_model(rf_fit, x_test, y_test, "random_forest")
rf_cm <- rf_eval$cm
rf_pr <- rf_eval$pr
# one row per model
perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
# the rows otherwise inherit the names "Accuracy", "Accuracy1", ... from the
# named accuracy value, which is misleading when the table is printed
rownames(perf) <- NULL
# print the selected hyperparameters and the per-model summary table
cat("best params (linear svm): C =", lin_best$cost, "\n")
cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n\n")
print(perf)
# macro-f1 comparison
f1_plot <- ggplot(perf, aes(x = model, y = f1_macro)) +
  geom_col() +
  labs(title = "macro-F1 by model (wine test set)")
# a bare ggplot object is only drawn when it is auto-printed; that does not
# happen when the script is run through source(), so print it explicitly
print(f1_plot)
# save outputs
write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
# divert console output into the report file; `finally` guarantees that the
# diversion is removed again even if one of the print calls fails, so later
# console output is not silently swallowed by the file
sink("lab5_confusion_matrices.txt")
tryCatch(
  {
    cat("=== svm linear ===\n")
    print(lin_eval$cm)
    cat("\n=== svm rbf ===\n")
    print(rbf_eval$cm)
    cat("\n=== random forest ===\n")
    print(rf_cm)
  },
  finally = sink()
)