added lab 5
This commit is contained in:
Binary file not shown.
+67
-50
@@ -1,22 +1,28 @@
|
|||||||
install.packages(c("e1071","caret","randomForest","ggplot2","pROC"), dependencies = TRUE)
|
install.packages(
|
||||||
|
c("e1071", "caret", "randomForest", "ggplot2", "pROC"),
|
||||||
|
repos = c("https://cloud.r-project.org/"),
|
||||||
|
dependencies = TRUE
|
||||||
|
)
|
||||||
|
|
||||||
suppressPackageStartupMessages({
|
suppressPackageStartupMessages({
|
||||||
library(e1071) # for svm/tune.svm
|
library(e1071) # for svm/tune.svm
|
||||||
library(caret) # for metrics
|
library(caret) # for metrics
|
||||||
library(randomForest) # alternative classifier
|
library(randomForest) # alternative classifier
|
||||||
library(ggplot2)
|
library(ggplot2)
|
||||||
})
|
})
|
||||||
|
|
||||||
set.seed(42)
|
set.seed(42)
|
||||||
|
|
||||||
read_wine <- function() {
|
read_wine <- function() {
|
||||||
df <- read.csv("wine.data", header = FALSE)
|
df <- read.csv("wine.data", header = FALSE)
|
||||||
colnames(df) <- c("Class",
|
colnames(df) <- c(
|
||||||
"Alcohol","Malic.acid","Ash","Alcalinity.of.ash","Magnesium",
|
"Class",
|
||||||
"Total.phenols","Flavanoids","Nonflavanoid.phenols","Proanthocyanins",
|
"Alcohol", "Malic.acid", "Ash", "Alcalinity.of.ash", "Magnesium",
|
||||||
"Color.intensity","Hue","OD280.OD315","Proline")
|
"Total.phenols", "Flavanoids", "Nonflavanoid.phenols", "Proanthocyanins",
|
||||||
df$Class <- factor(df$Class)
|
"Color.intensity", "Hue", "OD280.OD315", "Proline"
|
||||||
df
|
)
|
||||||
|
df$Class <- factor(df$Class)
|
||||||
|
df
|
||||||
}
|
}
|
||||||
|
|
||||||
df <- read_wine()
|
df <- read_wine()
|
||||||
@@ -24,56 +30,62 @@ df <- read_wine()
|
|||||||
# split into train/test
|
# split into train/test
|
||||||
idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
|
idx <- createDataPartition(df$Class, p = 0.8, list = FALSE)
|
||||||
train <- df[idx, ]
|
train <- df[idx, ]
|
||||||
test <- df[-idx, ]
|
test <- df[-idx, ]
|
||||||
|
|
||||||
# choose a subset of features based on ANOVA F-test
|
# choose a subset of features based on ANOVA F-test
|
||||||
# I picked this subset before the runs:
|
# I picked this subset before the runs:
|
||||||
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
|
# alcohol, flavanoids, color intensity, od280/od315, proline, total phenols
|
||||||
features <- c("Alcohol","Flavanoids","Color.intensity","OD280.OD315","Proline","Total.phenols")
|
features <- c("Alcohol", "Flavanoids", "Color.intensity", "OD280.OD315", "Proline", "Total.phenols")
|
||||||
x_train <- train[, features]
|
x_train <- train[, features]
|
||||||
y_train <- train$Class
|
y_train <- train$Class
|
||||||
x_test <- test[, features]
|
x_test <- test[, features]
|
||||||
y_test <- test$Class
|
y_test <- test$Class
|
||||||
|
|
||||||
# scale features
|
# scale features
|
||||||
pp <- preProcess(x_train, method = c("center","scale"))
|
pp <- preProcess(x_train, method = c("center", "scale"))
|
||||||
x_train_s <- predict(pp, x_train)
|
x_train_s <- predict(pp, x_train)
|
||||||
x_test_s <- predict(pp, x_test)
|
x_test_s <- predict(pp, x_test)
|
||||||
|
|
||||||
# 1) linear kernel svm with hyperparameter tuning (C)
|
# linear kernel svm with hyperparameter tuning (C)
|
||||||
set.seed(42)
|
set.seed(42)
|
||||||
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
|
lin_grid <- data.frame(cost = c(0.1, 1, 10, 100))
|
||||||
tune_lin <- tune.svm(x = x_train_s, y = y_train,
|
tune_lin <- tune.svm(
|
||||||
kernel = "linear",
|
x = x_train_s, y = y_train,
|
||||||
cost = lin_grid$cost,
|
kernel = "linear",
|
||||||
tunecontrol = tune.control(cross = 5))
|
cost = lin_grid$cost,
|
||||||
|
tunecontrol = tune.control(cross = 5)
|
||||||
|
)
|
||||||
lin_best <- tune_lin$best.model
|
lin_best <- tune_lin$best.model
|
||||||
|
|
||||||
# 2) rbf kernel svm with tuning (C, gamma)
|
# rbf kernel svm with tuning (C, gamma)
|
||||||
set.seed(42)
|
set.seed(42)
|
||||||
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
|
rbf_grid_cost <- c(0.1, 1, 10, 100, 1000)
|
||||||
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
|
rbf_grid_gamma <- c(0.001, 0.01, 0.1, 1)
|
||||||
tune_rbf <- tune.svm(x = x_train_s, y = y_train,
|
tune_rbf <- tune.svm(
|
||||||
kernel = "radial",
|
x = x_train_s, y = y_train,
|
||||||
cost = rbf_grid_cost,
|
kernel = "radial",
|
||||||
gamma = rbf_grid_gamma,
|
cost = rbf_grid_cost,
|
||||||
tunecontrol = tune.control(cross = 5))
|
gamma = rbf_grid_gamma,
|
||||||
|
tunecontrol = tune.control(cross = 5)
|
||||||
|
)
|
||||||
rbf_best <- tune_rbf$best.model
|
rbf_best <- tune_rbf$best.model
|
||||||
|
|
||||||
# 3) alternative classifier: random forest (same features)
|
# alt classifier: random forest (same features)
|
||||||
set.seed(42)
|
set.seed(42)
|
||||||
rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)
|
rf_fit <- randomForest(x = x_train, y = y_train, ntree = 500, mtry = 2, importance = TRUE)
|
||||||
|
|
||||||
# evaluation helper
|
# evaluation helper
|
||||||
eval_model <- function(model, x_test_s, y_test, name) {
|
eval_model <- function(model, x_test_s, y_test, name) {
|
||||||
pred <- predict(model, x_test_s)
|
pred <- predict(model, x_test_s)
|
||||||
cm <- confusionMatrix(pred, y_test)
|
cm <- confusionMatrix(pred, y_test)
|
||||||
pr <- data.frame(model = name,
|
pr <- data.frame(
|
||||||
accuracy = cm$overall["Accuracy"],
|
model = name,
|
||||||
precision_macro = mean(cm$byClass[,"Precision"], na.rm=TRUE),
|
accuracy = cm$overall["Accuracy"],
|
||||||
recall_macro = mean(cm$byClass[,"Recall"], na.rm=TRUE),
|
precision_macro = mean(cm$byClass[, "Precision"], na.rm = TRUE),
|
||||||
f1_macro = mean(cm$byClass[,"F1"], na.rm=TRUE))
|
recall_macro = mean(cm$byClass[, "Recall"], na.rm = TRUE),
|
||||||
list(cm = cm, pr = pr)
|
f1_macro = mean(cm$byClass[, "F1"], na.rm = TRUE)
|
||||||
|
)
|
||||||
|
list(cm = cm, pr = pr)
|
||||||
}
|
}
|
||||||
|
|
||||||
# eval svm models (use scaled features)
|
# eval svm models (use scaled features)
|
||||||
@@ -84,11 +96,13 @@ rbf_eval <- eval_model(rbf_best, x_test_s, y_test, "svm_rbf")
|
|||||||
rf_pred <- predict(rf_fit, x_test)
|
rf_pred <- predict(rf_fit, x_test)
|
||||||
rf_cm <- confusionMatrix(rf_pred, y_test)
|
rf_cm <- confusionMatrix(rf_pred, y_test)
|
||||||
|
|
||||||
rf_pr <- data.frame(model = "random_forest",
|
rf_pr <- data.frame(
|
||||||
accuracy = rf_cm$overall["Accuracy"],
|
model = "random_forest",
|
||||||
precision_macro = mean(rf_cm$byClass[,"Precision"], na.rm=TRUE),
|
accuracy = rf_cm$overall["Accuracy"],
|
||||||
recall_macro = mean(rf_cm$byClass[,"Recall"], na.rm=TRUE),
|
precision_macro = mean(rf_cm$byClass[, "Precision"], na.rm = TRUE),
|
||||||
f1_macro = mean(rf_cm$byClass[,"F1"], na.rm=TRUE))
|
recall_macro = mean(rf_cm$byClass[, "Recall"], na.rm = TRUE),
|
||||||
|
f1_macro = mean(rf_cm$byClass[, "F1"], na.rm = TRUE)
|
||||||
|
)
|
||||||
|
|
||||||
perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
|
perf <- rbind(lin_eval$pr, rbf_eval$pr, rf_pr)
|
||||||
|
|
||||||
@@ -98,14 +112,17 @@ cat("best params (rbf svm): C =", rbf_best$cost, " gamma =", rbf_best$gamma, "\n
|
|||||||
print(perf)
|
print(perf)
|
||||||
|
|
||||||
# macro-f1 comparison
|
# macro-f1 comparison
|
||||||
ggplot(perf, aes(x = model, y = f1_macro)) +
|
ggplot(perf, aes(x = model, y = f1_macro)) +
|
||||||
geom_col() +
|
geom_col() +
|
||||||
labs(title = "macro-F1 by model (wine test set)")
|
labs(title = "macro-F1 by model (wine test set)")
|
||||||
|
|
||||||
# save outputs
|
# save outputs
|
||||||
write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
|
write.table(perf, file = "lab5_performance_table.txt", sep = "\t", row.names = FALSE, quote = FALSE)
|
||||||
sink("lab5_confusion_matrices.txt")
|
sink("lab5_confusion_matrices.txt")
|
||||||
cat("=== svm linear ===\n"); print(lin_eval$cm)
|
cat("=== svm linear ===\n")
|
||||||
cat("\n=== svm rbf ===\n"); print(rbf_eval$cm)
|
print(lin_eval$cm)
|
||||||
cat("\n=== random forest ===\n"); print(rf_cm)
|
cat("\n=== svm rbf ===\n")
|
||||||
sink()
|
print(rbf_eval$cm)
|
||||||
|
cat("\n=== random forest ===\n")
|
||||||
|
print(rf_cm)
|
||||||
|
sink()
|
||||||
|
|||||||
@@ -0,0 +1,95 @@
|
|||||||
|
=== svm linear ===
|
||||||
|
Confusion Matrix and Statistics
|
||||||
|
|
||||||
|
Reference
|
||||||
|
Prediction 1 2 3
|
||||||
|
1 11 1 0
|
||||||
|
2 0 13 0
|
||||||
|
3 0 0 9
|
||||||
|
|
||||||
|
Overall Statistics
|
||||||
|
|
||||||
|
Accuracy : 0.9706
|
||||||
|
95% CI : (0.8467, 0.9993)
|
||||||
|
No Information Rate : 0.4118
|
||||||
|
P-Value [Acc > NIR] : 3.92e-12
|
||||||
|
|
||||||
|
Kappa : 0.9553
|
||||||
|
|
||||||
|
Mcnemar's Test P-Value : NA
|
||||||
|
|
||||||
|
Statistics by Class:
|
||||||
|
|
||||||
|
Class: 1 Class: 2 Class: 3
|
||||||
|
Sensitivity 1.0000 0.9286 1.0000
|
||||||
|
Specificity 0.9565 1.0000 1.0000
|
||||||
|
Pos Pred Value 0.9167 1.0000 1.0000
|
||||||
|
Neg Pred Value 1.0000 0.9524 1.0000
|
||||||
|
Prevalence 0.3235 0.4118 0.2647
|
||||||
|
Detection Rate 0.3235 0.3824 0.2647
|
||||||
|
Detection Prevalence 0.3529 0.3824 0.2647
|
||||||
|
Balanced Accuracy 0.9783 0.9643 1.0000
|
||||||
|
|
||||||
|
=== svm rbf ===
|
||||||
|
Confusion Matrix and Statistics
|
||||||
|
|
||||||
|
Reference
|
||||||
|
Prediction 1 2 3
|
||||||
|
1 11 1 0
|
||||||
|
2 0 13 0
|
||||||
|
3 0 0 9
|
||||||
|
|
||||||
|
Overall Statistics
|
||||||
|
|
||||||
|
Accuracy : 0.9706
|
||||||
|
95% CI : (0.8467, 0.9993)
|
||||||
|
No Information Rate : 0.4118
|
||||||
|
P-Value [Acc > NIR] : 3.92e-12
|
||||||
|
|
||||||
|
Kappa : 0.9553
|
||||||
|
|
||||||
|
Mcnemar's Test P-Value : NA
|
||||||
|
|
||||||
|
Statistics by Class:
|
||||||
|
|
||||||
|
Class: 1 Class: 2 Class: 3
|
||||||
|
Sensitivity 1.0000 0.9286 1.0000
|
||||||
|
Specificity 0.9565 1.0000 1.0000
|
||||||
|
Pos Pred Value 0.9167 1.0000 1.0000
|
||||||
|
Neg Pred Value 1.0000 0.9524 1.0000
|
||||||
|
Prevalence 0.3235 0.4118 0.2647
|
||||||
|
Detection Rate 0.3235 0.3824 0.2647
|
||||||
|
Detection Prevalence 0.3529 0.3824 0.2647
|
||||||
|
Balanced Accuracy 0.9783 0.9643 1.0000
|
||||||
|
|
||||||
|
=== random forest ===
|
||||||
|
Confusion Matrix and Statistics
|
||||||
|
|
||||||
|
Reference
|
||||||
|
Prediction 1 2 3
|
||||||
|
1 11 1 0
|
||||||
|
2 0 13 0
|
||||||
|
3 0 0 9
|
||||||
|
|
||||||
|
Overall Statistics
|
||||||
|
|
||||||
|
Accuracy : 0.9706
|
||||||
|
95% CI : (0.8467, 0.9993)
|
||||||
|
No Information Rate : 0.4118
|
||||||
|
P-Value [Acc > NIR] : 3.92e-12
|
||||||
|
|
||||||
|
Kappa : 0.9553
|
||||||
|
|
||||||
|
Mcnemar's Test P-Value : NA
|
||||||
|
|
||||||
|
Statistics by Class:
|
||||||
|
|
||||||
|
Class: 1 Class: 2 Class: 3
|
||||||
|
Sensitivity 1.0000 0.9286 1.0000
|
||||||
|
Specificity 0.9565 1.0000 1.0000
|
||||||
|
Pos Pred Value 0.9167 1.0000 1.0000
|
||||||
|
Neg Pred Value 1.0000 0.9524 1.0000
|
||||||
|
Prevalence 0.3235 0.4118 0.2647
|
||||||
|
Detection Rate 0.3235 0.3824 0.2647
|
||||||
|
Detection Prevalence 0.3529 0.3824 0.2647
|
||||||
|
Balanced Accuracy 0.9783 0.9643 1.0000
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
model accuracy precision_macro recall_macro f1_macro
|
||||||
|
svm_linear 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
|
||||||
|
svm_rbf 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
|
||||||
|
random_forest 0.970588235294118 0.972222222222222 0.976190476190476 0.973161567364466
|
||||||
Reference in New Issue
Block a user