367 lines
13 KiB
R
367 lines
13 KiB
R
|
|
# Return TRUE when `pkg` is installed, without attaching it to the search path.
has_pkg <- function(pkg) {
  requireNamespace(pkg, quietly = TRUE)
}
|
# Optional-dependency detection. Every feature degrades gracefully when its
# package is missing, except `class`: kNN is the core of the analysis, so its
# absence is fatal.
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")

if (has_ggplot2) {
  library(ggplot2)
} else {
  warning("ggplot2 not available; plots will be skipped")
}

if (has_GGally) {
  library(GGally)
} else {
  message("GGally not available; skipping ggpairs plot")
}

if (has_e1071) {
  library(e1071)
}

if (has_class) {
  library(class)
} else {
  stop("class package not available for kNN")
}

if (!has_psych) {
  message("psych not available; skipping pairs.panels plot")
}

if (has_readr) {
  library(readr)
}

library(grid) # unit() for arrows in plots

# Match the pre-3.6.0 sampling algorithm so old seeds reproduce old results;
# suppressWarnings() silences the "non-default sample.kind" notice.
suppressWarnings(RNGkind(sample.kind = "Rounding"))

# set a reproducible seed for the whole script
set.seed(4600)
|
|
|||
|
|
# Locate wine.data (UCI Wine dataset):
#   - 178 rows
#   - col 1 is the class label (1, 2, 3)
#   - the other 13 columns are continuous predictors
possible_paths <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)

# file.exists() is vectorized over paths, so no explicit loop is needed;
# take the first candidate that exists, or NA_character_ when none do.
found_paths <- possible_paths[file.exists(possible_paths)]
data_path <- if (length(found_paths) > 0) found_paths[1] else NA_character_
if (is.na(data_path)) stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
|
|
# Read the raw data: readr when available (returns a tibble), otherwise fall
# back to base read.csv. The file ships with no header row.
wine <- if (has_readr) {
  readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  read.csv(file = data_path, header = FALSE)
}
|||
|
|
# Column names per the UCI wine.names documentation (label first, then the
# 13 continuous predictors).
wine_col_names <- c(
  "Type", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash",
  "Magnesium", "Total_phenols", "Flavanoids", "Nonflavanoid_phenols",
  "Proanthocyanins", "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- wine_col_names

# The class label is categorical, not numeric.
wine$Type <- as.factor(wine$Type)

# Guard against accidentally reading the wrong file: the UCI wine data is
# exactly 178 rows x 14 columns.
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
|||
|
|
# Exploratory plots ---------------------------------------------------------

if (has_psych) {
  # Scatterplot matrix colored by class; pairs.panels draws as a side effect.
  psych::pairs.panels(
    wine[,-1],
    gap = 0,
    bg = c("red","gold","royalblue")[wine$Type],
    pch = 21,
    main = "wine (uci) – scatterplot matrix by class"
  )
}

if (has_GGally && has_ggplot2) {
  # ggpairs returns a plot object: it must be print()ed explicitly, otherwise
  # the plot is silently discarded when this script is run via source()
  # (auto-printing only happens at an interactive top level).
  print(GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine)))
}
|||
|
|
# Split into train/test BEFORE any preprocessing to avoid information leakage:
# centering/scaling and the PCA rotation are later fit on the training rows only.
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
holdout_idx <- setdiff(seq_len(n), train_idx)

wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[holdout_idx, , drop = FALSE]

# Predictors (all columns but the label) and labels, per split.
X_train <- wine_train[, -1]
y_train <- wine_train$Type
X_test <- wine_test[, -1]
y_test <- wine_test$Type
|||
|
|
# Pre-PCA sanity checks on the predictors.

# scale() divides by the column sd, so a zero-variance column would produce
# NaNs downstream; warn early. vapply (unlike sapply) guarantees a logical
# vector even in degenerate cases.
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
  warning("one or more predictors have zero variance in the training set; scale() would fail")
}

# prcomp cannot handle missing values; fail fast with a clear message.
# `||` (not `|`) is the correct short-circuiting scalar operator inside if().
if (anyNA(X_train) || anyNA(X_test)) {
  stop("found NA values in predictors; handle missingness before PCA")
}
|||
|
|
# Fit PCA on the training predictors only (centered, unit variance); the same
# rotation is applied to the test set in a later step.
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)

# Proportion of variance explained per component, plus the running total.
pc_var <- pca_tr$sdev^2
pve_tr <- pc_var / sum(pc_var)
pve_df <- data.frame(
  PC = paste0("PC", seq_along(pve_tr)),
  PVE = pve_tr,
  CumPVE = cumsum(pve_tr)
)

print("variance explained (training pca):")
print(pve_df)
|||
|
|
# Variance plots from the training PCA. Guarded by has_ggplot2: the script
# promised earlier that plots are skipped (not fatal) when ggplot2 is missing,
# but unguarded ggplot() calls would error here. Downstream save/print code
# already checks exists("p_scree") etc., so skipping is safe.
if (has_ggplot2) {
  # Scree plot: per-component proportion of variance explained.
  p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "scree plot – variance explained by principal components (training pca)",
         x = "principal component", y = "proportion of variance explained") +
    theme_minimal()

  # Cumulative variance explained.
  p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "cumulative variance explained (training pca)",
         x = "principal component", y = "cumulative proportion of variance") +
    theme_minimal()
}
|||
|
|
# Choose the number of PCs: the smallest k whose cumulative variance reaches
# the threshold. Adjust pc_variance_threshold (e.g. 0.90 or 0.99) to taste.
pc_variance_threshold <- 0.95

# match(TRUE, ...) yields the first index satisfying the condition, or NA.
k_pcs <- match(TRUE, cumsum(pve_tr) >= pc_variance_threshold)
# Defensive fallback (unreachable for thresholds <= 1, since the cumulative
# sum always ends at 1): keep every component.
if (is.na(k_pcs)) {
  k_pcs <- ncol(X_train)
}
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
|||
|
|
# Project train and test predictors into the training PCA space.
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))

# Keep only the first k_pcs components for downstream modeling.
kept_pcs <- seq_len(k_pcs)
Z_train <- Z_train_full[, kept_pcs, drop = FALSE]
Z_test <- Z_test_full[, kept_pcs, drop = FALSE]

# Scores for ALL rows (train + test), labeled by class — used for plotting only.
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[,-1]))
scores_all$Type <- wine$Type
|
|||
|
|
# Loadings (rotation matrix) from the training PCA, one row per variable.
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)

# The n variables with the largest absolute weight on a given component.
top_n_loadings <- function(df, pc, n = 5) {
  ranked <- df[order(abs(df[[pc]]), decreasing = TRUE), c("Variable", pc)]
  ranked[seq_len(n), ]
}
top_pc1 <- top_n_loadings(loadings, "PC1")
top_pc2 <- top_n_loadings(loadings, "PC2")

print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
|||
|
|
|
|||
|
|
# Convex-hull outline per class in PC1/PC2 space, used to shade the score
# plot. `scores` is also read by the biplot section below, so it is computed
# unconditionally.
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
  # chull() returns the row indices of the hull vertices in drawing order.
  pts <- df[chull(df$PC1, df$PC2), c("PC1","PC2")]
  pts$Type <- unique(df$Type)
  pts
}))

# Guard the ggplot build: ggplot() is undefined when ggplot2 is missing, and
# the downstream save/print code already checks exists("p_pc12").
if (has_ggplot2) {
  p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
    guides(fill = "none") +
    theme_minimal() +
    labs(title = "pc1 vs pc2 by class (projected with training pca)")
}
|||
|
|
# Biplot: overlay variable-loading arrows on the PC1/PC2 scores. The scale
# factor stretches the (unit-norm) loadings to roughly the span of the scores.
loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2)) # heuristic
load_plot_df <- loadings
load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor

# Guard the ggplot build: ggplot() is undefined when ggplot2 is missing, and
# the downstream save/print code already checks exists("p_biplot").
if (has_ggplot2) {
  p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_segment(
      data = load_plot_df,
      mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
      inherit.aes = FALSE,
      arrow = arrow(length = unit(0.02, "npc")),
      color = "black",
      alpha = 0.8
    ) +
    geom_text(
      data = load_plot_df,
      mapping = aes(x = PC1s, y = PC2s, label = Variable),
      inherit.aes = FALSE,
      hjust = 0,
      vjust = 0
    ) +
    theme_minimal() +
    labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
}
|
|||
|
|
# Modeling plan:
#   1) kNN on the original variables (standardized)
#   2) kNN on the first 2 principal components only

# Compute classification metrics from a confusion matrix whose rows are the
# true classes and whose columns are the predicted classes.
# Returns list(per_class = <precision/recall/f1 per class>,
#              summary   = <accuracy + macro-averaged metrics, 1 row>).
compute_metrics <- function(cm) {
  class_labels <- rownames(cm)
  if (is.null(class_labels)) {
    class_labels <- as.character(seq_len(nrow(cm)))
  }

  true_pos <- diag(cm)
  false_pos <- colSums(cm) - true_pos
  false_neg <- rowSums(cm) - true_pos

  precision <- true_pos / (true_pos + false_pos)
  recall <- true_pos / (true_pos + false_neg)
  # f1 is NaN when precision + recall == 0; na.rm below drops those classes
  # from the macro averages.
  f1 <- 2 * precision * recall / (precision + recall)

  list(
    per_class = data.frame(
      class = class_labels,
      precision = precision,
      recall = recall,
      f1 = f1,
      row.names = NULL
    ),
    summary = data.frame(
      accuracy = sum(true_pos) / sum(cm),
      macro_precision = mean(precision, na.rm = TRUE),
      macro_recall = mean(recall, na.rm = TRUE),
      macro_f1 = mean(f1, na.rm = TRUE)
    )
  )
}
|||
|
|
# ---- kNN model 1: original variables, standardized ------------------------
# NOTE: class::knn breaks distance ties randomly, so the exact sequence and
# order of sample()/knn() calls below is preserved to keep results reproducible.
set.seed(4600)
ks <- seq(1, 15, by = 2) # odd k only, to reduce voting ties
Kfolds <- 5

# Standardize with TRAINING statistics only; apply the same transform to test.
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)

# Random fold assignment for 5-fold CV over the training rows.
n_train_orig <- nrow(X_train_scaled)
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))

# CV accuracy per candidate k (mean over folds). vapply pins the result type.
cv_acc_orig <- vapply(ks, function(k) {
  fold_acc <- vapply(1:Kfolds, function(f) {
    tr <- which(folds_orig != f)
    va <- which(folds_orig == f)
    pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
                   test = X_train_scaled[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)
}, numeric(1))

best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")

# Refit on the full training set with the chosen k; evaluate on held-out test.
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)

cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)

metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
|||
|
|
# ---- kNN model 2: first two principal components only ---------------------
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)

# Fresh random fold assignment for this model's CV (continues the RNG stream).
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))

# CV accuracy per candidate k in 2-PC space.
cv_acc_2pc <- vapply(ks, function(k) {
  fold_acc <- vapply(1:Kfolds, function(f) {
    tr <- which(folds_2pc != f)
    va <- which(folds_2pc == f)
    pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
                   test = Z2_train[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)
}, numeric(1))

best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")

# Final fit and held-out evaluation in 2-PC space.
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)

cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)

metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
|||
|
|
# ---- Persist outputs ------------------------------------------------------
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) {
  dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
}

# Save each ggplot that was actually built (plot objects may be absent when
# ggplot2 was unavailable).
if (exists("p_pc12") && inherits(p_pc12, "ggplot")) {
  ggsave(filename = file.path(outputs_dir, "pc12_scatter.png"), plot = p_pc12,
         width = 8, height = 6, dpi = 300)
}
if (exists("p_biplot") && inherits(p_biplot, "ggplot")) {
  ggsave(filename = file.path(outputs_dir, "pc12_biplot.png"), plot = p_biplot,
         width = 8, height = 6, dpi = 300)
}
if (exists("p_scree") && inherits(p_scree, "ggplot")) {
  ggsave(filename = file.path(outputs_dir, "pca_scree.png"), plot = p_scree,
         width = 8, height = 6, dpi = 300)
}
if (exists("p_cumvar") && inherits(p_cumvar, "ggplot")) {
  ggsave(filename = file.path(outputs_dir, "pca_cumvar.png"), plot = p_cumvar,
         width = 8, height = 6, dpi = 300)
}

# Top-loading variables for PC1 / PC2.
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)

# Confusion matrices: wide CSV plus a human-readable text rendering.
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))

# Per-class and summary metrics for both models.
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)

# Side-by-side model comparison (one row per model, metrics as columns).
metrics_compare <- rbind(
  cbind(model = "original_variables", metrics_orig$summary),
  cbind(model = "first_2_pcs", metrics_2pc$summary)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
|||
|
|
# Re-render the key plots into a PDF when running non-interactively
# (e.g. via Rscript), where no on-screen graphics device is available.
if (!interactive() && has_ggplot2) {
  pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
  # Guarantee the device is closed even if a plot fails to render; otherwise
  # a mid-plot error would leave the PDF device open (and the file corrupt).
  tryCatch({
    if (has_psych) {
      psych::pairs.panels(
        wine[,-1],
        gap = 0,
        bg = c("red","gold","royalblue")[wine$Type],
        pch = 21,
        main = "wine (uci) – scatterplot matrix by class"
      )
    }

    # ggplot objects must be print()ed explicitly inside a block.
    if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
    if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
  }, finally = dev.off())
}