This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
2025-10-31 17:55:13 -04:00

367 lines
13 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Return TRUE when the namespace of `pkg` can be loaded, without attaching it.
has_pkg <- function(pkg) {
  requireNamespace(pkg, quietly = TRUE)
}
# Availability flags for optional packages, checked once and reused throughout.
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")
# Optional packages degrade gracefully (warning/message and the corresponding
# plot/reader is skipped). `class` is the only hard requirement, because the
# kNN models below cannot run without knn().
if (has_ggplot2) { library(ggplot2) } else { warning("ggplot2 not available; plots will be skipped") }
if (has_GGally) { library(GGally) } else { message("GGally not available; skipping ggpairs plot") }
if (has_e1071) { library(e1071) }
if (has_class) { library(class) } else { stop("class package not available for kNN") }
if (!has_psych) { message("psych not available; skipping pairs.panels plot") }
if (has_readr) { library(readr) }
library(grid) # unit() for arrows in plots
# Use the pre-R-3.6.0 sampling algorithm so seeds from older R versions
# reproduce exactly; R emits a warning when this is selected, hence the
# suppressWarnings() wrapper.
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)
# wine.data (UCI): 178 rows; column 1 is the class label (1, 2, 3),
# the remaining 13 columns are continuous predictors.
possible_paths <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
# file.exists() is vectorized, so all candidate paths are tested at once;
# take the first hit, or NA_character_ when none exists.
existing_paths <- possible_paths[file.exists(possible_paths)]
data_path <- if (length(existing_paths) > 0) existing_paths[1] else NA_character_
if (is.na(data_path)) stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
# Prefer readr when available; base read.csv is the fallback. Both paths
# read a headerless CSV into `wine`.
if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}
# Assign descriptive column names (wine.data ships without a header row).
feature_names <- c(
  "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash", "Magnesium",
  "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins",
  "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- c("Type", feature_names)
# Class label as a factor for plotting and classification.
wine$Type <- as.factor(wine$Type)
# Fail fast if the wrong file was read (kept from an earlier debugging
# session; makes the script more, "robust").
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
# Exploratory plots, gated on the optional packages being installed.
if (has_psych) {
  # pairs panel (psych) colors by class
  psych::pairs.panels(
    wine[,-1],
    gap = 0,
    bg = c("red","gold","royalblue")[wine$Type],
    pch = 21,
    main = "wine (uci) scatterplot matrix by class"
  )
}
if (has_GGally && has_ggplot2) {
  # Scatterplot matrix colored by class over all 13 predictors.
  # NOTE(review): the ggpairs value is neither assigned nor print()ed — it
  # auto-prints only when this expression runs at top level (e.g. Rscript),
  # not under source(); confirm this is the intended behavior.
  GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine))
}
# Split into train/test BEFORE any preprocessing to avoid data leakage:
# scaling and PCA below are fitted on the training rows only.
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]   # predictors only
y_train <- wine_train$Type
X_test <- wine_test[, -1]
y_test <- wine_test$Type
# Guard: scale()/prcomp() divide by the column sd, so a zero-variance
# predictor would produce NaN columns. vapply is type-stable (sapply is not).
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
  warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# Scalar condition in if(): use short-circuiting || rather than elementwise |.
if (anyNA(X_train) || anyNA(X_test)) {
  stop("found NA values in predictors; handle missingness before PCA")
}
# Fit PCA on the training predictors only; both train and test are later
# projected with this one fit to avoid leakage.
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
# Proportion of variance explained per component (sdev^2 = eigenvalues).
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
  PC = paste0("PC", seq_along(pve_tr)),
  PVE = pve_tr,
  CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)
# Only build the ggplot objects when ggplot2 is attached: the load-time
# warning promised plots would be *skipped* when ggplot2 is missing, but the
# original unguarded ggplot() calls would have crashed the script instead.
# Downstream code already checks exists("p_scree")/exists("p_cumvar").
if (has_ggplot2) {
  # scree plot from training pca
  p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "scree plot variance explained by principal components (training pca)",
         x = "principal component", y = "proportion of variance explained") +
    theme_minimal()
  # cumulative variance plot from training pca
  p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "cumulative variance explained (training pca)",
         x = "principal component", y = "cumulative proportion of variance") +
    theme_minimal()
}
# ========================================================================================================
# Choose the number of PCs to keep: the smallest k whose cumulative variance
# reaches the threshold. Adjust pc_variance_threshold to 0.90 or 0.99 if
# you prefer a looser or stricter cut.
pc_variance_threshold <- 0.95
# match(TRUE, ...) gives the first index satisfying the condition (NA if none).
k_pcs <- match(TRUE, cumsum(pve_tr) >= pc_variance_threshold)
if (is.na(k_pcs)) k_pcs <- ncol(X_train) # fall back to keeping everything
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
# Project train/test into the training-fitted PCA space.
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# Truncated score matrices for downstream modeling.
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]
# Scores for the full dataset (visualization only).
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[,-1]))
scores_all$Type <- wine$Type
# Loadings (rotation matrix) from the training PCA.
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
# Top-5 variables by absolute loading on a given component.
top_loadings <- function(pc_col) {
  ord <- order(abs(loadings[[pc_col]]), decreasing = TRUE)
  head(loadings[ord, c("Variable", pc_col)], 5)
}
top_pc1 <- top_loadings("PC1")
top_pc2 <- top_loadings("PC2")
print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
# Convex hull per class, used to shade class regions in the PC1/PC2 scatter.
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
  pts <- df[chull(df$PC1, df$PC2), c("PC1","PC2")]
  pts$Type <- unique(df$Type)
  pts
}))
# Build the ggplot objects only when ggplot2 is attached; otherwise the
# unguarded ggplot() calls would crash despite the load-time warning that
# plots would merely be skipped. The save/pdf code below already checks
# exists("p_pc12")/exists("p_biplot").
if (has_ggplot2) {
  p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
    guides(fill = "none") +
    theme_minimal() +
    labs(title = "pc1 vs pc2 by class (projected with training pca)")
  # Scale the loading vectors so the arrows are visible on the score scale
  # (heuristic: 3x the largest absolute score on either axis).
  loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2))
  load_plot_df <- loadings
  load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
  load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
  # Biplot: scores colored by class plus black loading arrows with labels.
  p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_segment(
      data = load_plot_df,
      mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
      inherit.aes = FALSE,
      arrow = arrow(length = unit(0.02, "npc")),
      color = "black",
      alpha = 0.8
    ) +
    geom_text(
      data = load_plot_df,
      mapping = aes(x = PC1s, y = PC2s, label = Variable),
      inherit.aes = FALSE,
      hjust = 0,
      vjust = 0
    ) +
    theme_minimal() +
    labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
}
# Two models are compared below:
# 1) kNN on original variables with standardization
# 2) kNN on first 2 principal components only

# Compute per-class and macro-averaged classification metrics from a
# confusion matrix `cm` (rows = true labels, cols = predicted labels).
#
# Returns a list with two data frames:
#   per_class - class, precision, recall, f1 (one row per class)
#   summary   - accuracy, macro_precision, macro_recall, macro_f1
#
# Per-class precision/recall can be NaN (0/0) when a class is never
# predicted or never occurs; the macro averages drop those values via
# na.rm = TRUE (which also removes NaN).
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))  # fallback labels
  TP <- diag(cm)
  FP <- colSums(cm) - TP  # predicted as the class but actually another
  FN <- rowSums(cm) - TP  # actually the class but predicted as another
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  summary <- data.frame(
    accuracy = acc,
    macro_precision = mean(precision, na.rm = TRUE),
    macro_recall = mean(recall, na.rm = TRUE),
    macro_f1 = mean(f1, na.rm = TRUE)
  )
  list(per_class = per_class, summary = summary)
}
set.seed(4600)
ks <- seq(1, 15, by = 2)  # candidate neighborhood sizes (odd values)
Kfolds <- 5
# --- kNN on original variables ---------------------------------------------
# Standardize with TRAINING statistics only; the test set is transformed
# with the train-derived center/scale to avoid leakage.
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)
n_train_orig <- nrow(X_train_scaled)
# Random fold assignment for K-fold cross-validation over the training set.
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))
# vapply is type-stable (always numeric), unlike sapply.
cv_acc_orig <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_orig != f)
    va <- which(folds_orig == f)
    pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
                   test = X_train_scaled[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)  # mean CV accuracy for this k
}, numeric(1))
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")
# Refit on the full training set with the CV-chosen k; evaluate held out.
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
# --- kNN on the first 2 principal components only --------------------------
# PC scores come from the training-fitted PCA, so no extra scaling is needed.
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)
# Fresh random fold assignment (same CV scheme as the original-variables run).
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))
# vapply is type-stable (always numeric), unlike sapply.
cv_acc_2pc <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_2pc != f)
    va <- which(folds_2pc == f)
    pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
                   test = Z2_train[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)  # mean CV accuracy for this k
}, numeric(1))
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")
# Refit on the full training scores with the CV-chosen k; evaluate held out.
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
# ===========================================================================================
# Persist plots, confusion matrices, and metrics under outputs/.
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
# Save a ggplot object by name if it exists (plot objects are absent when
# ggplot2 is not installed). Namespaced ggplot2::ggsave is only evaluated
# inside the guard, so this is safe even without ggplot2 attached.
save_plot <- function(obj_name, file_name) {
  if (exists(obj_name) && inherits(get(obj_name), "ggplot")) {
    ggplot2::ggsave(filename = file.path(outputs_dir, file_name),
                    plot = get(obj_name), width = 8, height = 6, dpi = 300)
  }
}
save_plot("p_pc12", "pc12_scatter.png")
save_plot("p_biplot", "pc12_biplot.png")
save_plot("p_scree", "pca_scree.png")
save_plot("p_cumvar", "pca_cumvar.png")
# Top contributing variables to PC1 and PC2.
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)
# Confusion matrices: wide CSV plus the pretty-printed text form.
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))
# Per-class and summary metrics for both models.
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)
# Side-by-side comparison of the two models.
metrics_compare <- data.frame(
  model = c("original_variables", "first_2_pcs"),
  accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
  macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
  macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
  macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
# Regenerate the key plots into a single pdf for non-interactive runs.
# (Originally drafted with help from ChatGPT because the psych package is
# confusing.)
if (!interactive() && has_ggplot2) {
  pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
  # Ensure the pdf device is closed even if a plotting call errors;
  # otherwise a failure mid-plot would leave the device open and the file
  # truncated.
  tryCatch({
    if (has_psych) {
      psych::pairs.panels(
        wine[,-1],
        gap = 0,
        bg = c("red","gold","royalblue")[wine$Type],
        pch = 21,
        main = "wine (uci) scatterplot matrix by class"
      )
    }
    if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
    if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
  }, finally = dev.off())
}