This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files

367 lines
13 KiB
R
Raw Permalink Normal View History

2025-10-31 17:55:13 -04:00
# Check whether a package is installed, without attaching it.
#
# @param pkg Character scalar: package name.
# @return TRUE if the package namespace can be loaded, FALSE otherwise.
has_pkg <- function(pkg) {
  isTRUE(requireNamespace(pkg, quietly = TRUE))
}
# Record which optional packages are installed so later sections can
# degrade gracefully instead of erroring.
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")
# NOTE(review): psych is deliberately never attached via library(); it is
# only ever called with psych:: below. e1071 is attached but not visibly
# used anywhere in this file — confirm before removing.
# Attach the optional packages; emit a warning/message when a plotting
# package is absent so the user knows why plots are missing.
if (has_ggplot2) { library(ggplot2) } else { warning("ggplot2 not available; plots will be skipped") }
if (has_GGally) { library(GGally) } else { message("GGally not available; skipping ggpairs plot") }
if (has_e1071) { library(e1071) }
# class provides knn(), which is required — fail fast if it is missing.
if (has_class) { library(class) } else { stop("class package not available for kNN") }
if (!has_psych) { message("psych not available; skipping pairs.panels plot") }
if (has_readr) { library(readr) }
library(grid) # unit() for arrows in plots
# Use the legacy "Rounding" sample() algorithm so sampling reproduces
# across R versions; suppressWarnings() hides the warning newer R emits
# for this non-default sampler.
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)
# Locate and read the UCI wine data set.
# 178 rows; column 1 is the class label (1, 2, 3), the remaining 13
# columns are continuous predictors.
possible_paths <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
# Vectorized existence check; take the first candidate that exists
# (replaces a manual loop with a logical-NA sentinel).
found_paths <- possible_paths[file.exists(possible_paths)]
if (length(found_paths) == 0) {
  stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
}
data_path <- found_paths[1]
# Prefer readr when available; otherwise fall back to base read.csv.
if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}
# Assign descriptive column names (UCI wine attribute order) and make the
# class label a factor for classification.
wine_cols <- c(
  "Type", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash",
  "Magnesium", "Total_phenols", "Flavanoids", "Nonflavanoid_phenols",
  "Proanthocyanins", "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- wine_cols
wine$Type <- as.factor(wine$Type)
# Sanity check kept from an earlier debugging session (wrong file read
# repeatedly); it also guards against truncated downloads.
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
# Exploratory plots (each gated on its optional package).
if (has_psych) {
  # Scatterplot matrix colored by class (psych::pairs.panels).
  psych::pairs.panels(
    wine[, -1],
    gap = 0,
    bg = c("red", "gold", "royalblue")[wine$Type],
    pch = 21,
    main = "wine (uci) scatterplot matrix by class"
  )
}
if (has_GGally && has_ggplot2) {
  # Pairwise plot matrix; printed explicitly rather than relying on
  # top-level auto-printing.
  pair_matrix <- GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine))
  print(pair_matrix)
}
# Split into train/test BEFORE any preprocessing to avoid data leakage.
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]
y_train <- wine_train$Type
X_test <- wine_test[, -1]
y_test <- wine_test$Type
# Guard: zero-variance predictors would break scale()/prcomp(scale. = TRUE).
# vapply pins the result to logical(1) per column (type-stable vs sapply).
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
  warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# Scalar short-circuit || (not the vectorized |) for the if() condition.
if (anyNA(X_train) || anyNA(X_test)) {
  stop("found NA values in predictors; handle missingness before PCA")
}
# Fit PCA on the TRAINING predictors only; the test set is projected later
# with this same fit to avoid leakage.
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
# Proportion of variance explained per component.
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
  PC = paste0("PC", seq_along(pve_tr)),
  PVE = pve_tr,
  CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)
# Only build the ggplot objects when ggplot2 is actually attached —
# otherwise the load-time warning promised "plots will be skipped" but the
# script crashed here. Later ggsave/print calls already check exists().
if (has_ggplot2) {
  # Scree plot from the training PCA.
  p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "scree plot variance explained by principal components (training pca)",
         x = "principal component", y = "proportion of variance explained") +
    theme_minimal()
  # Cumulative variance plot from the training PCA.
  p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "cumulative variance explained (training pca)",
         x = "principal component", y = "cumulative proportion of variance") +
    theme_minimal()
}
# Choose number of PCs ----
# Smallest k whose cumulative variance meets the threshold; change
# pc_variance_threshold to 0.90 or 0.99 if preferred.
pc_variance_threshold <- 0.95
meets_threshold <- which(cumsum(pve_tr) >= pc_variance_threshold)
# Fall back to all components if no prefix reaches the threshold.
k_pcs <- if (length(meets_threshold) > 0) meets_threshold[1] else ncol(X_train)
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
# Project train/test into the training PCA space.
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# Truncated score matrices for downstream modeling.
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]
# Scores for ALL rows (used for plotting only, not modeling).
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[, -1]))
scores_all$Type <- wine$Type
# Loadings (rotation matrix) from the training PCA; top-5 contributors
# per component by absolute loading.
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
top_pc1 <- loadings[order(abs(loadings$PC1), decreasing = TRUE), c("Variable", "PC1")][1:5, ]
top_pc2 <- loadings[order(abs(loadings$PC2), decreasing = TRUE), c("Variable", "PC2")][1:5, ]
print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
# Convex hull polygon per class in PC1/PC2 space (for shading class regions).
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
  pts <- df[chull(df$PC1, df$PC2), c("PC1", "PC2")]
  pts$Type <- unique(df$Type)
  pts
}))
# Guard plot construction on ggplot2 availability — previously these calls
# ran unconditionally and crashed when the package was absent, despite the
# load-time "plots will be skipped" warning. Later save/print sites check
# exists("p_pc12") etc., so skipping creation here is safe.
if (has_ggplot2) {
  # PC1 vs PC2 scatter with per-class hulls.
  p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
    guides(fill = "none") +
    theme_minimal() +
    labs(title = "pc1 vs pc2 by class (projected with training pca)")
  # Biplot: scale the loadings so the arrows are visible at score scale
  # (heuristic factor of 3x the largest score magnitude).
  loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2))
  load_plot_df <- loadings
  load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
  load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
  p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_segment(
      data = load_plot_df,
      mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
      inherit.aes = FALSE,
      arrow = arrow(length = unit(0.02, "npc")),
      color = "black",
      alpha = 0.8
    ) +
    geom_text(
      data = load_plot_df,
      mapping = aes(x = PC1s, y = PC2s, label = Variable),
      inherit.aes = FALSE,
      hjust = 0,
      vjust = 0
    ) +
    theme_minimal() +
    labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
}
# 1) kNN on original variables with standardization
# 2) kNN on first 2 principal components only
# Compute per-class and macro-averaged classification metrics from a
# confusion matrix.
#
# @param cm Square confusion matrix (rows = true class, cols = predicted),
#   e.g. the result of table(truth, pred).
# @return List with `per_class` (data frame: class, precision, recall, f1)
#   and `summary` (data frame: accuracy, macro_precision, macro_recall,
#   macro_f1). Classes with no predictions produce NaN precision/f1; the
#   macro averages drop those via na.rm = TRUE.
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))
  TP <- diag(cm)
  FP <- colSums(cm) - TP  # predicted as this class but actually another
  FN <- rowSums(cm) - TP  # actually this class but predicted another
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  # Named `overall` to avoid shadowing base::summary inside this scope;
  # the returned list key stays "summary" for callers.
  overall <- data.frame(
    accuracy = acc,
    macro_precision = mean(precision, na.rm = TRUE),
    macro_recall = mean(recall, na.rm = TRUE),
    macro_f1 = mean(f1, na.rm = TRUE)
  )
  list(per_class = per_class, summary = overall)
}
# Hyperparameter grid and CV setup shared by both kNN models.
set.seed(4600)
ks <- seq(1, 15, by = 2)  # odd values of k only
Kfolds <- 5
# kNN on the original variables ----
# Standardize with TRAINING statistics only, then apply the same centers
# and scales to the test set (no leakage).
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)
n_train_orig <- nrow(X_train_scaled)
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))
# Mean CV accuracy per candidate k; vapply pins the return type to
# numeric(1) (sapply's type depends on its input).
cv_acc_orig <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_orig != f)
    va <- which(folds_orig == f)
    pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
                   test = X_train_scaled[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)
}, numeric(1))
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")
# Refit on the full training set with the selected k; evaluate held out.
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
# kNN on the first 2 principal components only ----
# Same grid / fold scheme as the original-variable model for a fair
# comparison.
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))
# vapply for type-stable CV accuracies (matches the original-vars section).
cv_acc_2pc <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_2pc != f)
    va <- which(folds_2pc == f)
    pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
                   test = Z2_train[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)
}, numeric(1))
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
# Persist outputs ----
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
# Plots: ggsave() itself is a ggplot2 function, so guard on the package
# being attached in addition to each plot object existing.
if (has_ggplot2) {
  if (exists("p_pc12") && inherits(p_pc12, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_scatter.png"), plot = p_pc12, width = 8, height = 6, dpi = 300)
  if (exists("p_biplot") && inherits(p_biplot, "ggplot")) ggsave(filename = file.path(outputs_dir, "pc12_biplot.png"), plot = p_biplot, width = 8, height = 6, dpi = 300)
  if (exists("p_scree") && inherits(p_scree, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_scree.png"), plot = p_scree, width = 8, height = 6, dpi = 300)
  if (exists("p_cumvar") && inherits(p_cumvar, "ggplot")) ggsave(filename = file.path(outputs_dir, "pca_cumvar.png"), plot = p_cumvar, width = 8, height = 6, dpi = 300)
}
# Top contributors to PC1 and PC2.
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)
# Confusion matrices: wide CSV plus a human-readable text rendering.
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))
# Per-class and summary metrics for both models.
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)
# Side-by-side model comparison.
metrics_compare <- data.frame(
  model = c("original_variables", "first_2_pcs"),
  accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
  macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
  macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
  macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
# Dump key plots to a PDF when running non-interactively (e.g. via Rscript).
# NOTE(review): the gate requires ggplot2 even though pairs.panels only
# needs psych — kept as-is to preserve current behavior; confirm intent.
if (!interactive() && has_ggplot2) {
  pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
  # Close the device even if a plotting call errors, so the graphics
  # device is never leaked and the PDF file is finalized.
  tryCatch({
    if (has_psych) {
      psych::pairs.panels(
        wine[, -1],
        gap = 0,
        bg = c("red", "gold", "royalblue")[wine$Type],
        pch = 21,
        main = "wine (uci) scatterplot matrix by class"
      )
    }
    if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
    if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
  }, finally = dev.off())
}