# ---- optional dependencies ----------------------------------------------
# Probe for a package without attaching it, so missing optional packages
# degrade gracefully instead of killing the script.
has_pkg <- function(pkg) requireNamespace(pkg, quietly = TRUE)

has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")

# Attach what is available. Only `class` (which supplies knn()) is a hard
# requirement; everything else is optional plotting/IO convenience.
if (has_ggplot2) {
  library(ggplot2)
} else {
  warning("ggplot2 not available; plots will be skipped")
}
if (has_GGally) {
  library(GGally)
} else {
  message("GGally not available; skipping ggpairs plot")
}
if (has_e1071) {
  library(e1071)
}
if (has_class) {
  library(class)
} else {
  stop("class package not available for kNN")
}
if (!has_psych) {
  message("psych not available; skipping pairs.panels plot")
}
if (has_readr) {
  library(readr)
}
library(grid) # unit() for arrows in plots

# Pre-3.6.0 sampling algorithm so the fixed seed reproduces older results.
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)

# ---- data ---------------------------------------------------------------
# UCI wine data: 178 rows; col 1 is the class label (1,2,3); the other 13
# columns are continuous predictors.
possible_paths <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
data_path <- NA_character_
for (p in possible_paths) {
  if (file.exists(p)) {
    data_path <- p
    break
  }
}
if (is.na(data_path)) {
  stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
}

if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}

colnames(wine) <- c(
  "Type", "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash", "Magnesium",
  "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins",
  "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
wine$Type <- as.factor(wine$Type)

# Guard against silently reading the wrong file: the UCI wine data is
# exactly 178 rows x 14 columns.
stopifnot(
  nrow(wine) == 178,
  ncol(wine) == 14
)
print(summary(wine$Type))

# ---- exploratory plots --------------------------------------------------
if (has_psych) {
  # pairs panel (psych) – colors by class
  psych::pairs.panels(
    wine[, -1],
    gap = 0,
    bg = c("red", "gold", "royalblue")[wine$Type],
    pch = 21,
    main = "wine (uci) – scatterplot matrix by class"
  )
}
if (has_GGally && has_ggplot2) {
  # Explicit print() so the matrix renders under Rscript, not only in an
  # interactive session.
  print(GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine)))
}

# ---- train/test split ---------------------------------------------------
# Split BEFORE any preprocessing (scaling, PCA) to avoid data leakage.
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]
y_train <- wine_train$Type
X_test <- wine_test[, -1]
y_test <- wine_test$Type

# Zero-variance predictors would make scale()/prcomp() divide by zero.
# vapply() keeps the result type-stable; isTRUE() is NA-safe (var() can
# return NA for an all-NA column, and any(NA) would crash the if).
zero_var <- vapply(
  X_train,
  function(x) isTRUE(var(x, na.rm = TRUE) == 0),
  logical(1)
)
if (any(zero_var)) {
  warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# Scalar condition: short-circuit || rather than elementwise |.
if (anyNA(X_train) || anyNA(X_test)) {
  stop("found NA values in predictors; handle missingness before PCA")
}

# ---- PCA (fit on the training data only) --------------------------------
# Both train and test are later projected with this train-fitted PCA.
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
  PC = paste0("PC", seq_along(pve_tr)),
  PVE = pve_tr,
  CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)

# scree plot from training pca
p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
  labs(
    title = "scree plot – variance explained by principal components (training pca)",
    x = "principal component",
    y = "proportion of variance explained"
  ) +
  theme_minimal()

# cumulative variance plot from training pca
p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
  labs(
    title = "cumulative variance explained (training pca)",
    x = "principal component",
    y = "cumulative proportion of variance"
  ) +
  theme_minimal()

# ---- choose number of PCs -----------------------------------------------
# Smallest k whose cumulative variance reaches the threshold; change the
# threshold to 0.90 or 0.99 if preferred. Falls back to all predictors if
# the threshold is never reached (instead of crashing downstream).
pc_variance_threshold <- 0.95
k_pcs <- which(cumsum(pve_tr) >= pc_variance_threshold)[1]
if (is.na(k_pcs)) k_pcs <- ncol(X_train)
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")

# ---- project train/test into the PCA space ------------------------------
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# for downstream modeling
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]

# All rows projected with the train-fitted PCA (used for plotting only).
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[, -1]))
scores_all$Type <- wine$Type

# loadings from training pca
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
top_pc1 <- loadings[order(abs(loadings$PC1), decreasing = TRUE), c("Variable", "PC1")][1:5, ]
top_pc2 <- loadings[order(abs(loadings$PC2), decreasing = TRUE), c("Variable", "PC2")][1:5, ]
print("top contributors to pc1 (training pca):")
print(top_pc1)
print("top contributors to pc2 (training pca):")
print(top_pc2)

# ---- PC1 vs PC2 scatter with per-class convex hulls ---------------------
scores <- scores_all
# one convex-hull polygon per class
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
  pts <- df[chull(df$PC1, df$PC2), c("PC1", "PC2")]
  pts$Type <- unique(df$Type)
  pts
}))
p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
  geom_point(size = 2, alpha = 0.85) +
  geom_polygon(
    data = hull_df,
    aes(fill = Type, group = Type),
    color = NA,
    alpha = 0.15
  ) +
  guides(fill = "none") +
  theme_minimal() +
  labs(title = "pc1 vs pc2 by class (projected with training pca)")

# ---- biplot: scores plus scaled loading arrows --------------------------
# heuristic scale so loading arrows are visible on the score scale
loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2))
load_plot_df <- loadings
load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
  geom_point(size = 2, alpha = 0.85) +
  geom_segment(
    data = load_plot_df,
    mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
    inherit.aes = FALSE,
    arrow = arrow(length = unit(0.02, "npc")),
    color = "black",
    alpha = 0.8
  ) +
  geom_text(
    data = load_plot_df,
    mapping = aes(x = PC1s, y = PC2s, label = Variable),
    inherit.aes = FALSE,
    hjust = 0,
    vjust = 0
  ) +
  theme_minimal() +
  labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")

# ---- evaluation helpers -------------------------------------------------

# Per-class precision/recall/F1 plus accuracy and macro averages from a
# confusion matrix (rows = truth, cols = prediction).
# Returns list(per_class = data.frame, summary = data.frame).
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))
  TP <- diag(cm)
  FP <- colSums(cm) - TP
  FN <- rowSums(cm) - TP
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  # na.rm drops classes whose precision/recall is undefined (0 support)
  summary <- data.frame(
    accuracy = acc,
    macro_precision = mean(precision, na.rm = TRUE),
    macro_recall = mean(recall, na.rm = TRUE),
    macro_f1 = mean(f1, na.rm = TRUE)
  )
  list(per_class = per_class, summary = summary)
}

# Mean n_folds-fold CV accuracy of knn() for one candidate k, given a
# precomputed fold assignment vector. Shared by both models below (the
# original script duplicated this loop verbatim); knn() is called in the
# same order as before, so the RNG stream and results are unchanged.
cv_knn_acc <- function(X, y, folds, n_folds, k) {
  fold_acc <- vapply(seq_len(n_folds), function(f) {
    tr <- which(folds != f)
    va <- which(folds == f)
    pred_cv <- knn(
      train = X[tr, , drop = FALSE],
      test = X[va, , drop = FALSE],
      cl = y[tr],
      k = k
    )
    mean(pred_cv == y[va])
  }, numeric(1))
  mean(fold_acc)
}

set.seed(4600)
ks <- seq(1, 15, by = 2) # odd k only, to reduce voting ties
Kfolds <- 5

# ---- kNN on original variables (standardized) ---------------------------
# Scale with training-set statistics only, then apply the same transform
# to the test set (no leakage).
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)

folds_orig <- sample(rep(seq_len(Kfolds), length.out = nrow(X_train_scaled)))
cv_acc_orig <- vapply(
  ks,
  function(k) cv_knn_acc(X_train_scaled, y_train, folds_orig, Kfolds, k),
  numeric(1)
)
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")

pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)

# ---- kNN on the first 2 principal components only -----------------------
# PC scores are already on a comparable scale; no extra standardization.
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]

folds_2pc <- sample(rep(seq_len(Kfolds), length.out = nrow(Z2_train)))
cv_acc_2pc <- vapply(
  ks,
  function(k) cv_knn_acc(Z2_train, y_train, folds_2pc, Kfolds, k),
  numeric(1)
)
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")

pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)

# ---- write outputs ------------------------------------------------------
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)

# Save a plot only if it was actually built above (plots are skipped when
# optional packages are missing).
save_if_ggplot <- function(obj_name, file_name) {
  if (exists(obj_name) && inherits(get(obj_name), "ggplot")) {
    ggsave(
      filename = file.path(outputs_dir, file_name),
      plot = get(obj_name),
      width = 8, height = 6, dpi = 300
    )
  }
}
save_if_ggplot("p_pc12", "pc12_scatter.png")
save_if_ggplot("p_biplot", "pc12_biplot.png")
save_if_ggplot("p_scree", "pca_scree.png")
save_if_ggplot("p_cumvar", "pca_cumvar.png")

# top contributors/vars to PC1 and PC2
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)

# confusion matrices as wide CSV and pretty text
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))

# per-model metrics
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)

# side-by-side model comparison
metrics_compare <- data.frame(
  model = c("original_variables", "first_2_pcs"),
  accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
  macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
  macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
  macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)

# In non-interactive runs, also bundle key plots into a single PDF.
# tryCatch(finally = ...) guarantees the device is closed even if one of
# the plot calls errors (the original leaked the open pdf device).
if (!interactive() && has_ggplot2) {
  pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
  tryCatch({
    if (has_psych) {
      psych::pairs.panels(
        wine[, -1],
        gap = 0,
        bg = c("red", "gold", "royalblue")[wine$Type],
        pch = 21,
        main = "wine (uci) – scatterplot matrix by class"
      )
    }
    if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
    if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
  }, finally = dev.off())
}