This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
2025-10-31 17:55:13 -04:00

367 lines
13 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Return TRUE when the namespace of `pkg` can be loaded, without attaching it.
has_pkg <- function(pkg) {
  requireNamespace(pkg, quietly = TRUE)
}
# Availability flags for optional packages, checked once and reused throughout.
has_ggplot2 <- has_pkg("ggplot2")
has_GGally <- has_pkg("GGally")
has_e1071 <- has_pkg("e1071")
has_class <- has_pkg("class")
has_psych <- has_pkg("psych")
has_readr <- has_pkg("readr")
# Optional packages degrade gracefully (warning/message and the corresponding
# plot/reader is skipped). `class` is the only hard requirement, because the
# kNN models below cannot run without knn().
if (has_ggplot2) { library(ggplot2) } else { warning("ggplot2 not available; plots will be skipped") }
if (has_GGally) { library(GGally) } else { message("GGally not available; skipping ggpairs plot") }
if (has_e1071) { library(e1071) }
if (has_class) { library(class) } else { stop("class package not available for kNN") }
if (!has_psych) { message("psych not available; skipping pairs.panels plot") }
if (has_readr) { library(readr) }
library(grid) # unit() for arrows in plots
# Use the pre-R-3.6.0 sampling algorithm so seeds from older R versions
# reproduce exactly; R emits a warning when this is selected, hence the
# suppressWarnings() wrapper.
suppressWarnings(RNGkind(sample.kind = "Rounding"))
# set a reproducible seed
set.seed(4600)
# wine.data (UCI): 178 rows; column 1 is the class label (1, 2, 3),
# the remaining 13 columns are continuous predictors.
possible_paths <- c(
  "wine.data",
  "./wine.data",
  "../wine.data",
  "DAN/wine.data",
  "./DAN/wine.data"
)
# file.exists() is vectorized, so all candidate paths are tested at once;
# take the first hit, or NA_character_ when none exists.
existing_paths <- possible_paths[file.exists(possible_paths)]
data_path <- if (length(existing_paths) > 0) existing_paths[1] else NA_character_
if (is.na(data_path)) stop("could not find wine.data; place this script in the DAN folder or given/ and re-run")
# Prefer readr when available; base read.csv is the fallback. Both paths
# read a headerless CSV into `wine`.
if (has_readr) {
  wine <- readr::read_csv(
    file = data_path,
    col_names = FALSE,
    show_col_types = FALSE,
    progress = FALSE
  )
} else {
  wine <- read.csv(file = data_path, header = FALSE)
}
# Assign descriptive column names (wine.data ships without a header row).
feature_names <- c(
  "Alcohol", "Malic_acid", "Ash", "Alcalinity_of_ash", "Magnesium",
  "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins",
  "Color_intensity", "Hue", "OD280_OD315", "Proline"
)
colnames(wine) <- c("Type", feature_names)
# Class label as a factor for plotting and classification.
wine$Type <- as.factor(wine$Type)
# Fail fast if the wrong file was read (kept from an earlier debugging
# session; makes the script more, "robust").
stopifnot(nrow(wine) == 178, ncol(wine) == 14)
print(summary(wine$Type))
# Exploratory plots, gated on the optional packages being installed.
if (has_psych) {
  # pairs panel (psych) colors by class
  psych::pairs.panels(
    wine[,-1],
    gap = 0,
    bg = c("red","gold","royalblue")[wine$Type],
    pch = 21,
    main = "wine (uci) scatterplot matrix by class"
  )
}
if (has_GGally && has_ggplot2) {
  # Scatterplot matrix colored by class over all 13 predictors.
  # NOTE(review): the ggpairs value is neither assigned nor print()ed — it
  # auto-prints only when this expression runs at top level (e.g. Rscript),
  # not under source(); confirm this is the intended behavior.
  GGally::ggpairs(wine, ggplot2::aes(colour = Type), columns = 2:ncol(wine))
}
# Split into train/test BEFORE any preprocessing to avoid data leakage:
# scaling and PCA below are fitted on the training rows only.
set.seed(4600)
n <- nrow(wine)
train_idx <- sample.int(n, size = floor(0.7 * n))
wine_train <- wine[train_idx, , drop = FALSE]
wine_test <- wine[-train_idx, , drop = FALSE]
X_train <- wine_train[, -1]   # predictors only
y_train <- wine_train$Type
X_test <- wine_test[, -1]
y_test <- wine_test$Type
# Guard: scale()/prcomp() divide by the column sd, so a zero-variance
# predictor would produce NaN columns. vapply is type-stable (sapply is not).
zero_var <- vapply(X_train, function(x) var(x, na.rm = TRUE) == 0, logical(1))
if (any(zero_var)) {
  warning("one or more predictors have zero variance in the training set; scale() would fail")
}
# Scalar condition in if(): use short-circuiting || rather than elementwise |.
if (anyNA(X_train) || anyNA(X_test)) {
  stop("found NA values in predictors; handle missingness before PCA")
}
# Fit PCA on the training predictors only; both train and test are later
# projected with this one fit to avoid leakage.
pca_tr <- prcomp(X_train, center = TRUE, scale. = TRUE)
# Proportion of variance explained per component (sdev^2 = eigenvalues).
pve_tr <- (pca_tr$sdev^2) / sum(pca_tr$sdev^2)
pve_df <- data.frame(
  PC = paste0("PC", seq_along(pve_tr)),
  PVE = pve_tr,
  CumPVE = cumsum(pve_tr)
)
print("variance explained (training pca):")
print(pve_df)
# Only build the ggplot objects when ggplot2 is attached: the load-time
# warning promised plots would be *skipped* when ggplot2 is missing, but the
# original unguarded ggplot() calls would have crashed the script instead.
# Downstream code already checks exists("p_scree")/exists("p_cumvar").
if (has_ggplot2) {
  # scree plot from training pca
  p_scree <- ggplot(pve_df, aes(x = seq_along(PVE), y = PVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "scree plot variance explained by principal components (training pca)",
         x = "principal component", y = "proportion of variance explained") +
    theme_minimal()
  # cumulative variance plot from training pca
  p_cumvar <- ggplot(pve_df, aes(x = seq_along(CumPVE), y = CumPVE)) +
    geom_line() + geom_point() +
    scale_x_continuous(breaks = seq_along(pve_df$PC), labels = pve_df$PC) +
    labs(title = "cumulative variance explained (training pca)",
         x = "principal component", y = "cumulative proportion of variance") +
    theme_minimal()
}
# ========================================================================================================
# Choose the number of PCs to keep: the smallest k whose cumulative variance
# reaches the threshold. Adjust pc_variance_threshold to 0.90 or 0.99 if
# you prefer a looser or stricter cut.
pc_variance_threshold <- 0.95
# match(TRUE, ...) gives the first index satisfying the condition (NA if none).
k_pcs <- match(TRUE, cumsum(pve_tr) >= pc_variance_threshold)
if (is.na(k_pcs)) k_pcs <- ncol(X_train) # fall back to keeping everything
cat("chosen number of pcs (threshold =", pc_variance_threshold, "):", k_pcs, "\n")
# Project train/test into the training-fitted PCA space.
Z_train_full <- as.data.frame(predict(pca_tr, newdata = X_train))
Z_test_full <- as.data.frame(predict(pca_tr, newdata = X_test))
# Truncated score matrices for downstream modeling.
Z_train <- Z_train_full[, seq_len(k_pcs), drop = FALSE]
Z_test <- Z_test_full[, seq_len(k_pcs), drop = FALSE]
# Scores for the full dataset (visualization only).
scores_all <- as.data.frame(predict(pca_tr, newdata = wine[,-1]))
scores_all$Type <- wine$Type
# Loadings (rotation matrix) from the training PCA.
loadings <- as.data.frame(pca_tr$rotation)
loadings$Variable <- rownames(loadings)
# Top-5 variables by absolute loading on a given component.
top_loadings <- function(pc_col) {
  ord <- order(abs(loadings[[pc_col]]), decreasing = TRUE)
  head(loadings[ord, c("Variable", pc_col)], 5)
}
top_pc1 <- top_loadings("PC1")
top_pc2 <- top_loadings("PC2")
print("top contributors to pc1 (training pca):"); print(top_pc1)
print("top contributors to pc2 (training pca):"); print(top_pc2)
# Convex hull per class, used to shade class regions in the PC1/PC2 scatter.
scores <- scores_all
hull_df <- do.call(rbind, lapply(split(scores, scores$Type), function(df) {
  pts <- df[chull(df$PC1, df$PC2), c("PC1","PC2")]
  pts$Type <- unique(df$Type)
  pts
}))
# Build the ggplot objects only when ggplot2 is attached; otherwise the
# unguarded ggplot() calls would crash despite the load-time warning that
# plots would merely be skipped. The save/pdf code below already checks
# exists("p_pc12")/exists("p_biplot").
if (has_ggplot2) {
  p_pc12 <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_polygon(data = hull_df, aes(fill = Type, group = Type), color = NA, alpha = 0.15) +
    guides(fill = "none") +
    theme_minimal() +
    labs(title = "pc1 vs pc2 by class (projected with training pca)")
  # Scale the loading vectors so the arrows are visible on the score scale
  # (heuristic: 3x the largest absolute score on either axis).
  loading_scalefactor <- 3 * max(abs(scores$PC1), abs(scores$PC2))
  load_plot_df <- loadings
  load_plot_df$PC1s <- load_plot_df$PC1 * loading_scalefactor
  load_plot_df$PC2s <- load_plot_df$PC2 * loading_scalefactor
  # Biplot: scores colored by class plus black loading arrows with labels.
  p_biplot <- ggplot(scores, aes(PC1, PC2, color = Type)) +
    geom_point(size = 2, alpha = 0.85) +
    geom_segment(
      data = load_plot_df,
      mapping = aes(x = 0, y = 0, xend = PC1s, yend = PC2s),
      inherit.aes = FALSE,
      arrow = arrow(length = unit(0.02, "npc")),
      color = "black",
      alpha = 0.8
    ) +
    geom_text(
      data = load_plot_df,
      mapping = aes(x = PC1s, y = PC2s, label = Variable),
      inherit.aes = FALSE,
      hjust = 0,
      vjust = 0
    ) +
    theme_minimal() +
    labs(title = "pc1 vs pc2 with variable loadings (training pca projection)")
}
# Two models are compared below:
# 1) kNN on original variables with standardization
# 2) kNN on first 2 principal components only

# Compute per-class and macro-averaged classification metrics from a
# confusion matrix `cm` (rows = true labels, cols = predicted labels).
#
# Returns a list with two data frames:
#   per_class - class, precision, recall, f1 (one row per class)
#   summary   - accuracy, macro_precision, macro_recall, macro_f1
#
# Per-class precision/recall can be NaN (0/0) when a class is never
# predicted or never occurs; the macro averages drop those values via
# na.rm = TRUE (which also removes NaN).
compute_metrics <- function(cm) {
  lv <- rownames(cm)
  if (is.null(lv)) lv <- as.character(seq_len(nrow(cm)))  # fallback labels
  TP <- diag(cm)
  FP <- colSums(cm) - TP  # predicted as the class but actually another
  FN <- rowSums(cm) - TP  # actually the class but predicted as another
  precision <- TP / (TP + FP)
  recall <- TP / (TP + FN)
  f1 <- 2 * precision * recall / (precision + recall)
  acc <- sum(TP) / sum(cm)
  per_class <- data.frame(
    class = lv,
    precision = precision,
    recall = recall,
    f1 = f1,
    row.names = NULL
  )
  summary <- data.frame(
    accuracy = acc,
    macro_precision = mean(precision, na.rm = TRUE),
    macro_recall = mean(recall, na.rm = TRUE),
    macro_f1 = mean(f1, na.rm = TRUE)
  )
  list(per_class = per_class, summary = summary)
}
set.seed(4600)
ks <- seq(1, 15, by = 2)  # candidate neighborhood sizes (odd values)
Kfolds <- 5
# --- kNN on original variables ---------------------------------------------
# Standardize with TRAINING statistics only; the test set is transformed
# with the train-derived center/scale to avoid leakage.
X_train_scaled <- scale(X_train, center = TRUE, scale = TRUE)
scale_center <- attr(X_train_scaled, "scaled:center")
scale_scale <- attr(X_train_scaled, "scaled:scale")
X_test_scaled <- scale(X_test, center = scale_center, scale = scale_scale)
n_train_orig <- nrow(X_train_scaled)
# Random fold assignment for K-fold cross-validation over the training set.
folds_orig <- sample(rep(1:Kfolds, length.out = n_train_orig))
# vapply is type-stable (always numeric), unlike sapply.
cv_acc_orig <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_orig != f)
    va <- which(folds_orig == f)
    pred_cv <- knn(train = X_train_scaled[tr, , drop = FALSE],
                   test = X_train_scaled[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)  # mean CV accuracy for this k
}, numeric(1))
best_k_orig <- ks[which.max(cv_acc_orig)]
cat("[Original vars] best k:", best_k_orig, "cv acc:", max(cv_acc_orig), "\n")
# Refit on the full training set with the CV-chosen k; evaluate held out.
pred_orig <- knn(train = X_train_scaled, test = X_test_scaled, cl = y_train, k = best_k_orig)
acc_orig <- mean(pred_orig == y_test)
cm_orig <- table(truth = y_test, pred = pred_orig)
cat("[Original vars] held-out accuracy:", round(acc_orig, 4), "\n")
print(cm_orig)
metrics_orig <- compute_metrics(cm_orig)
print(metrics_orig$summary)
print(metrics_orig$per_class)
# --- kNN on the first 2 principal components only --------------------------
# PC scores come from the training-fitted PCA, so no extra scaling is needed.
Z2_train <- Z_train_full[, 1:2, drop = FALSE]
Z2_test <- Z_test_full[, 1:2, drop = FALSE]
n_train_2pc <- nrow(Z2_train)
# Fresh random fold assignment (same CV scheme as the original-variables run).
folds_2pc <- sample(rep(1:Kfolds, length.out = n_train_2pc))
# vapply is type-stable (always numeric), unlike sapply.
cv_acc_2pc <- vapply(ks, function(k) {
  fold_acc <- vapply(seq_len(Kfolds), function(f) {
    tr <- which(folds_2pc != f)
    va <- which(folds_2pc == f)
    pred_cv <- knn(train = Z2_train[tr, , drop = FALSE],
                   test = Z2_train[va, , drop = FALSE],
                   cl = y_train[tr], k = k)
    mean(pred_cv == y_train[va])
  }, numeric(1))
  mean(fold_acc)  # mean CV accuracy for this k
}, numeric(1))
best_k_2pc <- ks[which.max(cv_acc_2pc)]
cat("[First 2 PCs] best k:", best_k_2pc, "cv acc:", max(cv_acc_2pc), "\n")
# Refit on the full training scores with the CV-chosen k; evaluate held out.
pred_2pc <- knn(train = Z2_train, test = Z2_test, cl = y_train, k = best_k_2pc)
acc_2pc <- mean(pred_2pc == y_test)
cm_2pc <- table(truth = y_test, pred = pred_2pc)
cat("[First 2 PCs] held-out accuracy:", round(acc_2pc, 4), "\n")
print(cm_2pc)
metrics_2pc <- compute_metrics(cm_2pc)
print(metrics_2pc$summary)
print(metrics_2pc$per_class)
# ===========================================================================================
# Persist plots, confusion matrices, and metrics under outputs/.
outputs_dir <- "outputs"
if (!dir.exists(outputs_dir)) dir.create(outputs_dir, recursive = TRUE, showWarnings = FALSE)
# Save a ggplot object by name if it exists (plot objects are absent when
# ggplot2 is not installed). Namespaced ggplot2::ggsave is only evaluated
# inside the guard, so this is safe even without ggplot2 attached.
save_plot <- function(obj_name, file_name) {
  if (exists(obj_name) && inherits(get(obj_name), "ggplot")) {
    ggplot2::ggsave(filename = file.path(outputs_dir, file_name),
                    plot = get(obj_name), width = 8, height = 6, dpi = 300)
  }
}
save_plot("p_pc12", "pc12_scatter.png")
save_plot("p_biplot", "pc12_biplot.png")
save_plot("p_scree", "pca_scree.png")
save_plot("p_cumvar", "pca_cumvar.png")
# Top contributing variables to PC1 and PC2.
write.csv(top_pc1, file = file.path(outputs_dir, "top_contributors_pc1.csv"), row.names = FALSE)
write.csv(top_pc2, file = file.path(outputs_dir, "top_contributors_pc2.csv"), row.names = FALSE)
# Confusion matrices: wide CSV plus the pretty-printed text form.
write.csv(as.matrix(cm_orig), file = file.path(outputs_dir, "confusion_original_wide.csv"))
writeLines(capture.output(cm_orig), con = file.path(outputs_dir, "confusion_original.txt"))
write.csv(as.matrix(cm_2pc), file = file.path(outputs_dir, "confusion_2pc_wide.csv"))
writeLines(capture.output(cm_2pc), con = file.path(outputs_dir, "confusion_2pc.txt"))
# Per-class and summary metrics for both models.
write.csv(metrics_orig$per_class, file = file.path(outputs_dir, "metrics_original_per_class.csv"), row.names = FALSE)
write.csv(metrics_orig$summary, file = file.path(outputs_dir, "metrics_original_summary.csv"), row.names = FALSE)
write.csv(metrics_2pc$per_class, file = file.path(outputs_dir, "metrics_2pc_per_class.csv"), row.names = FALSE)
write.csv(metrics_2pc$summary, file = file.path(outputs_dir, "metrics_2pc_summary.csv"), row.names = FALSE)
# Side-by-side comparison of the two models.
metrics_compare <- data.frame(
  model = c("original_variables", "first_2_pcs"),
  accuracy = c(metrics_orig$summary$accuracy, metrics_2pc$summary$accuracy),
  macro_precision = c(metrics_orig$summary$macro_precision, metrics_2pc$summary$macro_precision),
  macro_recall = c(metrics_orig$summary$macro_recall, metrics_2pc$summary$macro_recall),
  macro_f1 = c(metrics_orig$summary$macro_f1, metrics_2pc$summary$macro_f1)
)
write.csv(metrics_compare, file = file.path(outputs_dir, "metrics_comparison.csv"), row.names = FALSE)
# Regenerate the key plots into a single pdf for non-interactive runs.
# (Originally drafted with help from ChatGPT because the psych package is
# confusing.)
if (!interactive() && has_ggplot2) {
  pdf("Rplots_pca_fixed.pdf", width = 8, height = 6)
  # Ensure the pdf device is closed even if a plotting call errors;
  # otherwise a failure mid-plot would leave the device open and the file
  # truncated.
  tryCatch({
    if (has_psych) {
      psych::pairs.panels(
        wine[,-1],
        gap = 0,
        bg = c("red","gold","royalblue")[wine$Type],
        pch = 21,
        main = "wine (uci) scatterplot matrix by class"
      )
    }
    if (exists("p_scree") && inherits(p_scree, "ggplot")) print(p_scree)
    if (exists("p_pc12") && inherits(p_pc12, "ggplot")) print(p_pc12)
  }, finally = dev.off())
}