# lab 3: (A) bologna
#
# Classifies abalone into age groups (young / adult / old, derived from the
# ring count) with k-NN, then clusters the same measurements with k-means and
# PAM. Plots are written to lab3_output.pdf and a short recap to
# lab3_summary.txt.
#
# I NEEDED TO INSTALL FORTRAN BS FOR THIS LMAO
# sudo pacman -S --needed base-devel gcc-fortran lapack openblas libxml2 \
#   curl openssl

# packages (install if necessary) ----
required_pkgs <- c(
  "dplyr", "ggplot2", "caret", "class", "cluster", "factoextra", "gridExtra"
)
for (p in required_pkgs) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(
      p,
      repos = "https://cloud.r-project.org",
      dependencies = TRUE
    )
  }
  library(p, character.only = TRUE)
}

# data loading ----
# path handling: prefer the uploaded path if present, otherwise look for the
# csv in the current working directory
uploaded_path <-
  "/home/ion606/Desktop/Data Analytics/Lab 3/abalone_dataset.csv"
fallback_path <- "abalone_dataset.csv"
data_path <- if (file.exists(uploaded_path)) uploaded_path else fallback_path
if (!file.exists(data_path)) {
  stop(
    "could not find the abalone dataset at '", uploaded_path,
    "' or '", fallback_path, "'",
    call. = FALSE
  )
}

# read dataset
abalone.data <- read.csv(data_path, stringsAsFactors = FALSE)

# canonicalize column names to predictable lower-case tokens
names(abalone.data) <- tolower(gsub("[[:space:]]+", ".", names(abalone.data)))

# the rings column is required; fail early and show the names that were found
if (!"rings" %in% names(abalone.data)) {
  stop(
    "could not find 'rings' column in dataset. column names found: ",
    paste(names(abalone.data), collapse = ", ")
  )
}
print(names(abalone.data))

# old code but I left it here anyways
# abalone.data$age.group[abalone.data$rings <= 8] <- "young"
# abalone.data$age.group[abalone.data$rings > 8 &
#                          abalone.data$rings <= 11] <- "adult"
# abalone.data$age.group[abalone.data$rings > 11 &
#                          abalone.data$rings <= 35] <- "old"
# abalone.data$age.group <- factor(abalone.data$age.group,
#                                  levels = c("young", "adult", "old"))

# new code: bins are [0, 8], (8, 11], (11, 35]; anything else becomes NA
abalone.data$age.group <- cut(
  abalone.data$rings,
  breaks = c(0, 8, 11, 35),
  labels = c("young", "adult", "old"),
  right = TRUE,
  include.lowest = TRUE
)

if ("sex" %in% names(abalone.data)) {
  abalone.data$sex <- as.factor(abalone.data$sex)
}

# preview
cat("dataset dims:", dim(abalone.data), "\n")
cat("columns:", paste(names(abalone.data), collapse = ", "), "\n")

expected_num_cols <- c(
  "length", "diameter", "height", "whole.weight",
  "shucked.weight", "viscera.weight", "shell.weight"
)
num_cols_present <- intersect(expected_num_cols, names(abalone.data))
if (length(num_cols_present) < 3) {
  stop(
    "expected at least three numeric measurement columns; found ",
    paste(num_cols_present, collapse = ", ")
  )
}

# knn(), kmeans() and pam() all reject missing values, so drop incomplete rows
# up front (with a warning) instead of failing later with an opaque error
needed_cols <- c(num_cols_present, "age.group")
complete_rows <- complete.cases(abalone.data[, needed_cols, drop = FALSE])
if (!all(complete_rows)) {
  warning(
    "dropping ", sum(!complete_rows),
    " row(s) with missing measurements or rings outside 0-35",
    call. = FALSE
  )
  abalone.data <- abalone.data[complete_rows, , drop = FALSE]
}

# feature subsets ----
features_full <- num_cols_present # numeric
features_small <- intersect(
  c("length", "diameter", "height"),
  names(abalone.data)
) # subset lmao
cat("using features (full):", paste(features_full, collapse = ", "), "\n")
cat("using features (small):", paste(features_small, collapse = ", "), "\n")

# data split ----
set.seed(123)
train_index <- caret::createDataPartition(
  abalone.data$age.group,
  p = 0.7,
  list = FALSE
)
train_df <- abalone.data[train_index, , drop = FALSE]
test_df <- abalone.data[-train_index, , drop = FALSE]

# helper to scale numeric features / return scaled data + the parameters used
#
# df            data frame holding the columns to scale
# feature_names character vector of column names to pull out of df
# center        optional per-column centers; when NULL the column means of df
#               are used (pass the training centers when scaling test data)
# scale         optional per-column scales; when NULL the column sds of df
#               are used, with zero sds replaced by 1 to avoid dividing by zero
#
# returns a list with `scaled` (data frame), `center` and `scale`
scale_features <- function(df, feature_names, center = NULL, scale = NULL) {
  mat <- as.data.frame(df[, feature_names, drop = FALSE])
  # compute center/scale only when not provided (for train/test separation)
  if (is.null(center)) {
    center <- vapply(mat, mean, numeric(1), na.rm = TRUE)
  }
  if (is.null(scale)) {
    scale <- vapply(mat, sd, numeric(1), na.rm = TRUE)
    # avoid zero sd
    scale[!is.na(scale) & scale == 0] <- 1
  }
  # the `scale` argument shadows the function name, so call base::scale
  # explicitly to keep the intent obvious
  scaled <- as.data.frame(base::scale(mat, center = center, scale = scale))
  list(scaled = scaled, center = center, scale = scale)
}

# scale train/test for both feature sets (test reuses the training parameters)
train_full_scaled <- scale_features(train_df, features_full)
test_full_scaled <- scale_features(
  test_df, features_full,
  center = train_full_scaled$center,
  scale = train_full_scaled$scale
)
train_small_scaled <- scale_features(train_df, features_small)
test_small_scaled <- scale_features(
  test_df, features_small,
  center = train_small_scaled$center,
  scale = train_small_scaled$scale
)

# labels for knn
train_labels <- train_df$age.group
test_labels <- test_df$age.group

# 2 kNN models (initial comparison) ----
# pick an initial k (odd)
k_init <- 5

# fit k-NN on the training matrix, predict the test matrix, and return the
# predictions (`pred`) together with the caret confusion matrix (`confmat`)
knn_predict_and_confmat <- function(train_mat, test_mat, train_labels,
                                    test_labels, k) {
  pred <- class::knn(
    train = as.matrix(train_mat),
    test = as.matrix(test_mat),
    cl = train_labels,
    k = k
  )
  cm <- caret::confusionMatrix(pred, test_labels)
  list(pred = pred, confmat = cm)
}

res_full_init <- knn_predict_and_confmat(
  train_full_scaled$scaled, test_full_scaled$scaled,
  train_labels, test_labels, k_init
)
res_small_init <- knn_predict_and_confmat(
  train_small_scaled$scaled, test_small_scaled$scaled,
  train_labels, test_labels, k_init
)

cat("\ninitial results (k =", k_init, ")\n")
cat("full-features accuracy:", res_full_init$confmat$overall["Accuracy"], "\n")
print(res_full_init$confmat$table)
cat(
  "small-features accuracy:",
  res_small_init$confmat$overall["Accuracy"], "\n"
)
print(res_small_init$confmat$table)

# choose better performing feature subset (by accuracy)
# NOTE(review): both this choice and the k tuning below are made on the test
# split, so the reported accuracy is optimistic; a validation split or
# cross-validation on the training data would avoid that
acc_full <- as.numeric(res_full_init$confmat$overall["Accuracy"])
acc_small <- as.numeric(res_small_init$confmat$overall["Accuracy"])

if (acc_full >= acc_small) {
  best_features <- features_full
  best_train_scaled <- train_full_scaled
  best_test_scaled <- test_full_scaled
  chosen_tag <- "full"
} else {
  best_features <- features_small
  best_train_scaled <- train_small_scaled
  best_test_scaled <- test_small_scaled
  chosen_tag <- "small"
}
cat(
  "\nchosen feature subset for tuning:", chosen_tag,
  "(", paste(best_features, collapse = ", "), ")\n"
)

# optimal k for best performing subset ----
k_values <- seq(1, 25, by = 2) # odd ks
accuracy_by_k <- vapply(
  k_values,
  function(k) {
    res <- knn_predict_and_confmat(
      best_train_scaled$scaled, best_test_scaled$scaled,
      train_labels, test_labels, k
    )
    as.numeric(res$confmat$overall["Accuracy"])
  },
  numeric(1)
)
names(accuracy_by_k) <- k_values

best_k_idx <- which.max(accuracy_by_k)
best_k <- k_values[best_k_idx]
cat("\naccuracy_by_k:\n")
print(round(accuracy_by_k, 4))
cat(
  "\nbest k:", best_k,
  "with accuracy", round(accuracy_by_k[best_k_idx], 4), "\n"
)

# final model with best_k
final_res <- knn_predict_and_confmat(
  best_train_scaled$scaled, best_test_scaled$scaled,
  train_labels, test_labels, best_k
)
final_knn <- final_res$pred
final_cm <- final_res$confmat
cat("\nfinal confusion matrix (best k):\n")
print(final_cm)

# per-class
print(final_cm$byClass)

# plots + clustering ----
output_pdf <- "lab3_output.pdf"
pdf(output_pdf, width = 10, height = 7)

# everything that draws lives inside tryCatch so the pdf device is always
# closed, even if a clustering step fails; ggplot objects are print()ed
# explicitly because auto-printing does not happen here or under source()
tryCatch(
  {
    # accuracy vs k
    plot(
      k_values, accuracy_by_k,
      type = "b", pch = 19,
      xlab = "k (odd)", ylab = "accuracy",
      main = paste("k-NN accuracy (chosen subset:", chosen_tag, ")")
    )
    grid()

    # Exercise 2: clustering (k-means and pam) using best feature subset
    # use scaled dataset (all observations) for clustering
    # scale using full population mean/sd
    all_scaled_res <- scale_features(abalone.data, best_features)
    all_scaled <- all_scaled_res$scaled

    # candidate cluster counts run from 2 up to 10, or 2 * sqrt(n) when the
    # data set is small
    k_max <- min(10, floor(sqrt(nrow(all_scaled)) * 2))
    k_values_clust <- 2:k_max

    # the distance matrix does not depend on k, so compute it once
    dist_all <- dist(all_scaled)

    # silhouette
    print(
      factoextra::fviz_nbclust(
        all_scaled, kmeans,
        method = "silhouette", k.max = k_max
      ) +
        ggtitle("fviz_nbclust: silhouette (kmeans)")
    )

    # show elbow
    print(
      factoextra::fviz_nbclust(
        all_scaled, kmeans,
        method = "wss", k.max = k_max
      ) +
        ggtitle("fviz_nbclust: wss (kmeans)")
    )

    # pick K by the maximum average silhouette
    avg_sil <- numeric(length(k_values_clust))
    for (i in seq_along(k_values_clust)) {
      km_tmp <- kmeans(all_scaled, centers = k_values_clust[i], nstart = 25)
      sil <- cluster::silhouette(km_tmp$cluster, dist_all)
      avg_sil[i] <- mean(sil[, "sil_width"])
    }
    best_k_clust <- k_values_clust[which.max(avg_sil)]
    cat("\navg silhouette by k (kmeans):\n")
    print(data.frame(k = k_values_clust, avg_silhouette = round(avg_sil, 4)))
    cat(
      "\nchosen best k for kmeans (max avg silhouette):",
      best_k_clust, "\n"
    )

    # run kmeans with best_k_clust and plot silhouette
    km_final <- kmeans(all_scaled, centers = best_k_clust, nstart = 25)
    sil_km <- cluster::silhouette(km_final$cluster, dist_all)
    print(
      factoextra::fviz_silhouette(sil_km) +
        ggtitle(paste0("kmeans silhouette (k=", best_k_clust, ")"))
    )

    # run pam with same range and pick best k for pam by avg silhouette
    avg_sil_pam <- numeric(length(k_values_clust))
    for (i in seq_along(k_values_clust)) {
      pam_tmp <- cluster::pam(all_scaled, k = k_values_clust[i])
      avg_sil_pam[i] <- pam_tmp$silinfo$avg.width
    }
    best_k_pam <- k_values_clust[which.max(avg_sil_pam)]
    cat("\navg silhouette by k (pam):\n")
    print(
      data.frame(k = k_values_clust, avg_silhouette = round(avg_sil_pam, 4))
    )
    cat("\nchosen best k for pam (max avg silhouette):", best_k_pam, "\n")

    # run pam for best_k_pam/show silhouette plot
    pam_final <- cluster::pam(all_scaled, k = best_k_pam)
    print(
      factoextra::fviz_silhouette(pam_final) +
        ggtitle(paste0("pam silhouette (k=", best_k_pam, ")"))
    )

    # also plot the clusters (2-d PCA scatter with cluster colors) for kmeans
    # and pam; the data is already scaled, so only centering is requested
    pca_res <- prcomp(all_scaled, center = TRUE, scale. = FALSE)
    pcs <- data.frame(pca_res$x[, 1:2])
    pcs$kmeans_cluster <- factor(km_final$cluster)
    pcs$pam_cluster <- factor(pam_final$clustering)

    # kmeans PCA plot
    print(
      ggplot(pcs, aes(x = PC1, y = PC2, color = kmeans_cluster)) +
        geom_point(alpha = 0.6) +
        ggtitle(paste0("kmeans clusters (k=", best_k_clust, ")"))
    )

    # pam PCA plot
    print(
      ggplot(pcs, aes(x = PC1, y = PC2, color = pam_cluster)) +
        geom_point(alpha = 0.6) +
        ggtitle(paste0("pam clusters (k=", best_k_pam, ")"))
    )
  },
  # kill the pdf device
  finally = dev.off()
)

# summary ----
cat("plots and clustering/kNN visuals saved to", output_pdf, "\n")
cat("final chosen k for kNN:", best_k, "\n")
cat("final chosen k for kmeans:", best_k_clust, "\n")
cat("final chosen k for pam:", best_k_pam, "\n")

# yes I am lazy thx
summary_txt <- paste0(
  "lab 3 results\n\n",
  "chosen feature subset (by initial k=", k_init, "): ", chosen_tag,
  " features: ", paste(best_features, collapse = ", "), "\n",
  "best k (k-NN tuning): ", best_k,
  " (accuracy = ", round(accuracy_by_k[best_k_idx], 4), ")\n",
  "kmeans best k (silhouette): ", best_k_clust, "\n",
  "pam best k (silhouette): ", best_k_pam, "\n"
)
writeLines(summary_txt, con = "lab3_summary.txt")