Compare commits
3 Commits
562986db3b..lab3
| Author | SHA1 | Date |
|---|---|---|
| | df21d7f281 | |
| | 6dbe45c975 | |
| | 170fee98d5 | |
@@ -0,0 +1 @@
other/
File diff suppressed because it is too large
@@ -0,0 +1,166 @@
# die
suppressPackageStartupMessages({
    library(ggplot2)
    library(dplyr)
    library(readr)
    library(broom)  # for augment/tidy
    library(scales) # for label_comma
    library(tidyr)  # for crossing
    # library(lmtest)   # Breusch-Pagan test
    # library(sandwich) # robust (heteroscedasticity-consistent) standard errors
})

setwd("/home/ion606/Desktop/Data Analytics/Lab 2")

# configuration
data_path <- "NY-House-Dataset.csv"
out_dir <- "outputs"
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

# load
raw <- read_csv(file = data_path, show_col_types = FALSE)

# coerce to numeric and drop missing/non-finite rows
df <- raw |>
    transmute(
        PRICE = as.numeric(PRICE),
        PROPERTYSQFT = as.numeric(PROPERTYSQFT),
        BEDS = as.numeric(BEDS),
        BATH = as.numeric(BATH)
    ) |>
    filter(is.finite(PRICE), is.finite(PROPERTYSQFT), is.finite(BEDS), is.finite(BATH))

# basic summaries
summary(df)
fivenum(df$PRICE, na.rm = TRUE)

# remove outliers outside the 1%/99% quantiles
quant_trim <- function(x, lo = 0.01, hi = 0.99) {
    qs <- quantile(x, probs = c(lo, hi), na.rm = TRUE, names = FALSE)
    x >= qs[1] & x <= qs[2]
}

keep <- quant_trim(df$PRICE) & quant_trim(df$PROPERTYSQFT) & quant_trim(df$BEDS) & quant_trim(df$BATH)

dat <- df[keep, , drop = FALSE] |>
    filter(PROPERTYSQFT > 0, PRICE > 0, BEDS >= 0, BATH >= 0) |>
    mutate(
        log_PRICE = log(PRICE),
        log_SQFT = log(PROPERTYSQFT)
    )
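
# (editor's sketch) quick sanity check of how many rows the 1%/99% trim and the
# positivity filters removed; assumes `df` and `dat` as defined above
cat("rows kept after trimming:", nrow(dat), "of", nrow(df), "\n")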

# helper to get the most significant non-intercept term (TODO: ASK PROF ABOUT THIS)
most_sig_term <- function(model) {
    tt <- broom::tidy(model) |>
        dplyr::filter(term != "(Intercept)") |>
        dplyr::arrange(p.value)
    if (nrow(tt) == 0) return(NA_character_)
    tt$term[1]
}

# model 1: PRICE ~ PROPERTYSQFT
m1 <- lm(PRICE ~ PROPERTYSQFT, data = dat)
cat("\n==== model 1: PRICE ~ PROPERTYSQFT ====\n")
print(summary(m1))
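
# (editor's sketch) optional heteroscedasticity check with the Breusch-Pagan test,
# using the lmtest package referenced (commented out) at the top of this script
if (requireNamespace("lmtest", quietly = TRUE)) print(lmtest::bptest(m1))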

top1 <- most_sig_term(m1)

p1_scatter <- ggplot(dat, aes(x = PROPERTYSQFT, y = PRICE)) +
    geom_point(alpha = 0.35) +
    stat_smooth(method = "lm", se = TRUE) +
    scale_y_continuous(labels = label_comma()) +
    labs(title = "model 1: price vs property sqft with lm fit",
         x = "property sqft", y = "price (usd)")

p1_resid <- augment(m1) |>
    ggplot(aes(x = .fitted, y = .resid)) +
    geom_point(alpha = 0.35) +
    geom_hline(yintercept = 0) +
    labs(title = "model 1: residuals vs fitted", x = "fitted", y = "residuals")

ggsave(file.path(out_dir, "m1_scatter.png"), p1_scatter, width = 7, height = 5, dpi = 150)
ggsave(file.path(out_dir, "m1_residuals.png"), p1_resid, width = 7, height = 5, dpi = 150)

# model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH
m2 <- lm(PRICE ~ PROPERTYSQFT + BEDS + BATH, data = dat)
cat("\n==== model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH ====\n")
print(summary(m2))

top2 <- most_sig_term(m2)

xlab2 <- paste0("most significant predictor: ", top2)

p2_scatter <- ggplot(dat, aes_string(x = top2, y = "PRICE")) +
    geom_point(alpha = 0.35) +
    stat_smooth(method = "lm", se = TRUE) +
    scale_y_continuous(labels = label_comma()) +
    labs(title = "model 2: price vs most significant predictor with lm fit",
         x = xlab2, y = "price (usd)")

p2_resid <- augment(m2) |>
    ggplot(aes(x = .fitted, y = .resid)) +
    geom_point(alpha = 0.35) +
    geom_hline(yintercept = 0) +
    labs(title = "model 2: residuals vs fitted", x = "fitted", y = "residuals")

ggsave(file.path(out_dir, "m2_scatter.png"), p2_scatter, width = 7, height = 5, dpi = 150)
ggsave(file.path(out_dir, "m2_residuals.png"), p2_resid, width = 7, height = 5, dpi = 150)

# model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH
m3 <- lm(log_PRICE ~ log_SQFT + BEDS + BATH, data = dat)
cat("\n==== model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH ====\n")
print(summary(m3))

top3 <- most_sig_term(m3)

# price vs top predictor, overlaid with the back-transformed fit -- other predictors held at their medians
meds <- dat |>
    summarise(
        PROPERTYSQFT = median(PROPERTYSQFT, na.rm = TRUE),
        BEDS = median(BEDS, na.rm = TRUE),
        BATH = median(BATH, na.rm = TRUE)
    )

grid <- tibble::tibble(
    x = seq(min(dat[[top3]], na.rm = TRUE), max(dat[[top3]], na.rm = TRUE), length.out = 200)
)

nd <- meds[rep(1, nrow(grid)), ]
nd[[top3]] <- grid$x
# derive log_SQFT from the (median) sqft unless sqft itself is the predictor being varied
if (top3 != "log_SQFT") nd$log_SQFT <- log(nd$PROPERTYSQFT)

pred_log <- predict(m3, newdata = nd, se.fit = TRUE)
nd$PRICE_hat <- exp(pred_log$fit) # back-transform to the dollar scale
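
# (editor's note, optional) exp() of the fitted log-price understates the conditional mean
# on the dollar scale; a common lognormal-style correction adds sigma^2/2 on the log scale
# before exponentiating (assumes roughly normal residuals for m3)
nd$PRICE_hat_adj <- exp(pred_log$fit + summary(m3)$sigma^2 / 2)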

p3_scatter <- ggplot(dat, aes_string(x = top3, y = "PRICE")) +
    geom_point(alpha = 0.35) +
    geom_line(data = nd, aes_string(x = top3, y = "PRICE_hat"), linewidth = 1) +
    scale_y_continuous(labels = label_comma()) +
    labs(title = "model 3: price vs most significant predictor (back-transformed fit)",
         x = paste0("most significant predictor: ", top3), y = "price (usd)")

p3_resid <- augment(m3) |>
    ggplot(aes(x = .fitted, y = .resid)) +
    geom_point(alpha = 0.35) +
    geom_hline(yintercept = 0) +
    labs(title = "model 3: residuals vs fitted (log scale)", x = "fitted (log price)", y = "residuals")

ggsave(file.path(out_dir, "m3_scatter.png"), p3_scatter, width = 7, height = 5, dpi = 150)
ggsave(file.path(out_dir, "m3_residuals.png"), p3_resid, width = 7, height = 5, dpi = 150)

# model comparison table
compare <- tibble::tibble(
    model = c("PRICE ~ PROPERTYSQFT",
              "PRICE ~ PROPERTYSQFT + BEDS + BATH",
              "log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH"),
    r2 = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared),
    adj_r2 = c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared),
    aic = c(AIC(m1), AIC(m2), AIC(m3)),
    bic = c(BIC(m1), BIC(m2), BIC(m3)),
    top_var = c(top1, top2, top3)
)
print(compare)

readr::write_csv(compare, file.path(out_dir, "model_comparison.csv"))
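
# (editor's note) r2/aic/bic for model 3 are computed on the log(PRICE) scale, so they are
# not directly comparable to models 1-2; a rough common-scale check is RMSE in dollars
# (sketch, assumes dat/m1/m2/m3 as defined above)
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))
print(c(m1 = rmse(dat$PRICE, fitted(m1)),
        m2 = rmse(dat$PRICE, fitted(m2)),
        m3 = rmse(dat$PRICE, exp(fitted(m3)))))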

message("\ndone!")
Binary file not shown (new image, 61 KiB).
Binary file not shown (new image, 138 KiB).
Binary file not shown (new image, 56 KiB).
Binary file not shown (new image, 142 KiB).
Binary file not shown (new image, 132 KiB).
Binary file not shown (new image, 40 KiB).
@@ -0,0 +1,4 @@
model,r2,adj_r2,aic,bic,top_var
PRICE ~ PROPERTYSQFT,0.28368247344839936,0.2835263109180851,146322.7288184832,146342.0230707264,PROPERTYSQFT
PRICE ~ PROPERTYSQFT + BEDS + BATH,0.3385405215516886,0.33810772363776387,145961.10108380177,145993.25817087374,PROPERTYSQFT
log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH,0.4401401151381639,0.43977379460281263,9572.876890519634,9605.033977591607,BATH
Vendored
+7
@@ -0,0 +1,7 @@
{
    "[r]": {
        "editor.indentSize": "tabSize",
        "editor.tabSize": 4,
        "editor.useTabStops": true
    }
}
@@ -0,0 +1,16 @@
####################################
##### Abalone Data Preparation #####
####################################

# read dataset
abalone.data <- read.csv("Courses/Data Analytics/Fall25/labs/lab 3/abalone_dataset.csv")

## add new column age.group with 3 values based on the number of rings
abalone.data$age.group <- cut(abalone.data$rings, breaks = c(0, 8, 11, 35), labels = c("young", "adult", "old"))

## alternative way of setting age.group
abalone.data$age.group[abalone.data$rings <= 8] <- "young"
abalone.data$age.group[abalone.data$rings > 8 & abalone.data$rings <= 11] <- "adult"
abalone.data$age.group[abalone.data$rings > 11 & abalone.data$rings <= 35] <- "old"
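
# (editor's sketch) quick check that the grouping worked and the two approaches agree;
# assumes abalone.data$age.group as created above
table(abalone.data$age.group, useNA = "ifany")
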
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -0,0 +1,7 @@
lab 3 results

chosen feature subset (by initial k=5): small features: length, diameter, height
best k (k-NN tuning): 17 (accuracy = 0.6094)
kmeans best k (silhouette): 2
pam best k (silhouette): 2
+265
@@ -0,0 +1,265 @@
# lab 3: (A) bologna

# I NEEDED TO INSTALL FORTRAN BS FOR THIS LMAO
# sudo pacman -S --needed base-devel gcc-fortran lapack openblas libxml2 curl openssl

# packages (install if necessary)
required_pkgs <- c("dplyr", "ggplot2", "caret", "class", "cluster", "factoextra", "gridExtra")
for (p in required_pkgs) {
    if (!requireNamespace(p, quietly = TRUE)) {
        install.packages(p, repos = "https://cloud.r-project.org", dependencies = TRUE)
    }
    library(p, character.only = TRUE)
}

# path handling: prefer the uploaded path if present
uploaded_path <- "/home/ion606/Desktop/Data Analytics/Lab 3/abalone_dataset.csv"
fallback_path <- "/home/ion606/Desktop/Data Analytics/Lab 3/abalone_dataset.csv"
data_path <- if (file.exists(uploaded_path)) uploaded_path else fallback_path

# read dataset
abalone.data <- read.csv(data_path, stringsAsFactors = FALSE)

# canonicalize column names to predictable lower-case tokens
names(abalone.data) <- tolower(gsub("[[:space:]]+", ".", names(abalone.data)))

# fail fast if the rings column is missing
if (!"rings" %in% names(abalone.data)) {
    stop("could not find 'rings' column in dataset. column names found: ", paste(names(abalone.data), collapse = ", "))
}

print(names(abalone.data))

# old code but I left it here anyways
# abalone.data$age.group[abalone.data$rings <= 8] <- "young"
# abalone.data$age.group[abalone.data$rings > 8 & abalone.data$rings <= 11] <- "adult"
# abalone.data$age.group[abalone.data$rings > 11 & abalone.data$rings <= 35] <- "old"
# abalone.data$age.group <- factor(abalone.data$age.group, levels = c("young", "adult", "old"))


# new code
abalone.data$age.group <- cut(abalone.data$rings, breaks = c(0, 8, 11, 35),
                              labels = c("young", "adult", "old"),
                              right = TRUE, include.lowest = TRUE)
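
# (editor's note) with right = TRUE the intervals are [0,8], (8,11], (11,35]
# (include.lowest keeps the lowest break in the first bin), so rings of exactly 8 land in
# "young" and exactly 11 in "adult", matching the manual assignment in the prep script above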

if ("sex" %in% names(abalone.data)) {
    abalone.data$sex <- as.factor(abalone.data$sex)
}

# preview
cat("dataset dims:", dim(abalone.data), "\n")
cat("columns:", paste(names(abalone.data), collapse = ", "), "\n")

expected_num_cols <- c("length", "diameter", "height",
                       "whole.weight", "shucked.weight",
                       "viscera.weight", "shell.weight")


num_cols_present <- intersect(expected_num_cols, names(abalone.data))
if (length(num_cols_present) < 3) {
    stop("expected at least three numeric measurement columns; found ", paste(num_cols_present, collapse = ", "))
}

# feature subsets
features_full <- num_cols_present # all numeric measurement columns
features_small <- intersect(c("length", "diameter", "height"), names(abalone.data)) # size-only subset

cat("using features (full):", paste(features_full, collapse = ", "), "\n")
cat("using features (small):", paste(features_small, collapse = ", "), "\n")

# data split
set.seed(123)
train_index <- createDataPartition(abalone.data$age.group, p = 0.7, list = FALSE)
train_df <- abalone.data[train_index, , drop = FALSE]
test_df <- abalone.data[-train_index, , drop = FALSE]

# helper to scale numeric features / return matrix + labels
scale_features <- function(df, feature_names, center = NULL, scale = NULL) {
    mat <- as.data.frame(df[, feature_names, drop = FALSE])

    # compute center/scale unless provided (keeps train/test scaling consistent)
    if (is.null(center)) {
        center <- sapply(mat, mean, na.rm = TRUE)
    }

    if (is.null(scale)) {
        scale <- sapply(mat, sd, na.rm = TRUE)
        # avoid zero sd
        scale[scale == 0] <- 1
    }

    scaled <- as.data.frame(scale(mat, center = center, scale = scale))
    list(scaled = scaled, center = center, scale = scale)
}

# scale train/test for both feature sets
train_full_scaled <- scale_features(train_df, features_full)
test_full_scaled <- scale_features(test_df, features_full, center = train_full_scaled$center, scale = train_full_scaled$scale)

train_small_scaled <- scale_features(train_df, features_small)
test_small_scaled <- scale_features(test_df, features_small, center = train_small_scaled$center, scale = train_small_scaled$scale)

# labels for knn
train_labels <- train_df$age.group
test_labels <- test_df$age.group

# two kNN models (initial comparison)
library(class) # knn()

# pick an initial k (odd)
k_init <- 5

knn_predict_and_confmat <- function(train_mat, test_mat, train_labels, test_labels, k) {
    pred <- knn(train = as.matrix(train_mat), test = as.matrix(test_mat), cl = train_labels, k = k)
    cm <- confusionMatrix(pred, test_labels)
    list(pred = pred, confmat = cm)
}

res_full_init <- knn_predict_and_confmat(train_full_scaled$scaled, test_full_scaled$scaled, train_labels, test_labels, k_init)
res_small_init <- knn_predict_and_confmat(train_small_scaled$scaled, test_small_scaled$scaled, train_labels, test_labels, k_init)

cat("\ninitial results (k =", k_init, ")\n")
cat("full-features accuracy:", res_full_init$confmat$overall["Accuracy"], "\n")
print(res_full_init$confmat$table)
cat("small-features accuracy:", res_small_init$confmat$overall["Accuracy"], "\n")
print(res_small_init$confmat$table)

# choose better performing feature subset (by accuracy)
acc_full <- as.numeric(res_full_init$confmat$overall["Accuracy"])
acc_small <- as.numeric(res_small_init$confmat$overall["Accuracy"])
if (acc_full >= acc_small) {
    best_features <- features_full
    best_train_scaled <- train_full_scaled
    best_test_scaled <- test_full_scaled
    chosen_tag <- "full"
} else {
    best_features <- features_small
    best_train_scaled <- train_small_scaled
    best_test_scaled <- test_small_scaled
    chosen_tag <- "small"
}
cat("\nchosen feature subset for tuning:", chosen_tag, "(", paste(best_features, collapse = ", "), ")\n")

# optimal k for best performing subset
k_values <- seq(1, 25, by = 2) # odd ks
accuracy_by_k <- numeric(length(k_values))
names(accuracy_by_k) <- k_values

for (i in seq_along(k_values)) {
    k <- k_values[i]
    tmp <- knn(train = as.matrix(best_train_scaled$scaled),
               test = as.matrix(best_test_scaled$scaled),
               cl = train_labels,
               k = k)
    cm <- confusionMatrix(tmp, test_labels)
    accuracy_by_k[i] <- as.numeric(cm$overall["Accuracy"])
}

best_k_idx <- which.max(accuracy_by_k)
best_k <- k_values[best_k_idx]
cat("\naccuracy_by_k:\n")
print(round(accuracy_by_k, 4))
cat("\nbest k:", best_k, "with accuracy", round(accuracy_by_k[best_k_idx], 4), "\n")
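
# (editor's sketch) tuning k on the test set uses it twice; a leakage-free alternative is
# cross-validation on the training data, e.g. with caret (assumes train_df, best_features,
# and train_labels as defined above)
cv_fit <- caret::train(x = train_df[, best_features, drop = FALSE], y = train_labels,
                       method = "knn", preProcess = c("center", "scale"),
                       trControl = caret::trainControl(method = "cv", number = 5),
                       tuneGrid = data.frame(k = seq(1, 25, by = 2)))
cat("cross-validated best k (sketch):", cv_fit$bestTune$k, "\n")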

# final model with best_k
final_knn <- knn(train = as.matrix(best_train_scaled$scaled),
                 test = as.matrix(best_test_scaled$scaled),
                 cl = train_labels,
                 k = best_k)

final_cm <- confusionMatrix(final_knn, test_labels)
cat("\nfinal confusion matrix (best k):\n")
print(final_cm)

# per-class metrics
print(final_cm$byClass)

# summary plots go to a single pdf
output_pdf <- "lab3_output.pdf"
pdf(output_pdf, width = 10, height = 7)

# accuracy vs k
plot(k_values, accuracy_by_k, type = "b", pch = 19, xlab = "k (odd)", ylab = "accuracy", main = paste("k-NN accuracy (chosen subset:", chosen_tag, ")"))
grid()

# Exercise 2: clustering (k-means and pam) using best feature subset

# use scaled dataset (all observations) for clustering
# scale using full population mean/sd
all_scaled_res <- scale_features(abalone.data, best_features)
all_scaled <- all_scaled_res$scaled

# use fviz_nbclust for optimal K using silhouette
# cap the number of clusters at 10 (or 2*sqrt(n), whichever is smaller)
k_max <- min(10, floor(sqrt(nrow(all_scaled)) * 2))

# silhouette
factoextra::fviz_nbclust(all_scaled, kmeans, method = "silhouette") + ggtitle("fviz_nbclust: silhouette (kmeans)")

# show elbow
factoextra::fviz_nbclust(all_scaled, kmeans, method = "wss") + ggtitle("fviz_nbclust: wss (kmeans)")

# pick K by the maximum average silhouette
avg_sil <- numeric(k_max - 1)
for (k in 2:k_max) {
    km_tmp <- kmeans(all_scaled, centers = k, nstart = 25)
    sil <- cluster::silhouette(km_tmp$cluster, dist(all_scaled))
    avg_sil[k - 1] <- mean(sil[, 3])
}
k_values_clust <- 2:k_max
best_k_clust <- k_values_clust[which.max(avg_sil)]
cat("\navg silhouette by k (kmeans):\n")
print(data.frame(k = k_values_clust, avg_silhouette = round(avg_sil, 4)))
cat("\nchosen best k for kmeans (max avg silhouette):", best_k_clust, "\n")

# run kmeans with best_k_clust and plot silhouette
km_final <- kmeans(all_scaled, centers = best_k_clust, nstart = 25)
sil_km <- cluster::silhouette(km_final$cluster, dist(all_scaled))
factoextra::fviz_silhouette(sil_km) + ggtitle(paste("kmeans silhouette (k=", best_k_clust, ")", sep = ""))

# run pam with same range and pick best k for pam by avg silhouette
avg_sil_pam <- numeric(k_max - 1)
for (k in 2:k_max) {
    pam_tmp <- cluster::pam(all_scaled, k = k)
    avg_sil_pam[k - 1] <- pam_tmp$silinfo$avg.width
}

best_k_pam <- k_values_clust[which.max(avg_sil_pam)]
cat("\navg silhouette by k (pam):\n")
print(data.frame(k = k_values_clust, avg_silhouette = round(avg_sil_pam, 4)))
cat("\nchosen best k for pam (max avg silhouette):", best_k_pam, "\n")

# run pam for best_k_pam/show silhouette plot
pam_final <- cluster::pam(all_scaled, k = best_k_pam)
factoextra::fviz_silhouette(pam_final) + ggtitle(paste("pam silhouette (k=", best_k_pam, ")", sep = ""))

# also plot the clusters on a 2-d PCA scatter (colored by cluster) for kmeans and pam
pca_res <- prcomp(all_scaled, center = TRUE, scale. = FALSE)
pcs <- data.frame(pca_res$x[, 1:2])
pcs$kmeans_cluster <- factor(km_final$cluster)
pcs$pam_cluster <- factor(pam_final$clustering)

# kmeans PCA plot
ggplot(pcs, aes(x = PC1, y = PC2, color = kmeans_cluster)) + geom_point(alpha = 0.6) + ggtitle(paste("kmeans clusters (k=", best_k_clust, ")", sep = ""))

# pam PCA plot
ggplot(pcs, aes(x = PC1, y = PC2, color = pam_cluster)) + geom_point(alpha = 0.6) + ggtitle(paste("pam clusters (k=", best_k_pam, ")", sep = ""))

# close the pdf device
dev.off()

cat("plots and clustering/kNN visuals saved to", output_pdf, "\n")
cat("final chosen k for kNN:", best_k, "\n")
cat("final chosen k for kmeans:", best_k_clust, "\n")
cat("final chosen k for pam:", best_k_pam, "\n")

# write the short results summary (yes I am lazy thx)
summary_txt <- paste0(
    "lab 3 results\n\n",
    "chosen feature subset (by initial k=", k_init, "): ", chosen_tag, " features: ", paste(best_features, collapse = ", "), "\n",
    "best k (k-NN tuning): ", best_k, " (accuracy = ", round(accuracy_by_k[best_k_idx], 4), ")\n",
    "kmeans best k (silhouette): ", best_k_clust, "\n",
    "pam best k (silhouette): ", best_k_pam, "\n"
)

writeLines(summary_txt, con = "lab3_summary.txt")