Compare commits
5 Commits
87d0c4bbb5
...
lab3
| Author | SHA1 | Date | |
|---|---|---|---|
| df21d7f281 | |||
| 6dbe45c975 | |||
| 170fee98d5 | |||
| 562986db3b | |||
| 8fe1967319 |
+20
-27
@@ -1,12 +1,13 @@
|
|||||||
library(readr)
|
library(readr)
|
||||||
library(EnvStats)
|
library(EnvStats)
|
||||||
|
library(nortest)
|
||||||
|
|
||||||
# install.packages(c("readr", "EnvStats"))
|
# install.packages(c("readr", "EnvStats"))
|
||||||
|
|
||||||
# set working directory (relative path)
|
# set working directory (relative path)
|
||||||
|
setwd("~/Desktop/Data Analytics/Lab 1")
|
||||||
|
|
||||||
# paste function my beloved <3
|
pdf("all_plots.pdf", width = 8, height = 6)
|
||||||
setwd(paste(getwd(), "Lab 1", sep="/"))
|
|
||||||
|
|
||||||
# read data
|
# read data
|
||||||
epi.data <- read_csv("epi_results_2024_pop_gdp.csv")
|
epi.data <- read_csv("epi_results_2024_pop_gdp.csv")
|
||||||
@@ -20,9 +21,7 @@ summary(epi.data$epi_results_2024_pop_gdp.csv.new)
|
|||||||
# print values in variable
|
# print values in variable
|
||||||
epi.data$RLI.new
|
epi.data$RLI.new
|
||||||
|
|
||||||
|
# AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
|
||||||
######## Optional ########
|
|
||||||
## If you want to reference the variable without using the dataframe:
|
|
||||||
|
|
||||||
# attach dataframe
|
# attach dataframe
|
||||||
attach(epi.data)
|
attach(epi.data)
|
||||||
@@ -46,6 +45,15 @@ PHL <- epi.data$PHL.new
|
|||||||
|
|
||||||
PHL
|
PHL
|
||||||
|
|
||||||
|
# no NAs
|
||||||
|
RLI_noNA <- epi.data$RLI.new[!is.na(epi.data$RLI.new)];
|
||||||
|
PHL_noNA <- epi.data$PHL.new[!is.na(epi.data$PHL.new)];
|
||||||
|
|
||||||
|
set.seed(1);
|
||||||
|
RLI_sub <- sample(RLI_noNA, size = min(180, length(RLI_noNA)));
|
||||||
|
RLI_new_sub <- RLI_sub; # only if you truly need a second alias
|
||||||
|
|
||||||
|
|
||||||
# find NAs inv variavle - outputs vector of logical values, true if NA, false otherwise
|
# find NAs inv variavle - outputs vector of logical values, true if NA, false otherwise
|
||||||
NAs <- is.na(PHL)
|
NAs <- is.na(PHL)
|
||||||
|
|
||||||
@@ -75,25 +83,8 @@ boxplot(RLI, PHL.above30, names = c("RHI","PHL"))
|
|||||||
# hist(RLI)
|
# hist(RLI)
|
||||||
|
|
||||||
# define sequence of values over which to plot histogram
|
# define sequence of values over which to plot histogram
|
||||||
# I have NO IDEA why this keep breaking but I just started using the range func
|
x <- seq(0., 100., 10)
|
||||||
rng <- range(RLI, na.rm = TRUE)
|
|
||||||
lo <- floor(rng[1] / 5) * 5
|
|
||||||
hi <- ceiling(rng[2] / 5) * 5
|
|
||||||
brks <- seq(lo, hi, by = 1)
|
|
||||||
|
|
||||||
# WHY????? WHY IS IT BREAKING????
|
|
||||||
# [1] "range 0 lo 0 hi 100 brks 50" "range 97.7 lo 0 hi 100 brks 50"
|
|
||||||
# Error in freq && !equidist : 'length = 15' in coercion to 'logical(1)'
|
|
||||||
# Calls: hist -> hist.default -> plot -> plot.histogram
|
|
||||||
# Execution halted
|
|
||||||
print(paste("range", rng, "lo", lo, "hi", hi, "brks", brks))
|
|
||||||
|
|
||||||
hist(RLI,
|
|
||||||
breaks = brks,
|
|
||||||
prob = TRUE)
|
|
||||||
|
|
||||||
x <- seq(20, 90, by = 5)
|
|
||||||
|
|
||||||
# histogram (frequency distribution) over range
|
# histogram (frequency distribution) over range
|
||||||
hist(RLI, x, breaks=brks, prob=TRUE)
|
hist(RLI, x, breaks=brks, prob=TRUE)
|
||||||
|
|
||||||
@@ -103,7 +94,7 @@ lines(density(RLI, na.rm=TRUE)) # or try bw=“SJ”
|
|||||||
# print rug
|
# print rug
|
||||||
rug(RLI)
|
rug(RLI)
|
||||||
|
|
||||||
x <- seq(5., 95., 5)
|
x <- seq(0., 100., 5)
|
||||||
|
|
||||||
# histogram (frequency distribution) over rabge
|
# histogram (frequency distribution) over rabge
|
||||||
hist(RLI, breaks = "FD", prob=TRUE)
|
hist(RLI, breaks = "FD", prob=TRUE)
|
||||||
@@ -156,8 +147,8 @@ qqnorm(x); qqline(x)
|
|||||||
|
|
||||||
|
|
||||||
# print quantile-quantile plot for variable with any theoretical distribution
|
# print quantile-quantile plot for variable with any theoretical distribution
|
||||||
qqplot(rnorm(180), RLI.sub, xlab = "Q-Q plot for norm dsn")
|
qqplot(rnorm(180), RLI_sub, xlab = "Q-Q plot for norm dsn")
|
||||||
qqline(RLI.sub)
|
qqline(RLI_sub)
|
||||||
|
|
||||||
# print quantile-quantile plot for 2 variables
|
# print quantile-quantile plot for 2 variables
|
||||||
qqplot(RLI, PHL, xlab = "Q-Q plot for RHI vs PHL")
|
qqplot(RLI, PHL, xlab = "Q-Q plot for RHI vs PHL")
|
||||||
@@ -191,3 +182,5 @@ wilcox.test(x,y)
|
|||||||
|
|
||||||
var.test(x,y)
|
var.test(x,y)
|
||||||
t.test(x,y)
|
t.test(x,y)
|
||||||
|
|
||||||
|
dev.off()
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
other/
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,166 @@
|
|||||||
|
# die
|
||||||
|
suppressPackageStartupMessages({
|
||||||
|
library(ggplot2)
|
||||||
|
library(dplyr)
|
||||||
|
library(readr)
|
||||||
|
library(broom) # for augment/tidy
|
||||||
|
library(scales) # for label_comma
|
||||||
|
library(tidyr) # for crossing
|
||||||
|
# library(lmtest) # bp test
|
||||||
|
# library(sandwich) # good(er) ses
|
||||||
|
})
|
||||||
|
|
||||||
|
setwd("/home/ion606/Desktop/Data Analytics/Lab 2")
|
||||||
|
|
||||||
|
# configuration
|
||||||
|
data_path <- "NY-House-Dataset.csv"
|
||||||
|
out_dir <- "outputs"
|
||||||
|
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
|
||||||
|
|
||||||
|
# load
|
||||||
|
raw <- read_csv(file = data_path, show_col_types = FALSE)
|
||||||
|
|
||||||
|
# drop missing
|
||||||
|
df <- raw |>
|
||||||
|
transmute(
|
||||||
|
PRICE = as.numeric(PRICE),
|
||||||
|
PROPERTYSQFT = as.numeric(PROPERTYSQFT),
|
||||||
|
BEDS = as.numeric(BEDS),
|
||||||
|
BATH = as.numeric(BATH)
|
||||||
|
) |>
|
||||||
|
filter(is.finite(PRICE), is.finite(PROPERTYSQFT), is.finite(BEDS), is.finite(BATH))
|
||||||
|
|
||||||
|
# basic summaries
|
||||||
|
summary(df)
|
||||||
|
fivenum(df$PRICE, na.rm = TRUE)
|
||||||
|
|
||||||
|
# no outliters with 1%/99% quantiles
|
||||||
|
quant_trim <- function(x, lo = 0.01, hi = 0.99) {
|
||||||
|
qs <- quantile(x, probs = c(lo, hi), na.rm = TRUE, names = FALSE)
|
||||||
|
x >= qs[1] & x <= qs[2]
|
||||||
|
}
|
||||||
|
|
||||||
|
keep <- quant_trim(df$PRICE) & quant_trim(df$PROPERTYSQFT) & quant_trim(df$BEDS) & quant_trim(df$BATH)
|
||||||
|
|
||||||
|
dat <- df[keep, , drop = FALSE] |>
|
||||||
|
filter(PROPERTYSQFT > 0, PRICE > 0, BEDS >= 0, BATH >= 0) |>
|
||||||
|
mutate(
|
||||||
|
log_PRICE = log(PRICE),
|
||||||
|
log_SQFT = log(PROPERTYSQFT)
|
||||||
|
)
|
||||||
|
|
||||||
|
# helper to get the most significant non-intercept term (TODO: ASK PROF ABOUT THIS)
|
||||||
|
most_sig_term <- function(model) {
|
||||||
|
tt <- broom::tidy(model) |>
|
||||||
|
dplyr::filter(term != "(Intercept)") |>
|
||||||
|
dplyr::arrange(p.value)
|
||||||
|
if (nrow(tt) == 0) return(NA_character_)
|
||||||
|
tt$term[1]
|
||||||
|
}
|
||||||
|
|
||||||
|
# model 1: PRICE ~ PROPERTYSQFT
|
||||||
|
m1 <- lm(PRICE ~ PROPERTYSQFT, data = dat)
|
||||||
|
cat("\n==== model 1: PRICE ~ PROPERTYSQFT ====\n")
|
||||||
|
print(summary(m1))
|
||||||
|
|
||||||
|
top1 <- most_sig_term(m1)
|
||||||
|
|
||||||
|
p1_scatter <- ggplot(dat, aes(x = PROPERTYSQFT, y = PRICE)) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
stat_smooth(method = "lm", se = TRUE) +
|
||||||
|
scale_y_continuous(labels = label_comma()) +
|
||||||
|
labs(title = "model 1: price vs property sqft with lm fit",
|
||||||
|
x = "property sqft", y = "price (usd)")
|
||||||
|
|
||||||
|
p1_resid <- augment(m1) |>
|
||||||
|
ggplot(aes(x = .fitted, y = .resid)) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
geom_hline(yintercept = 0) +
|
||||||
|
labs(title = "model 1: residuals vs fitted", x = "fitted", y = "residuals")
|
||||||
|
|
||||||
|
ggsave(file.path(out_dir, "m1_scatter.png"), p1_scatter, width = 7, height = 5, dpi = 150)
|
||||||
|
ggsave(file.path(out_dir, "m1_residuals.png"), p1_resid, width = 7, height = 5, dpi = 150)
|
||||||
|
|
||||||
|
# model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH
|
||||||
|
m2 <- lm(PRICE ~ PROPERTYSQFT + BEDS + BATH, data = dat)
|
||||||
|
cat("\n==== model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH ====\n")
|
||||||
|
print(summary(m2))
|
||||||
|
|
||||||
|
top2 <- most_sig_term(m2)
|
||||||
|
|
||||||
|
xlab2 <- paste0("most significant predictor: ", top2)
|
||||||
|
|
||||||
|
p2_scatter <- ggplot(dat, aes_string(x = top2, y = "PRICE")) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
stat_smooth(method = "lm", se = TRUE) +
|
||||||
|
scale_y_continuous(labels = label_comma()) +
|
||||||
|
labs(title = "model 2: price vs most significant predictor with lm fit",
|
||||||
|
x = xlab2, y = "price (usd)")
|
||||||
|
|
||||||
|
p2_resid <- augment(m2) |>
|
||||||
|
ggplot(aes(x = .fitted, y = .resid)) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
geom_hline(yintercept = 0) +
|
||||||
|
labs(title = "model 2: residuals vs fitted", x = "fitted", y = "residuals")
|
||||||
|
|
||||||
|
ggsave(file.path(out_dir, "m2_scatter.png"), p2_scatter, width = 7, height = 5, dpi = 150)
|
||||||
|
ggsave(file.path(out_dir, "m2_residuals.png"), p2_resid, width = 7, height = 5, dpi = 150)
|
||||||
|
|
||||||
|
# model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH
|
||||||
|
m3 <- lm(log_PRICE ~ log_SQFT + BEDS + BATH, data = dat)
|
||||||
|
cat("\n==== model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH ====\n")
|
||||||
|
print(summary(m3))
|
||||||
|
|
||||||
|
top3 <- most_sig_term(m3)
|
||||||
|
|
||||||
|
# price vs top predictor, overlay w/back-transformed fit -- hold other predictors at medians
|
||||||
|
meds <- dat |>
|
||||||
|
summarise(
|
||||||
|
PROPERTYSQFT = median(PROPERTYSQFT, na.rm = TRUE),
|
||||||
|
BEDS = median(BEDS, na.rm = TRUE),
|
||||||
|
BATH = median(BATH, na.rm = TRUE)
|
||||||
|
)
|
||||||
|
|
||||||
|
grid <- tibble::tibble(
|
||||||
|
x = seq(min(dat[[top3]], na.rm = TRUE), max(dat[[top3]], na.rm = TRUE), length.out = 200)
|
||||||
|
)
|
||||||
|
|
||||||
|
nd <- meds[rep(1, nrow(grid)), ]
|
||||||
|
nd[[top3]] <- grid$x
|
||||||
|
nd$log_SQFT <- log(nd$PROPERTYSQFT)
|
||||||
|
|
||||||
|
pred_log <- predict(m3, newdata = nd, se.fit = TRUE)
|
||||||
|
nd$PRICE_hat <- exp(pred_log$fit) # back-transform
|
||||||
|
|
||||||
|
p3_scatter <- ggplot(dat, aes_string(x = top3, y = "PRICE")) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
geom_line(data = nd, aes_string(x = top3, y = "PRICE_hat"), linewidth = 1) +
|
||||||
|
scale_y_continuous(labels = label_comma()) +
|
||||||
|
labs(title = "model 3: price vs most significant predictor (back-transformed fit)",
|
||||||
|
x = paste0("most significant predictor: ", top3), y = "price (usd)")
|
||||||
|
|
||||||
|
p3_resid <- augment(m3) |>
|
||||||
|
ggplot(aes(x = .fitted, y = .resid)) +
|
||||||
|
geom_point(alpha = 0.35) +
|
||||||
|
geom_hline(yintercept = 0) +
|
||||||
|
labs(title = "model 3: residuals vs fitted (log scale)", x = "fitted (log price)", y = "residuals")
|
||||||
|
|
||||||
|
ggsave(file.path(out_dir, "m3_scatter.png"), p3_scatter, width = 7, height = 5, dpi = 150)
|
||||||
|
ggsave(file.path(out_dir, "m3_residuals.png"), p3_resid, width = 7, height = 5, dpi = 150)
|
||||||
|
|
||||||
|
# comp table
|
||||||
|
compare <- tibble::tibble(
|
||||||
|
model = c("PRICE ~ PROPERTYSQFT",
|
||||||
|
"PRICE ~ PROPERTYSQFT + BEDS + BATH",
|
||||||
|
"log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH"),
|
||||||
|
r2 = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared),
|
||||||
|
adj_r2 = c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared),
|
||||||
|
aic = c(AIC(m1), AIC(m2), AIC(m3)),
|
||||||
|
bic = c(BIC(m1), BIC(m2), BIC(m3)),
|
||||||
|
top_var = c(top1, top2, top3)
|
||||||
|
)
|
||||||
|
print(compare)
|
||||||
|
|
||||||
|
readr::write_csv(compare, file.path(out_dir, "model_comparison.csv"))
|
||||||
|
|
||||||
|
message("\ndone!")
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 61 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 138 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 56 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 142 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 132 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 40 KiB |
@@ -0,0 +1,4 @@
|
|||||||
|
model,r2,adj_r2,aic,bic,top_var
|
||||||
|
PRICE ~ PROPERTYSQFT,0.28368247344839936,0.2835263109180851,146322.7288184832,146342.0230707264,PROPERTYSQFT
|
||||||
|
PRICE ~ PROPERTYSQFT + BEDS + BATH,0.3385405215516886,0.33810772363776387,145961.10108380177,145993.25817087374,PROPERTYSQFT
|
||||||
|
log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH,0.4401401151381639,0.43977379460281263,9572.876890519634,9605.033977591607,BATH
|
||||||
|
Vendored
+7
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"[r]": {
|
||||||
|
"editor.indentSize": "tabSize",
|
||||||
|
"editor.tabSize": 4,
|
||||||
|
"editor.useTabStops": true
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
####################################
|
||||||
|
##### Abalone Data Preparation #####
|
||||||
|
####################################
|
||||||
|
|
||||||
|
# read dataset
|
||||||
|
abalone.data <- read.csv("Courses/Data Analytics/Fall25/labs/lab 3/abalone_dataset.csv")
|
||||||
|
|
||||||
|
## add new column age.group with 3 values based on the number of rings
|
||||||
|
abalone.data$age.group <- cut(abalone.data$rings, br=c(0,8,11,35), labels = c("young", 'adult', 'old'))
|
||||||
|
|
||||||
|
## alternative way of setting age.group
|
||||||
|
abalone.data$age.group[abalone.data$rings<=8] <- "young"
|
||||||
|
abalone.data$age.group[abalone.data$rings>8 & abalone.data$rings<=11] <- "adult"
|
||||||
|
abalone.data$age.group[abalone.data$rings>11 & abalone.data$rings<=35] <- "old"
|
||||||
|
|
||||||
|
|
||||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,7 @@
|
|||||||
|
lab 3 results
|
||||||
|
|
||||||
|
chosen feature subset (by initial k=5): small features: length, diameter, height
|
||||||
|
best k (k-NN tuning): 17 (accuracy = 0.6094)
|
||||||
|
kmeans best k (silhouette): 2
|
||||||
|
pam best k (silhouette): 2
|
||||||
|
|
||||||
+265
@@ -0,0 +1,265 @@
|
|||||||
|
# lab 3: (A) bologna
|
||||||
|
|
||||||
|
# I NEEDED TO INSTALL FORTRAN BS FOR THIS LMAO
|
||||||
|
# sudo pacman -S --needed base-devel gcc-fortran lapack openblas libxml2 curl openssl
|
||||||
|
|
||||||
|
# packages (install if necessary)
|
||||||
|
required_pkgs <- c("dplyr", "ggplot2", "caret", "class", "cluster", "factoextra", "gridExtra")
|
||||||
|
for (p in required_pkgs) {
|
||||||
|
if (!requireNamespace(p, quietly = TRUE)) {
|
||||||
|
install.packages(p, repos = "https://cloud.r-project.org", dependencies=TRUE)
|
||||||
|
}
|
||||||
|
library(p, character.only = TRUE)
|
||||||
|
}
|
||||||
|
|
||||||
|
# path handling: prefer the uploaded path if present
|
||||||
|
uploaded_path <- "/home/ion606/Desktop/Data Analytics/Lab 3/abalone_dataset.csv"
|
||||||
|
fallback_path <- "/home/ion606/Desktop/Data Analytics/Lab 3/abalone_dataset.csv"
|
||||||
|
data_path <- if (file.exists(uploaded_path)) uploaded_path else fallback_path
|
||||||
|
|
||||||
|
# read dataset
|
||||||
|
abalone.data <- read.csv(data_path, stringsAsFactors = FALSE)
|
||||||
|
|
||||||
|
# canonicalize column names to predictable lower-case tokens
|
||||||
|
names(abalone.data) <- tolower(gsub("[[:space:]]+", ".", names(abalone.data)))
|
||||||
|
|
||||||
|
# if rings column was named differently, try to find it
|
||||||
|
if (!"rings" %in% names(abalone.data)) {
|
||||||
|
stop("could not find 'rings' column in dataset. column names found: ", paste(names(abalone.data), collapse = ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
print(names(abalone.data))
|
||||||
|
|
||||||
|
# old code but I left it here anyways
|
||||||
|
# abalone.data$age.group[abalone.data$rings <= 8] <- "young"
|
||||||
|
# abalone.data$age.group[abalone.data$rings > 8 & abalone.data$rings <= 11] <- "adult"
|
||||||
|
# abalone.data$age.group[abalone.data$rings > 11 & abalone.data$rings <= 35] <- "old"
|
||||||
|
# abalone.data$age.group <- factor(abalone.data$age.group, levels = c("young", "adult", "old"))
|
||||||
|
|
||||||
|
|
||||||
|
# new code
|
||||||
|
abalone.data$age.group <- cut(abalone.data$rings, breaks = c(0, 8, 11, 35),
|
||||||
|
labels = c("young", "adult", "old"),
|
||||||
|
right = TRUE, include.lowest = TRUE)
|
||||||
|
|
||||||
|
if ("sex" %in% names(abalone.data)) {
|
||||||
|
abalone.data$sex <- as.factor(abalone.data$sex)
|
||||||
|
}
|
||||||
|
|
||||||
|
# preview
|
||||||
|
cat("dataset dims:", dim(abalone.data), "\n")
|
||||||
|
cat("columns:", paste(names(abalone.data), collapse = ", "), "\n")
|
||||||
|
|
||||||
|
expected_num_cols <- c("length", "diameter", "height",
|
||||||
|
"whole.weight", "shucked.weight",
|
||||||
|
"viscera.weight", "shell.weight")
|
||||||
|
|
||||||
|
|
||||||
|
num_cols_present <- intersect(expected_num_cols, names(abalone.data))
|
||||||
|
if (length(num_cols_present) < 3) {
|
||||||
|
stop("expected at least three numeric measurement columns; found ", paste(num_cols_present, collapse = ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
# feature subsets
|
||||||
|
features_full <- num_cols_present # numeric
|
||||||
|
features_small <- intersect(c("length","diameter","height"), names(abalone.data)) # subset lmao
|
||||||
|
|
||||||
|
cat("using features (full):", paste(features_full, collapse = ", "), "\n")
|
||||||
|
cat("using features (small):", paste(features_small, collapse = ", "), "\n")
|
||||||
|
|
||||||
|
# data split
|
||||||
|
set.seed(123)
|
||||||
|
train_index <- createDataPartition(abalone.data$age.group, p = 0.7, list = FALSE)
|
||||||
|
train_df <- abalone.data[train_index, , drop = FALSE]
|
||||||
|
test_df <- abalone.data[-train_index, , drop = FALSE]
|
||||||
|
|
||||||
|
# helper to scale numeric features / return matrix + labels
|
||||||
|
scale_features <- function(df, feature_names, center = NULL, scale = NULL) {
|
||||||
|
mat <- as.data.frame(df[, feature_names, drop = FALSE])
|
||||||
|
|
||||||
|
# compute center/scale from provided if present (for train/test separation)
|
||||||
|
if (is.null(center)) {
|
||||||
|
center <- sapply(mat, mean, na.rm = TRUE)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is.null(scale)) {
|
||||||
|
scale <- sapply(mat, sd, na.rm = TRUE)
|
||||||
|
# avoid zero sd
|
||||||
|
scale[scale == 0] <- 1
|
||||||
|
}
|
||||||
|
|
||||||
|
scaled <- as.data.frame(scale(mat, center = center, scale = scale))
|
||||||
|
list(scaled = scaled, center = center, scale = scale)
|
||||||
|
}
|
||||||
|
|
||||||
|
# scale train/test for both feature sets
|
||||||
|
train_full_scaled <- scale_features(train_df, features_full)
|
||||||
|
test_full_scaled <- scale_features(test_df, features_full, center = train_full_scaled$center, scale = train_full_scaled$scale)
|
||||||
|
|
||||||
|
train_small_scaled <- scale_features(train_df, features_small)
|
||||||
|
test_small_scaled <- scale_features(test_df, features_small, center = train_small_scaled$center, scale = train_small_scaled$scale)
|
||||||
|
|
||||||
|
# labels for knn
|
||||||
|
train_labels <- train_df$age.group
|
||||||
|
test_labels <- test_df$age.group
|
||||||
|
|
||||||
|
# 2 kNN models (initial comparison)
|
||||||
|
library(class) # knn()
|
||||||
|
|
||||||
|
# pick an initial k (odd)
|
||||||
|
k_init <- 5
|
||||||
|
|
||||||
|
knn_predict_and_confmat <- function(train_mat, test_mat, train_labels, test_labels, k) {
|
||||||
|
pred <- knn(train = as.matrix(train_mat), test = as.matrix(test_mat), cl = train_labels, k = k)
|
||||||
|
cm <- confusionMatrix(pred, test_labels)
|
||||||
|
list(pred = pred, confmat = cm)
|
||||||
|
}
|
||||||
|
|
||||||
|
res_full_init <- knn_predict_and_confmat(train_full_scaled$scaled, test_full_scaled$scaled, train_labels, test_labels, k_init)
|
||||||
|
res_small_init <- knn_predict_and_confmat(train_small_scaled$scaled, test_small_scaled$scaled, train_labels, test_labels, k_init)
|
||||||
|
|
||||||
|
cat("\ninitial results (k =", k_init, ")\n")
|
||||||
|
cat("full-features accuracy:", res_full_init$confmat$overall["Accuracy"], "\n")
|
||||||
|
print(res_full_init$confmat$table)
|
||||||
|
cat("small-features accuracy:", res_small_init$confmat$overall["Accuracy"], "\n")
|
||||||
|
print(res_small_init$confmat$table)
|
||||||
|
|
||||||
|
# choose better performing feature subset (by accuracy)
|
||||||
|
acc_full <- as.numeric(res_full_init$confmat$overall["Accuracy"])
|
||||||
|
acc_small <- as.numeric(res_small_init$confmat$overall["Accuracy"])
|
||||||
|
if (acc_full >= acc_small) {
|
||||||
|
best_features <- features_full
|
||||||
|
best_train_scaled <- train_full_scaled
|
||||||
|
best_test_scaled <- test_full_scaled
|
||||||
|
chosen_tag <- "full"
|
||||||
|
} else {
|
||||||
|
best_features <- features_small
|
||||||
|
best_train_scaled <- train_small_scaled
|
||||||
|
best_test_scaled <- test_small_scaled
|
||||||
|
chosen_tag <- "small"
|
||||||
|
}
|
||||||
|
cat("\nchosen feature subset for tuning:", chosen_tag, "(", paste(best_features, collapse = ", "), ")\n")
|
||||||
|
|
||||||
|
# optimal k for best performing subset
|
||||||
|
k_values <- seq(1, 25, by = 2) # odd ks
|
||||||
|
accuracy_by_k <- numeric(length(k_values))
|
||||||
|
names(accuracy_by_k) <- k_values
|
||||||
|
|
||||||
|
for (i in seq_along(k_values)) {
|
||||||
|
k <- k_values[i]
|
||||||
|
tmp <- knn(train = as.matrix(best_train_scaled$scaled),
|
||||||
|
test = as.matrix(best_test_scaled$scaled),
|
||||||
|
cl = train_labels,
|
||||||
|
k = k)
|
||||||
|
cm <- confusionMatrix(tmp, test_labels)
|
||||||
|
accuracy_by_k[i] <- as.numeric(cm$overall["Accuracy"])
|
||||||
|
}
|
||||||
|
|
||||||
|
best_k_idx <- which.max(accuracy_by_k)
|
||||||
|
best_k <- k_values[best_k_idx]
|
||||||
|
cat("\naccuracy_by_k:\n")
|
||||||
|
print(round(accuracy_by_k, 4))
|
||||||
|
cat("\nbest k:", best_k, "with accuracy", round(accuracy_by_k[best_k_idx], 4), "\n")
|
||||||
|
|
||||||
|
# final model with best_k
|
||||||
|
final_knn <- knn(train = as.matrix(best_train_scaled$scaled),
|
||||||
|
test = as.matrix(best_test_scaled$scaled),
|
||||||
|
cl = train_labels,
|
||||||
|
k = best_k)
|
||||||
|
|
||||||
|
final_cm <- confusionMatrix(final_knn, test_labels)
|
||||||
|
cat("\nfinal confusion matrix (best k):\n")
|
||||||
|
print(final_cm)
|
||||||
|
|
||||||
|
# per-class
|
||||||
|
print(final_cm$byClass)
|
||||||
|
|
||||||
|
# summary
|
||||||
|
output_pdf <- "lab3_output.pdf"
|
||||||
|
pdf(output_pdf, width = 10, height = 7)
|
||||||
|
|
||||||
|
# accuracy vs k
|
||||||
|
plot(k_values, accuracy_by_k, type = "b", pch = 19, xlab = "k (odd)", ylab = "accuracy", main = paste("k-NN accuracy (chosen subset:", chosen_tag, ")"))
|
||||||
|
grid()
|
||||||
|
|
||||||
|
# Exercise 2: clustering (k-means and pam) using best feature subset
|
||||||
|
|
||||||
|
# use scaled dataset (all observations) for clustering
|
||||||
|
# scale using full population mean/sd
|
||||||
|
all_scaled_res <- scale_features(abalone.data, best_features)
|
||||||
|
all_scaled <- all_scaled_res$scaled
|
||||||
|
|
||||||
|
# use fviz_nbclust for optimal K using silhouette
|
||||||
|
# < (k = 10 or sqrt(n))
|
||||||
|
k_max <- min(10, floor(sqrt(nrow(all_scaled)) * 2))
|
||||||
|
|
||||||
|
# silhouette
|
||||||
|
factoextra::fviz_nbclust(all_scaled, kmeans, method = "silhouette") + ggtitle("fviz_nbclust: silhouette (kmeans)")
|
||||||
|
|
||||||
|
# show elbow
|
||||||
|
factoextra::fviz_nbclust(all_scaled, kmeans, method = "wss") + ggtitle("fviz_nbclust: wss (kmeans)")
|
||||||
|
|
||||||
|
# pick K by the maximum average silhouette
|
||||||
|
avg_sil <- numeric(k_max - 1)
|
||||||
|
for (k in 2:k_max) {
|
||||||
|
km_tmp <- kmeans(all_scaled, centers = k, nstart = 25)
|
||||||
|
sil <- cluster::silhouette(km_tmp$cluster, dist(all_scaled))
|
||||||
|
avg_sil[k - 1] <- mean(sil[, 3])
|
||||||
|
}
|
||||||
|
k_values_clust <- 2:k_max
|
||||||
|
best_k_clust <- k_values_clust[which.max(avg_sil)]
|
||||||
|
cat("\navg silhouette by k (kmeans):\n")
|
||||||
|
print(data.frame(k = k_values_clust, avg_silhouette = round(avg_sil, 4)))
|
||||||
|
cat("\nchosen best k for kmeans (max avg silhouette):", best_k_clust, "\n")
|
||||||
|
|
||||||
|
# run kmeans with best_k_clust and plot silhouette
|
||||||
|
km_final <- kmeans(all_scaled, centers = best_k_clust, nstart = 25)
|
||||||
|
sil_km <- cluster::silhouette(km_final$cluster, dist(all_scaled))
|
||||||
|
factoextra::fviz_silhouette(sil_km) + ggtitle(paste("kmeans silhouette (k=", best_k_clust, ")", sep = ""))
|
||||||
|
|
||||||
|
# run pam with same range and pick best k for pam by avg silhouette
|
||||||
|
avg_sil_pam <- numeric(k_max - 1)
|
||||||
|
for (k in 2:k_max) {
|
||||||
|
pam_tmp <- cluster::pam(all_scaled, k = k)
|
||||||
|
avg_sil_pam[k - 1] <- mean(pam_tmp$silinfo$avg.width)
|
||||||
|
}
|
||||||
|
|
||||||
|
best_k_pam <- k_values_clust[which.max(avg_sil_pam)]
|
||||||
|
cat("\navg silhouette by k (pam):\n")
|
||||||
|
print(data.frame(k = k_values_clust, avg_silhouette = round(avg_sil_pam, 4)))
|
||||||
|
cat("\nchosen best k for pam (max avg silhouette):", best_k_pam, "\n")
|
||||||
|
|
||||||
|
# run pam for best_k_pam/show silhouette plot
|
||||||
|
pam_final <- cluster::pam(all_scaled, k = best_k_pam)
|
||||||
|
factoextra::fviz_silhouette(pam_final) + ggtitle(paste("pam silhouette (k=", best_k_pam, ")", sep = ""))
|
||||||
|
|
||||||
|
# also plot cluster centers (2-d PCA scatter with cluster colors) for kmeans and pam
|
||||||
|
pca_res <- prcomp(all_scaled, center = TRUE, scale. = FALSE)
|
||||||
|
pcs <- data.frame(pca_res$x[, 1:2])
|
||||||
|
pcs$kmeans_cluster <- factor(km_final$cluster)
|
||||||
|
pcs$pam_cluster <- factor(pam_final$clustering)
|
||||||
|
|
||||||
|
# kmeans PCA plot
|
||||||
|
ggplot(pcs, aes(x = PC1, y = PC2, color = kmeans_cluster)) + geom_point(alpha = 0.6) + ggtitle(paste("kmeans clusters (k=", best_k_clust, ")", sep = ""))
|
||||||
|
|
||||||
|
# pam PCA plot
|
||||||
|
ggplot(pcs, aes(x = PC1, y = PC2, color = pam_cluster)) + geom_point(alpha = 0.6) + ggtitle(paste("pam clusters (k=", best_k_pam, ")", sep = ""))
|
||||||
|
|
||||||
|
# kill the pdf device
|
||||||
|
dev.off()
|
||||||
|
|
||||||
|
cat("plots and clustering/kNN visuals saved to", output_pdf, "\n")
|
||||||
|
cat("final chosen k for kNN:", best_k, "\n")
|
||||||
|
cat("final chosen k for kmeans:", best_k_clust, "\n")
|
||||||
|
cat("final chosen k for pam:", best_k_pam, "\n")
|
||||||
|
|
||||||
|
# yes I am lazy thx
|
||||||
|
summary_txt <- paste0(
|
||||||
|
"lab 3 results\n\n",
|
||||||
|
"chosen feature subset (by initial k=", k_init, "): ", chosen_tag, " features: ", paste(best_features, collapse = ", "), "\n",
|
||||||
|
"best k (k-NN tuning): ", best_k, " (accuracy = ", round(accuracy_by_k[best_k_idx], 4), ")\n",
|
||||||
|
"kmeans best k (silhouette): ", best_k_clust, "\n",
|
||||||
|
"pam best k (silhouette): ", best_k_pam, "\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
writeLines(summary_txt, con = "lab3_summary.txt")
|
||||||
Reference in New Issue
Block a user