added lab 4

This commit is contained in:
2025-10-31 17:55:13 -04:00
parent dc2ceac7de
commit 88f2975b86
66 changed files with 730 additions and 301 deletions
+122
View File
@@ -0,0 +1,122 @@
# Project-wide package bootstrap: install anything missing, then attach
# everything with startup messages suppressed. This file is source()d by
# every numbered script in the pipeline.
# TODO: organize this file better bc I just kinda dumped everything in here
suppressPackageStartupMessages({
  pkgs <- c("tidyverse", "readr", "readxl", "broom", "jsonlite", "ggplot2", "class", "optparse", "markdown")
  to_install <- pkgs[!pkgs %in% rownames(installed.packages())]
  if (length(to_install) > 0) {
    install.packages(to_install, repos = "https://cloud.r-project.org")
  }
  # invisible(): lapply's list of attach results is useless, and without it
  # the whole list echoes to the console when this file is run directly
  # (e.g. via Rscript) rather than source()d.
  invisible(lapply(pkgs, library, character.only = TRUE))
})
sanitize <- function(x) {
  # Make a string safe for use in file names: any run of characters that is
  # not alphanumeric, underscore, dot, or dash becomes a single underscore.
  unsafe_run <- "[^A-Za-z0-9_.-]+"
  gsub(unsafe_run, "_", x)
}
save_plot <- function(p, path, w = 7, h = 5, dpi = 160) {
  # Write a ggplot to disk, creating the destination directory on demand.
  target_dir <- dirname(path)
  dir.create(target_dir, recursive = TRUE, showWarnings = FALSE)
  ggplot2::ggsave(filename = path, plot = p, width = w, height = h, dpi = dpi)
}
hist_density_plot <- function(v, lbl) {
  # Histogram (on the density scale) overlaid with a kernel density curve,
  # labelled with `lbl` on the title and x axis.
  plot_data <- tibble(x = v)
  ggplot(plot_data, aes(x = x)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
    geom_density() +
    labs(
      title = paste("histogram + density:", lbl),
      x = lbl,
      y = "density"
    ) +
    theme_minimal()
}
box_plot <- function(v, lbl) {
  # Single vertical boxplot of `v`, labelled with `lbl`.
  plot_data <- tibble(x = v)
  ggplot(plot_data, aes(y = x)) +
    geom_boxplot(width = 0.3) +
    labs(
      title = paste("boxplot:", lbl),
      y = lbl,
      x = NULL
    ) +
    theme_minimal()
}
qq_two_sample <- function(a, b, title = "qq plot", n_q = NULL) {
  # Two-sample q-q plot: empirical quantiles of `a` (x axis) vs `b` (y axis)
  # with the y = x identity line. Returns NULL when there is too little
  # finite data to be meaningful.
  a <- a[is.finite(a)]
  b <- b[is.finite(b)]
  n <- min(length(a), length(b))
  # Nothing finite on one side: quantile() would return all-NA garbage.
  if (n == 0) return(NULL)
  if (is.null(n_q)) {
    # BUG FIX: the default used to be max(10, floor(n)), so the `n_q < 10`
    # guard below could never fire for default calls and very short inputs
    # still produced a (meaningless) plot. With fewer than 10 points per
    # side, skip the plot; otherwise use one quantile per observation,
    # which matches the old value whenever n >= 10.
    if (n < 10) return(NULL)
    n_q <- n
  }
  if (n_q < 10) return(NULL)
  probs <- seq(0.01, 0.99, length.out = n_q)
  qa <- quantile(a, probs, na.rm = TRUE, names = FALSE)
  qb <- quantile(b, probs, na.rm = TRUE, names = FALSE)
  # sort() is belt-and-braces: quantile() already returns nondecreasing values.
  d <- tibble(x = sort(qa), y = sort(qb))
  ggplot(d, aes(x, y)) +
    geom_point(size = 1.6) +
    geom_abline(slope = 1, intercept = 0) +
    labs(title = title, x = "region a quantiles", y = "region b quantiles") +
    theme_minimal()
}
tf_pos <- function(s) {
  # log1p-transform a strictly positive numeric vector (useful for skewed
  # quantities like GDP/population); otherwise return it unchanged.
  # Non-finite entries are ignored when testing positivity.
  s <- as.numeric(s)
  finite_vals <- s[is.finite(s)]
  if (any(finite_vals <= 0)) {
    return(s)
  }
  log1p(s)
}
strat_split <- function(d, label_col, test_prop = 0.25) {
  # Stratified train/test split: roughly `test_prop` of each level of
  # `label_col` goes to the test set, with at least one test row per level.
  # Rows with NA labels are dropped first (base-R subsetting, equivalent to
  # the previous tidyr::drop_na on that one column, and dependency-free).
  d <- d[!is.na(d[[label_col]]), , drop = FALSE]
  idx_tr <- integer(0)
  idx_te <- integer(0)
  for (lev in unique(d[[label_col]])) {
    rows <- which(d[[label_col]] == lev)
    nte <- max(1, floor(length(rows) * test_prop))
    # BUG FIX: sample(rows, nte) treats a length-1 `rows` as 1:rows and
    # samples from the wrong range (a random row index anywhere in the
    # data). Index into `rows` explicitly so singleton levels go to test.
    te <- rows[sample.int(length(rows), nte)]
    tr <- setdiff(rows, te)
    idx_tr <- c(idx_tr, tr)
    idx_te <- c(idx_te, te)
  }
  list(train = d[idx_tr, , drop = FALSE], test = d[idx_te, , drop = FALSE])
}
run_knn <- function(d, label_col, vars, k, tag, fig_dir) {
  # Fit a k-NN classifier predicting `label_col` from `vars`, evaluate it on
  # a stratified 25% hold-out, and save a confusion-matrix heatmap under
  # `fig_dir`. Returns a summary list (tag, k, vars, accuracy, figure path,
  # test-set size) for the report.
  keep_cols <- c(label_col, vars)
  use <- d |> dplyr::select(all_of(keep_cols)) |> tidyr::drop_na()
  split <- strat_split(use, label_col, 0.25)
  train_df <- split$train
  test_df <- split$test
  # Standardize features with TRAIN statistics only (no test leakage).
  centers <- sapply(train_df[, vars, drop = FALSE], mean, na.rm = TRUE)
  spreads <- sapply(train_df[, vars, drop = FALSE], sd, na.rm = TRUE)
  spreads[spreads == 0] <- 1  # constant columns: avoid divide-by-zero
  train_x <- scale(as.matrix(train_df[, vars, drop = FALSE]), center = centers, scale = spreads)
  test_x <- scale(as.matrix(test_df[, vars, drop = FALSE]), center = centers, scale = spreads)
  train_y <- factor(train_df[[label_col]])
  # Align test levels to train so the confusion table has matching axes.
  test_y <- factor(test_df[[label_col]], levels = levels(train_y))
  pred <- class::knn(train = train_x, test = test_x, cl = train_y, k = k)
  acc <- mean(pred == test_y, na.rm = TRUE)
  cm_long <- as.data.frame(table(truth = test_y, pred = pred))
  cm_fig <- file.path(fig_dir, paste0("knn_confusion_", sanitize(tag), ".png"))
  cm_plot <- ggplot(cm_long, aes(x = pred, y = truth, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq)) +
    labs(
      title = paste0("confusion matrix (k=", k, ") ", tag),
      x = "predicted", y = "true"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
  save_plot(cm_plot, cm_fig, w = 6, h = 5)
  list(
    tag = tag,
    k = k,
    vars = vars,
    accuracy = unname(acc),
    confusion_fig = cm_fig,
    n_test = nrow(test_df)
  )
}
+121
View File
@@ -0,0 +1,121 @@
# 01: resolve all analysis choices (dataset, columns, regions, predictor and
# k-NN feature sets) from CLI flags, falling back to auto-detection, and
# persist them to output/ctx.json for the downstream scripts.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
# TODO: stop hard-coding these absolute paths -- they only resolve on one machine
# NOTE: The options were generated by chatGPT from my horrendous hard-coded options
option_list <- list(
  optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"),
  optparse::make_option("--region-col", type = "character", default = NA),
  optparse::make_option("--region-a", type = "character", default = NA),
  optparse::make_option("--region-b", type = "character", default = NA),
  optparse::make_option("--response", type = "character", default = NA),
  optparse::make_option("--predictors", type = "character", default = NA),
  optparse::make_option("--knn1", type = "character", default = NA),
  optparse::make_option("--knn2", type = "character", default = NA),
  optparse::make_option("--k", type = "integer", default = 5)
)
# NOTE(review): R's optparse mirrors Python's and typically exposes
# `--region-col` as opt$region_col (dashes -> underscores). Confirm the
# backticked opt$`region-col` lookups later in this script actually resolve;
# if they are NULL, `!is.na(NULL)` yields logical(0) and those `if`s error.
opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
if (is.na(opt$data)) stop("--data is required")
read_any <- function(p) {
  # Read a tabular file, dispatching on its extension: delimited text via
  # readr, Excel workbooks via readxl. Anything else is an error.
  ext <- tolower(tools::file_ext(p))
  if (ext %in% c("xls", "xlsx")) {
    return(readxl::read_excel(p))
  }
  if (!ext %in% c("csv", "txt")) {
    stop("unsupported extension: ", ext)
  }
  suppressMessages(readr::read_csv(p, show_col_types = FALSE))
}
# Read the dataset once; all auto-detection below works off its column names.
df <- read_any(opt$data)
nms <- names(df)
find_col <- function(nms, pats) {
  # Return the first column name matching any of the regex patterns
  # (matched case-insensitively by lowercasing the names), trying patterns
  # in priority order; NA_character_ when nothing matches.
  # Uses base grepl() instead of stringr::str_detect -- same semantics,
  # consistent with the base-regex usage elsewhere in this project, and one
  # less package dependency.
  for (pat in pats) {
    idx <- which(grepl(pat, tolower(nms)))
    if (length(idx) > 0) return(nms[idx[1]])
  }
  NA_character_
}
# Region column: explicit flag wins, otherwise detect by name pattern.
# NOTE(review): if optparse maps --region-col to opt$region_col (dashes ->
# underscores, as in Python's optparse), opt$`region-col` is NULL and this
# condition becomes logical(0), which makes `if` error -- confirm against
# the installed optparse version.
region_col <- if (!is.na(opt$`region-col`)) opt$`region-col` else
find_col(nms, c("^region$", "regions?$", "world\\s*bank\\s*region"))
if (is.na(region_col)) stop("could not detect a region column; pass --region-col")
# Response variable priority: explicit flag > literal "EPI.new" column >
# first EPI/score-like column > first numeric column as a last resort.
response <- if (!is.na(opt$response)) opt$response else if ("EPI.new" %in% nms) {
"EPI.new"
} else {
find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
}
if (is.na(response)) {
num <- df |> dplyr::select(where(is.numeric)) |> names()
if (!length(num)) stop("no numeric columns; pass --response")
response <- num[1]
}
# Optional GDP / population columns for the default predictor sets.
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(nms, c("^pop", "^population$"))
# Default comparison regions: prefer the two named regions when present,
# otherwise fall back to the two most frequent levels of the region column.
counts <- sort(table(df[[region_col]]), decreasing = TRUE)
region_a <- if (!is.na(opt$`region-a`)) opt$`region-a` else
if ("Sub-Saharan Africa" %in% names(counts)) "Sub-Saharan Africa" else names(counts)[1]
region_b <- if (!is.na(opt$`region-b`)) opt$`region-b` else
if ("Latin America & Caribbean" %in% names(counts)) "Latin America & Caribbean" else names(counts)[2]
# Predictor sets for the OLS models: either the user-supplied
# comma-separated list, or auto-detected nested sets {gdp} and {gdp, pop}.
if (!is.na(opt$predictors)) {
  user_vars <- trimws(strsplit(opt$predictors, ",", fixed = TRUE)[[1]])
  pred_sets <- list(user_vars)
} else {
  auto_vars <- c(gdp_col, pop_col)
  auto_vars <- auto_vars[!is.na(auto_vars)]
  pred_sets <- list()
  if (length(auto_vars) >= 1) {
    pred_sets <- c(pred_sets, list(auto_vars[1]))
  }
  if (length(auto_vars) >= 2) {
    pred_sets <- c(pred_sets, list(auto_vars[1:2]))
  }
}
# Defensive: drop any empty sets so downstream loops can assume non-empty vars.
pred_sets <- pred_sets[lengths(pred_sets) > 0]
choose_knn_vars <- function(df, exclude, k = 3) {
  # Pick up to `k` numeric "*.new" columns (excluding the response) to use
  # as k-NN features, preferring columns with the least missingness and
  # breaking ties alphabetically for determinism.
  cands <- names(df)[endsWith(names(df), ".new") & names(df) != exclude]
  # FIX: vapply instead of sapply -- with zero candidates sapply returns
  # list(), which crashes the logical subset / order() below; vapply keeps
  # typed zero-length vectors so we fall through to character(0) cleanly.
  is_num <- vapply(cands, function(cn) is.numeric(df[[cn]]), logical(1))
  cands <- cands[is_num]
  miss <- vapply(cands, function(cn) mean(is.na(df[[cn]])), numeric(1))
  ord <- order(miss, cands)
  head(cands[ord], k)
}
# Feature sets for the two k-NN models: user-supplied lists win; otherwise
# model A gets the 3 least-missing "*.new" columns and model B the next 3.
knn1 <- if (!is.na(opt$knn1)) {
  strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  choose_knn_vars(df, response, 3)
}
knn2 <- if (!is.na(opt$knn2)) {
  strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  # BUG FIX: the old `[1:3]` padded the result with NA when fewer than
  # three candidates remained after setdiff, and those NAs later crash
  # select(all_of(...)) inside run_knn. head() never pads.
  head(setdiff(choose_knn_vars(df, response, 6), knn1), 3)
}
# Persist the resolved analysis context so the downstream scripts (02-06)
# all share the same choices.
ctx <- list(
  data = normalizePath(opt$data),
  region_col = region_col,
  response = response,
  region_a = region_a,
  region_b = region_b,
  predictors = pred_sets,
  knn1 = knn1,
  knn2 = knn2,
  k = opt$k,
  fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
  stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
)
ctx_path <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json"
# FIX: create the output directory first -- writeLines() does not create
# missing directories and would fail on a fresh checkout.
dir.create(dirname(ctx_path), recursive = TRUE, showWarnings = FALSE)
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), ctx_path)
message("wrote ctx.json")
+27
View File
@@ -0,0 +1,27 @@
# 02: distribution plots for the response in the two comparison regions:
# a boxplot and a histogram+density per region, plus a two-sample q-q plot.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Response values for each region, coerced to numeric.
va <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
vb <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_b) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
# Figure paths (region/response names sanitized for the filesystem).
box_a_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
box_b_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
hist_a_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
hist_b_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
qq_path <- file.path(ctx$fig_dir, paste0("qq_", sanitize(ctx$response), "_", sanitize(ctx$region_a), "_vs_", sanitize(ctx$region_b), ".png"))
save_plot(box_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), box_a_path)
save_plot(box_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), box_b_path)
save_plot(hist_density_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), hist_a_path)
save_plot(hist_density_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), hist_b_path)
# qq_two_sample returns NULL when there is too little data; the figure is
# skipped then. NOTE(review): qq_path is still recorded in ctx below even
# when the figure was never written -- the report may embed a dead link.
qp <- qq_two_sample(va, vb, paste0("qq plot: ", ctx$region_a, " vs ", ctx$region_b))
if (!is.null(qp)) save_plot(qp, qq_path)
# Record the figure paths in the shared context for the report script.
ctx$box_a <- box_a_path
ctx$box_b <- box_b_path
ctx$hist_a <- hist_a_path
ctx$hist_b <- hist_b_path
ctx$qq_fig <- qq_path
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("plots done")
+59
View File
@@ -0,0 +1,59 @@
# 03: OLS models of the response on each predictor set, fit on the full
# dataset, with per-model diagnostics written to the output folders.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Fit y_col ~ x1 + ... by OLS on complete cases and write diagnostics: a
# residuals-vs-fitted plot, a scatter of the first predictor against the
# response, and the text of summary(m). Returns a compact stats list for
# the report (note: the r-squared field is named `rsq`, not `r2`).
# NOTE(review): an identical copy of this helper exists in the region-level
# script -- consider moving it into 00_utils.R so the two cannot drift.
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
  # Keep only the model columns; drop incomplete rows.
  d <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
  # tf_pos log1p-transforms a predictor only when all of its finite values
  # are positive; the response stays on its original scale.
  for (xc in x_cols) d[[xc]] <- tf_pos(d[[xc]])
  f <- as.formula(paste(y_col, "~", paste(x_cols, collapse = " + ")))
  m <- lm(f, data = d)
  # residuals vs fitted
  res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
  p_res <- tibble(fitted = fitted(m), resid = resid(m)) |>
    ggplot(aes(fitted, resid)) +
    geom_point(size = 1.6) +
    geom_hline(yintercept = 0) +
    labs(title = paste("residuals vs fitted:", name_tag),
         x = "fitted", y = "residuals") +
    theme_minimal()
  save_plot(p_res, res_path)
  # scatter first predictor vs response (shown on the transformed scale)
  first <- x_cols[1]
  sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
  p_sc <- d |>
    ggplot(aes(.data[[first]], .data[[y_col]])) +
    geom_point(size = 1.6) +
    labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
    theme_minimal()
  save_plot(p_sc, sc_path)
  # summary to text
  summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
  capture.output(summary(m), file = summ_path)
  gl <- broom::glance(m)
  list(
    name = name_tag,
    rsq = unname(gl$r.squared),
    aic = unname(gl$AIC),
    bic = unname(gl$BIC),
    nobs = stats::nobs(m),
    summary_file = summ_path,
    residuals_fig = res_path,
    scatter_fig = sc_path
  )
}
# Fit one full-dataset OLS model per predictor set and stash the stats.
# NOTE(review): ctx came through a JSON round-trip; jsonlite::fromJSON can
# simplify `predictors` (e.g. a single multi-variable set may come back as
# a matrix), in which case `for (p in ...)` would iterate scalars -- confirm
# the shape with one / several predictor sets.
ols <- list()
if (length(ctx$predictors)) {
  for (p in ctx$predictors) {
    tag <- paste0("full: ", ctx$response, " ~ ", paste(p, collapse = " + "))
    ols <- append(ols, list(fit_ols(df, ctx$response, p, tag, ctx$fig_dir, ctx$stats_dir)))
  }
}
ctx$ols <- ols
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("ols (full) done")
+72
View File
@@ -0,0 +1,72 @@
# 04: the same OLS models, restricted to region A, plus a note naming the
# best region-level model for the report.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Only rows belonging to region A.
reg_df <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a)
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
  # Fit y_col ~ x_cols by OLS (positive predictors are log1p-transformed
  # via tf_pos), save diagnostics (residual plot, first-predictor scatter,
  # text summary), and return a compact stats list for the report.
  model_df <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
  for (xc in x_cols) {
    model_df[[xc]] <- tf_pos(model_df[[xc]])
  }
  rhs <- paste(x_cols, collapse = " + ")
  m <- lm(as.formula(paste(y_col, "~", rhs)), data = model_df)
  # Residuals-vs-fitted diagnostic.
  res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
  diag_df <- tibble(fitted = fitted(m), resid = resid(m))
  p_res <- ggplot(diag_df, aes(fitted, resid)) +
    geom_point(size = 1.6) +
    geom_hline(yintercept = 0) +
    labs(title = paste("residuals vs fitted:", name_tag),
         x = "fitted", y = "residuals") +
    theme_minimal()
  save_plot(p_res, res_path)
  # Scatter of the first predictor against the response (post-transform).
  first <- x_cols[1]
  sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
  p_sc <- ggplot(model_df, aes(.data[[first]], .data[[y_col]])) +
    geom_point(size = 1.6) +
    labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
    theme_minimal()
  save_plot(p_sc, sc_path)
  # Full lm() summary as a text artifact.
  summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
  capture.output(summary(m), file = summ_path)
  fit_stats <- broom::glance(m)
  list(
    name = name_tag,
    rsq = unname(fit_stats$r.squared),
    aic = unname(fit_stats$AIC),
    bic = unname(fit_stats$BIC),
    nobs = stats::nobs(m),
    summary_file = summ_path,
    residuals_fig = res_path,
    scatter_fig = sc_path
  )
}
# Re-fit the same predictor sets, restricted to region A only.
region_models <- list()
if (length(ctx$predictors) > 0) {
  for (pset in ctx$predictors) {
    tag <- paste0("region ", ctx$region_a, ": ", ctx$response, " ~ ", paste(pset, collapse = " + "))
    fit <- fit_ols(reg_df, ctx$response, pset, tag, ctx$fig_dir, ctx$stats_dir)
    region_models <- append(region_models, list(fit))
  }
}
best_note <- "no region-level comparison available."
if (length(region_models) >= 1) {
  # Rank models best-first: highest r-squared, ties broken by lowest AIC,
  # then lowest BIC. Under decreasing = TRUE, negating aic/bic flips their
  # sort direction so "lower is better" still wins.
  # FIX: vapply replaces sapply for type-stable numeric extraction.
  rsq_v <- vapply(region_models, function(m) m$rsq, numeric(1))
  aic_v <- vapply(region_models, function(m) m$aic, numeric(1))
  bic_v <- vapply(region_models, function(m) m$bic, numeric(1))
  ord <- order(rsq_v, -aic_v, -bic_v, decreasing = TRUE)
  best <- region_models[[ord[1]]]
  best_note <- sprintf(
    "on region `%s`, the better model is **%s** (r²=%.3f, aic=%.1f, bic=%.1f).",
    ctx$region_a, best$name, best$rsq, best$aic, best$bic
  )
}
ctx$best_region_note <- best_note
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("ols (region) done")
+11
View File
@@ -0,0 +1,11 @@
# 05: k-NN classification of region from the two chosen feature sets
# ("model A" and "model B"), recording accuracy and confusion figures.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Fixed seed so the stratified split (and thus accuracy) is reproducible.
set.seed(42)
knn1 <- run_knn(df, ctx$region_col, ctx$knn1, ctx$k, "model A", ctx$fig_dir)
knn2 <- run_knn(df, ctx$region_col, ctx$knn2, ctx$k, "model B", ctx$fig_dir)
ctx$knn <- list(knn1, knn2)
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("knn done")
+89
View File
@@ -0,0 +1,89 @@
# 06: assemble the markdown report from everything recorded in ctx and
# render it to HTML at the end of this script.
library(markdown)
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
# The report is built as a character vector, one markdown line per element.
md <- c(
  "# exploratory data analysis and models on the epi dataset",
  # NOTE(review): markdown hard line breaks need TWO trailing spaces; this
  # appends only one.
  paste0("date: ", as.character(Sys.Date()), " "),
  "",
  "## dataset and choices",
  # analysis choices as resolved by the context script
  paste0("- **file**: `", basename(ctx$data), "`"),
  paste0("- **region column**: `", ctx$region_col, "`"),
  paste0("- **response var**: `", ctx$response, "`"),
  paste0("- **regions**: `", ctx$region_a, "` vs `", ctx$region_b, "`"),
  "",
  "## 1) variable distributions",
  "### 1.1 boxplots and histograms (with density!)",
  paste0("![](", ctx$box_a, ")"),
  paste0("![](", ctx$box_b, ")"),
  paste0("![](", ctx$hist_a, ")"),
  paste0("![](", ctx$hist_b, ")"),
  "",
  "### 1.2 qq plot (two-sample)",
  paste0("![](", ctx$qq_fig, ")"),
  "",
  "## 2) linear models"
)
# Normalize ctx entries: jsonlite::fromJSON may hand a list of records back
# as a data.frame (or a plain list), so coerce both shapes to a list of lists.
row_list <- function(x) {
  # Coerce a ctx entry to a uniform list-of-records shape: a data.frame
  # becomes one named list per row, a list passes through unchanged, and
  # anything else (including NULL) becomes an empty list.
  if (is.data.frame(x)) {
    return(lapply(seq_len(nrow(x)), function(r) as.list(x[r, , drop = FALSE])))
  }
  if (is.list(x)) {
    return(x)
  }
  list()
}
# Section 2: one subsection per fitted OLS model with its fit statistics.
if (!is.null(ctx$ols) && length(ctx$ols)) {
  for (m in row_list(ctx$ols)) {
    md <- c(md,
      paste0("### ", m$name),
      # BUG FIX: the field written by fit_ols is `rsq`, not `r2`; m$r2 was
      # NULL ($ does not partial-match "r2" to "rsq"), and sprintf() with a
      # zero-length argument yields character(0), so the stats line was
      # silently dropped from the report.
      sprintf("- **r²**: %.4f | **aic**: %.2f | **bic**: %.2f", m$rsq, m$aic, m$bic),
      ""
    )
  }
}
# Section 2.2 (region-level comparison note) and the section 3 header.
md <- c(md,
  "### 2.2 same models on one region (comparison)",
  if (!is.null(ctx$best_region_note)) ctx$best_region_note else "no note available.",
  "",
  "## 3) classification (knn, label = region)"
)
# Section 3: one subsection per k-NN model with accuracy and the confusion
# matrix figure.
if (!is.null(ctx$knn) && length(ctx$knn)) {
  for (k in row_list(ctx$knn)) {
    md <- c(md,
      paste0("### ", k$tag),
      # NOTE(review): sprintf's %d errors on non-integer doubles; k$k and
      # k$n_test come back through jsonlite (which parses whole numbers as
      # integers), but confirm -- %s or format() would be safer here.
      sprintf("- **k**: %d | **accuracy**: %.4f | **test n**: %d",
              k$k, k$accuracy, k$n_test),
      paste0("variables: `", paste(k$vars, collapse = ", "), "`"),
      paste0("![](", k$confusion_fig, ")"),
      ""
    )
  }
}
# Make image links relative: the figures live under output/ and the report
# is written into output/ too, so strip the absolute prefix.
md <- gsub("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/", "", md)
md_file <- "output/report.md"
html_file <- "output/report.html"
pdf_file <- "output/report.pdf"  # not generated below; kept for a future export step
setwd("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/")
# BUG FIX: the markdown body was assembled but never written (the
# writeLines call had been commented out), so markdownToHTML() rendered a
# stale report.md from a previous run -- or failed outright on a fresh
# checkout. Write the report before converting it.
writeLines(md, md_file)
markdownToHTML(
  md_file,
  html_file
)
message("done")