added lab 4

This commit is contained in:
2025-10-31 17:55:13 -04:00
parent dc2ceac7de
commit 88f2975b86
66 changed files with 730 additions and 301 deletions
+122
View File
@@ -0,0 +1,122 @@
# Project-wide package bootstrap: install anything missing, then attach
# everything with startup messages suppressed. This file is source()d by
# every numbered script in the pipeline.
# TODO: organize this file better bc I just kinda dumped everything in here
suppressPackageStartupMessages({
  pkgs <- c("tidyverse", "readr", "readxl", "broom", "jsonlite", "ggplot2", "class", "optparse", "markdown")
  to_install <- pkgs[!pkgs %in% rownames(installed.packages())]
  if (length(to_install) > 0) {
    install.packages(to_install, repos = "https://cloud.r-project.org")
  }
  # invisible(): lapply's list of attach results is useless, and without it
  # the whole list echoes to the console when this file is run directly
  # (e.g. via Rscript) rather than source()d.
  invisible(lapply(pkgs, library, character.only = TRUE))
})
sanitize <- function(x) {
  # Make a string safe for use in file names: any run of characters that is
  # not alphanumeric, underscore, dot, or dash becomes a single underscore.
  unsafe_run <- "[^A-Za-z0-9_.-]+"
  gsub(unsafe_run, "_", x)
}
save_plot <- function(p, path, w = 7, h = 5, dpi = 160) {
  # Write a ggplot to disk, creating the destination directory on demand.
  target_dir <- dirname(path)
  dir.create(target_dir, recursive = TRUE, showWarnings = FALSE)
  ggplot2::ggsave(filename = path, plot = p, width = w, height = h, dpi = dpi)
}
hist_density_plot <- function(v, lbl) {
  # Histogram (on the density scale) overlaid with a kernel density curve,
  # labelled with `lbl` on the title and x axis.
  plot_data <- tibble(x = v)
  ggplot(plot_data, aes(x = x)) +
    geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
    geom_density() +
    labs(
      title = paste("histogram + density:", lbl),
      x = lbl,
      y = "density"
    ) +
    theme_minimal()
}
box_plot <- function(v, lbl) {
  # Single vertical boxplot of `v`, labelled with `lbl`.
  plot_data <- tibble(x = v)
  ggplot(plot_data, aes(y = x)) +
    geom_boxplot(width = 0.3) +
    labs(
      title = paste("boxplot:", lbl),
      y = lbl,
      x = NULL
    ) +
    theme_minimal()
}
qq_two_sample <- function(a, b, title = "qq plot", n_q = NULL) {
  # Two-sample q-q plot: empirical quantiles of `a` (x axis) vs `b` (y axis)
  # with the y = x identity line. Returns NULL when there is too little
  # finite data to be meaningful.
  a <- a[is.finite(a)]
  b <- b[is.finite(b)]
  n <- min(length(a), length(b))
  # Nothing finite on one side: quantile() would return all-NA garbage.
  if (n == 0) return(NULL)
  if (is.null(n_q)) {
    # BUG FIX: the default used to be max(10, floor(n)), so the `n_q < 10`
    # guard below could never fire for default calls and very short inputs
    # still produced a (meaningless) plot. With fewer than 10 points per
    # side, skip the plot; otherwise use one quantile per observation,
    # which matches the old value whenever n >= 10.
    if (n < 10) return(NULL)
    n_q <- n
  }
  if (n_q < 10) return(NULL)
  probs <- seq(0.01, 0.99, length.out = n_q)
  qa <- quantile(a, probs, na.rm = TRUE, names = FALSE)
  qb <- quantile(b, probs, na.rm = TRUE, names = FALSE)
  # sort() is belt-and-braces: quantile() already returns nondecreasing values.
  d <- tibble(x = sort(qa), y = sort(qb))
  ggplot(d, aes(x, y)) +
    geom_point(size = 1.6) +
    geom_abline(slope = 1, intercept = 0) +
    labs(title = title, x = "region a quantiles", y = "region b quantiles") +
    theme_minimal()
}
tf_pos <- function(s) {
  # log1p-transform a strictly positive numeric vector (useful for skewed
  # quantities like GDP/population); otherwise return it unchanged.
  # Non-finite entries are ignored when testing positivity.
  s <- as.numeric(s)
  finite_vals <- s[is.finite(s)]
  if (any(finite_vals <= 0)) {
    return(s)
  }
  log1p(s)
}
strat_split <- function(d, label_col, test_prop = 0.25) {
  # Stratified train/test split: roughly `test_prop` of each level of
  # `label_col` goes to the test set, with at least one test row per level.
  # Rows with NA labels are dropped first (base-R subsetting, equivalent to
  # the previous tidyr::drop_na on that one column, and dependency-free).
  d <- d[!is.na(d[[label_col]]), , drop = FALSE]
  idx_tr <- integer(0)
  idx_te <- integer(0)
  for (lev in unique(d[[label_col]])) {
    rows <- which(d[[label_col]] == lev)
    nte <- max(1, floor(length(rows) * test_prop))
    # BUG FIX: sample(rows, nte) treats a length-1 `rows` as 1:rows and
    # samples from the wrong range (a random row index anywhere in the
    # data). Index into `rows` explicitly so singleton levels go to test.
    te <- rows[sample.int(length(rows), nte)]
    tr <- setdiff(rows, te)
    idx_tr <- c(idx_tr, tr)
    idx_te <- c(idx_te, te)
  }
  list(train = d[idx_tr, , drop = FALSE], test = d[idx_te, , drop = FALSE])
}
run_knn <- function(d, label_col, vars, k, tag, fig_dir) {
  # Fit a k-NN classifier predicting `label_col` from `vars`, evaluate it on
  # a stratified 25% hold-out, and save a confusion-matrix heatmap under
  # `fig_dir`. Returns a summary list (tag, k, vars, accuracy, figure path,
  # test-set size) for the report.
  keep_cols <- c(label_col, vars)
  use <- d |> dplyr::select(all_of(keep_cols)) |> tidyr::drop_na()
  split <- strat_split(use, label_col, 0.25)
  train_df <- split$train
  test_df <- split$test
  # Standardize features with TRAIN statistics only (no test leakage).
  centers <- sapply(train_df[, vars, drop = FALSE], mean, na.rm = TRUE)
  spreads <- sapply(train_df[, vars, drop = FALSE], sd, na.rm = TRUE)
  spreads[spreads == 0] <- 1  # constant columns: avoid divide-by-zero
  train_x <- scale(as.matrix(train_df[, vars, drop = FALSE]), center = centers, scale = spreads)
  test_x <- scale(as.matrix(test_df[, vars, drop = FALSE]), center = centers, scale = spreads)
  train_y <- factor(train_df[[label_col]])
  # Align test levels to train so the confusion table has matching axes.
  test_y <- factor(test_df[[label_col]], levels = levels(train_y))
  pred <- class::knn(train = train_x, test = test_x, cl = train_y, k = k)
  acc <- mean(pred == test_y, na.rm = TRUE)
  cm_long <- as.data.frame(table(truth = test_y, pred = pred))
  cm_fig <- file.path(fig_dir, paste0("knn_confusion_", sanitize(tag), ".png"))
  cm_plot <- ggplot(cm_long, aes(x = pred, y = truth, fill = Freq)) +
    geom_tile() +
    geom_text(aes(label = Freq)) +
    labs(
      title = paste0("confusion matrix (k=", k, ") ", tag),
      x = "predicted", y = "true"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
  save_plot(cm_plot, cm_fig, w = 6, h = 5)
  list(
    tag = tag,
    k = k,
    vars = vars,
    accuracy = unname(acc),
    confusion_fig = cm_fig,
    n_test = nrow(test_df)
  )
}
+121
View File
@@ -0,0 +1,121 @@
# 01: resolve all analysis choices (dataset, columns, regions, predictor and
# k-NN feature sets) from CLI flags, falling back to auto-detection, and
# persist them to output/ctx.json for the downstream scripts.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
# TODO: stop hard-coding these absolute paths -- they only resolve on one machine
# NOTE: The options were generated by chatGPT from my horrendous hard-coded options
option_list <- list(
  optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"),
  optparse::make_option("--region-col", type = "character", default = NA),
  optparse::make_option("--region-a", type = "character", default = NA),
  optparse::make_option("--region-b", type = "character", default = NA),
  optparse::make_option("--response", type = "character", default = NA),
  optparse::make_option("--predictors", type = "character", default = NA),
  optparse::make_option("--knn1", type = "character", default = NA),
  optparse::make_option("--knn2", type = "character", default = NA),
  optparse::make_option("--k", type = "integer", default = 5)
)
# NOTE(review): R's optparse mirrors Python's and typically exposes
# `--region-col` as opt$region_col (dashes -> underscores). Confirm the
# backticked opt$`region-col` lookups later in this script actually resolve;
# if they are NULL, `!is.na(NULL)` yields logical(0) and those `if`s error.
opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
if (is.na(opt$data)) stop("--data is required")
read_any <- function(p) {
  # Read a tabular file, dispatching on its extension: delimited text via
  # readr, Excel workbooks via readxl. Anything else is an error.
  ext <- tolower(tools::file_ext(p))
  if (ext %in% c("xls", "xlsx")) {
    return(readxl::read_excel(p))
  }
  if (!ext %in% c("csv", "txt")) {
    stop("unsupported extension: ", ext)
  }
  suppressMessages(readr::read_csv(p, show_col_types = FALSE))
}
# Read the dataset once; all auto-detection below works off its column names.
df <- read_any(opt$data)
nms <- names(df)
find_col <- function(nms, pats) {
  # Return the first column name matching any of the regex patterns
  # (matched case-insensitively by lowercasing the names), trying patterns
  # in priority order; NA_character_ when nothing matches.
  # Uses base grepl() instead of stringr::str_detect -- same semantics,
  # consistent with the base-regex usage elsewhere in this project, and one
  # less package dependency.
  for (pat in pats) {
    idx <- which(grepl(pat, tolower(nms)))
    if (length(idx) > 0) return(nms[idx[1]])
  }
  NA_character_
}
# Region column: explicit flag wins, otherwise detect by name pattern.
# NOTE(review): if optparse maps --region-col to opt$region_col (dashes ->
# underscores, as in Python's optparse), opt$`region-col` is NULL and this
# condition becomes logical(0), which makes `if` error -- confirm against
# the installed optparse version.
region_col <- if (!is.na(opt$`region-col`)) opt$`region-col` else
find_col(nms, c("^region$", "regions?$", "world\\s*bank\\s*region"))
if (is.na(region_col)) stop("could not detect a region column; pass --region-col")
# Response variable priority: explicit flag > literal "EPI.new" column >
# first EPI/score-like column > first numeric column as a last resort.
response <- if (!is.na(opt$response)) opt$response else if ("EPI.new" %in% nms) {
"EPI.new"
} else {
find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
}
if (is.na(response)) {
num <- df |> dplyr::select(where(is.numeric)) |> names()
if (!length(num)) stop("no numeric columns; pass --response")
response <- num[1]
}
# Optional GDP / population columns for the default predictor sets.
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(nms, c("^pop", "^population$"))
# Default comparison regions: prefer the two named regions when present,
# otherwise fall back to the two most frequent levels of the region column.
counts <- sort(table(df[[region_col]]), decreasing = TRUE)
region_a <- if (!is.na(opt$`region-a`)) opt$`region-a` else
if ("Sub-Saharan Africa" %in% names(counts)) "Sub-Saharan Africa" else names(counts)[1]
region_b <- if (!is.na(opt$`region-b`)) opt$`region-b` else
if ("Latin America & Caribbean" %in% names(counts)) "Latin America & Caribbean" else names(counts)[2]
# Predictor sets for the OLS models: either the user-supplied
# comma-separated list, or auto-detected nested sets {gdp} and {gdp, pop}.
if (!is.na(opt$predictors)) {
  user_vars <- trimws(strsplit(opt$predictors, ",", fixed = TRUE)[[1]])
  pred_sets <- list(user_vars)
} else {
  auto_vars <- c(gdp_col, pop_col)
  auto_vars <- auto_vars[!is.na(auto_vars)]
  pred_sets <- list()
  if (length(auto_vars) >= 1) {
    pred_sets <- c(pred_sets, list(auto_vars[1]))
  }
  if (length(auto_vars) >= 2) {
    pred_sets <- c(pred_sets, list(auto_vars[1:2]))
  }
}
# Defensive: drop any empty sets so downstream loops can assume non-empty vars.
pred_sets <- pred_sets[lengths(pred_sets) > 0]
choose_knn_vars <- function(df, exclude, k = 3) {
  # Pick up to `k` numeric "*.new" columns (excluding the response) to use
  # as k-NN features, preferring columns with the least missingness and
  # breaking ties alphabetically for determinism.
  cands <- names(df)[endsWith(names(df), ".new") & names(df) != exclude]
  # FIX: vapply instead of sapply -- with zero candidates sapply returns
  # list(), which crashes the logical subset / order() below; vapply keeps
  # typed zero-length vectors so we fall through to character(0) cleanly.
  is_num <- vapply(cands, function(cn) is.numeric(df[[cn]]), logical(1))
  cands <- cands[is_num]
  miss <- vapply(cands, function(cn) mean(is.na(df[[cn]])), numeric(1))
  ord <- order(miss, cands)
  head(cands[ord], k)
}
# Feature sets for the two k-NN models: user-supplied lists win; otherwise
# model A gets the 3 least-missing "*.new" columns and model B the next 3.
knn1 <- if (!is.na(opt$knn1)) {
  strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  choose_knn_vars(df, response, 3)
}
knn2 <- if (!is.na(opt$knn2)) {
  strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  # BUG FIX: the old `[1:3]` padded the result with NA when fewer than
  # three candidates remained after setdiff, and those NAs later crash
  # select(all_of(...)) inside run_knn. head() never pads.
  head(setdiff(choose_knn_vars(df, response, 6), knn1), 3)
}
# Persist the resolved analysis context so the downstream scripts (02-06)
# all share the same choices.
ctx <- list(
  data = normalizePath(opt$data),
  region_col = region_col,
  response = response,
  region_a = region_a,
  region_b = region_b,
  predictors = pred_sets,
  knn1 = knn1,
  knn2 = knn2,
  k = opt$k,
  fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
  stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
)
ctx_path <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json"
# FIX: create the output directory first -- writeLines() does not create
# missing directories and would fail on a fresh checkout.
dir.create(dirname(ctx_path), recursive = TRUE, showWarnings = FALSE)
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), ctx_path)
message("wrote ctx.json")
+27
View File
@@ -0,0 +1,27 @@
# 02: distribution plots for the response in the two comparison regions:
# a boxplot and a histogram+density per region, plus a two-sample q-q plot.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Response values for each region, coerced to numeric.
va <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
vb <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_b) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
# Figure paths (region/response names sanitized for the filesystem).
box_a_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
box_b_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
hist_a_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
hist_b_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
qq_path <- file.path(ctx$fig_dir, paste0("qq_", sanitize(ctx$response), "_", sanitize(ctx$region_a), "_vs_", sanitize(ctx$region_b), ".png"))
save_plot(box_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), box_a_path)
save_plot(box_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), box_b_path)
save_plot(hist_density_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), hist_a_path)
save_plot(hist_density_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), hist_b_path)
# qq_two_sample returns NULL when there is too little data; the figure is
# skipped then. NOTE(review): qq_path is still recorded in ctx below even
# when the figure was never written -- the report may embed a dead link.
qp <- qq_two_sample(va, vb, paste0("qq plot: ", ctx$region_a, " vs ", ctx$region_b))
if (!is.null(qp)) save_plot(qp, qq_path)
# Record the figure paths in the shared context for the report script.
ctx$box_a <- box_a_path
ctx$box_b <- box_b_path
ctx$hist_a <- hist_a_path
ctx$hist_b <- hist_b_path
ctx$qq_fig <- qq_path
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("plots done")
+59
View File
@@ -0,0 +1,59 @@
# 03: OLS models of the response on each predictor set, fit on the full
# dataset, with per-model diagnostics written to the output folders.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Fit y_col ~ x1 + ... by OLS on complete cases and write diagnostics: a
# residuals-vs-fitted plot, a scatter of the first predictor against the
# response, and the text of summary(m). Returns a compact stats list for
# the report (note: the r-squared field is named `rsq`, not `r2`).
# NOTE(review): an identical copy of this helper exists in the region-level
# script -- consider moving it into 00_utils.R so the two cannot drift.
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
  # Keep only the model columns; drop incomplete rows.
  d <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
  # tf_pos log1p-transforms a predictor only when all of its finite values
  # are positive; the response stays on its original scale.
  for (xc in x_cols) d[[xc]] <- tf_pos(d[[xc]])
  f <- as.formula(paste(y_col, "~", paste(x_cols, collapse = " + ")))
  m <- lm(f, data = d)
  # residuals vs fitted
  res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
  p_res <- tibble(fitted = fitted(m), resid = resid(m)) |>
    ggplot(aes(fitted, resid)) +
    geom_point(size = 1.6) +
    geom_hline(yintercept = 0) +
    labs(title = paste("residuals vs fitted:", name_tag),
         x = "fitted", y = "residuals") +
    theme_minimal()
  save_plot(p_res, res_path)
  # scatter first predictor vs response (shown on the transformed scale)
  first <- x_cols[1]
  sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
  p_sc <- d |>
    ggplot(aes(.data[[first]], .data[[y_col]])) +
    geom_point(size = 1.6) +
    labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
    theme_minimal()
  save_plot(p_sc, sc_path)
  # summary to text
  summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
  capture.output(summary(m), file = summ_path)
  gl <- broom::glance(m)
  list(
    name = name_tag,
    rsq = unname(gl$r.squared),
    aic = unname(gl$AIC),
    bic = unname(gl$BIC),
    nobs = stats::nobs(m),
    summary_file = summ_path,
    residuals_fig = res_path,
    scatter_fig = sc_path
  )
}
# Fit one full-dataset OLS model per predictor set and stash the stats.
# NOTE(review): ctx came through a JSON round-trip; jsonlite::fromJSON can
# simplify `predictors` (e.g. a single multi-variable set may come back as
# a matrix), in which case `for (p in ...)` would iterate scalars -- confirm
# the shape with one / several predictor sets.
ols <- list()
if (length(ctx$predictors)) {
  for (p in ctx$predictors) {
    tag <- paste0("full: ", ctx$response, " ~ ", paste(p, collapse = " + "))
    ols <- append(ols, list(fit_ols(df, ctx$response, p, tag, ctx$fig_dir, ctx$stats_dir)))
  }
}
ctx$ols <- ols
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("ols (full) done")
+72
View File
@@ -0,0 +1,72 @@
# 04: the same OLS models, restricted to region A, plus a note naming the
# best region-level model for the report.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Only rows belonging to region A.
reg_df <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a)
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
  # Fit y_col ~ x_cols by OLS (positive predictors are log1p-transformed
  # via tf_pos), save diagnostics (residual plot, first-predictor scatter,
  # text summary), and return a compact stats list for the report.
  model_df <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
  for (xc in x_cols) {
    model_df[[xc]] <- tf_pos(model_df[[xc]])
  }
  rhs <- paste(x_cols, collapse = " + ")
  m <- lm(as.formula(paste(y_col, "~", rhs)), data = model_df)
  # Residuals-vs-fitted diagnostic.
  res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
  diag_df <- tibble(fitted = fitted(m), resid = resid(m))
  p_res <- ggplot(diag_df, aes(fitted, resid)) +
    geom_point(size = 1.6) +
    geom_hline(yintercept = 0) +
    labs(title = paste("residuals vs fitted:", name_tag),
         x = "fitted", y = "residuals") +
    theme_minimal()
  save_plot(p_res, res_path)
  # Scatter of the first predictor against the response (post-transform).
  first <- x_cols[1]
  sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
  p_sc <- ggplot(model_df, aes(.data[[first]], .data[[y_col]])) +
    geom_point(size = 1.6) +
    labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
    theme_minimal()
  save_plot(p_sc, sc_path)
  # Full lm() summary as a text artifact.
  summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
  capture.output(summary(m), file = summ_path)
  fit_stats <- broom::glance(m)
  list(
    name = name_tag,
    rsq = unname(fit_stats$r.squared),
    aic = unname(fit_stats$AIC),
    bic = unname(fit_stats$BIC),
    nobs = stats::nobs(m),
    summary_file = summ_path,
    residuals_fig = res_path,
    scatter_fig = sc_path
  )
}
# Re-fit the same predictor sets, restricted to region A only.
region_models <- list()
if (length(ctx$predictors) > 0) {
  for (pset in ctx$predictors) {
    tag <- paste0("region ", ctx$region_a, ": ", ctx$response, " ~ ", paste(pset, collapse = " + "))
    fit <- fit_ols(reg_df, ctx$response, pset, tag, ctx$fig_dir, ctx$stats_dir)
    region_models <- append(region_models, list(fit))
  }
}
best_note <- "no region-level comparison available."
if (length(region_models) >= 1) {
  # Rank models best-first: highest r-squared, ties broken by lowest AIC,
  # then lowest BIC. Under decreasing = TRUE, negating aic/bic flips their
  # sort direction so "lower is better" still wins.
  # FIX: vapply replaces sapply for type-stable numeric extraction.
  rsq_v <- vapply(region_models, function(m) m$rsq, numeric(1))
  aic_v <- vapply(region_models, function(m) m$aic, numeric(1))
  bic_v <- vapply(region_models, function(m) m$bic, numeric(1))
  ord <- order(rsq_v, -aic_v, -bic_v, decreasing = TRUE)
  best <- region_models[[ord[1]]]
  best_note <- sprintf(
    "on region `%s`, the better model is **%s** (r²=%.3f, aic=%.1f, bic=%.1f).",
    ctx$region_a, best$name, best$rsq, best$aic, best$bic
  )
}
ctx$best_region_note <- best_note
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("ols (region) done")
+11
View File
@@ -0,0 +1,11 @@
# 05: k-NN classification of region from the two chosen feature sets
# ("model A" and "model B"), recording accuracy and confusion figures.
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
# Fixed seed so the stratified split (and thus accuracy) is reproducible.
set.seed(42)
knn1 <- run_knn(df, ctx$region_col, ctx$knn1, ctx$k, "model A", ctx$fig_dir)
knn2 <- run_knn(df, ctx$region_col, ctx$knn2, ctx$k, "model B", ctx$fig_dir)
ctx$knn <- list(knn1, knn2)
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("knn done")
+89
View File
@@ -0,0 +1,89 @@
# 06: assemble the markdown report from everything recorded in ctx and
# render it to HTML at the end of this script.
library(markdown)
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
# The report is built as a character vector, one markdown line per element.
md <- c(
  "# exploratory data analysis and models on the epi dataset",
  # NOTE(review): markdown hard line breaks need TWO trailing spaces; this
  # appends only one.
  paste0("date: ", as.character(Sys.Date()), " "),
  "",
  "## dataset and choices",
  # analysis choices as resolved by the context script
  paste0("- **file**: `", basename(ctx$data), "`"),
  paste0("- **region column**: `", ctx$region_col, "`"),
  paste0("- **response var**: `", ctx$response, "`"),
  paste0("- **regions**: `", ctx$region_a, "` vs `", ctx$region_b, "`"),
  "",
  "## 1) variable distributions",
  "### 1.1 boxplots and histograms (with density!)",
  paste0("![](", ctx$box_a, ")"),
  paste0("![](", ctx$box_b, ")"),
  paste0("![](", ctx$hist_a, ")"),
  paste0("![](", ctx$hist_b, ")"),
  "",
  "### 1.2 qq plot (two-sample)",
  paste0("![](", ctx$qq_fig, ")"),
  "",
  "## 2) linear models"
)
# Normalize ctx entries: jsonlite::fromJSON may hand a list of records back
# as a data.frame (or a plain list), so coerce both shapes to a list of lists.
row_list <- function(x) {
  # Coerce a ctx entry to a uniform list-of-records shape: a data.frame
  # becomes one named list per row, a list passes through unchanged, and
  # anything else (including NULL) becomes an empty list.
  if (is.data.frame(x)) {
    return(lapply(seq_len(nrow(x)), function(r) as.list(x[r, , drop = FALSE])))
  }
  if (is.list(x)) {
    return(x)
  }
  list()
}
# Section 2: one subsection per fitted OLS model with its fit statistics.
if (!is.null(ctx$ols) && length(ctx$ols)) {
  for (m in row_list(ctx$ols)) {
    md <- c(md,
      paste0("### ", m$name),
      # BUG FIX: the field written by fit_ols is `rsq`, not `r2`; m$r2 was
      # NULL ($ does not partial-match "r2" to "rsq"), and sprintf() with a
      # zero-length argument yields character(0), so the stats line was
      # silently dropped from the report.
      sprintf("- **r²**: %.4f | **aic**: %.2f | **bic**: %.2f", m$rsq, m$aic, m$bic),
      ""
    )
  }
}
# Section 2.2 (region-level comparison note) and the section 3 header.
md <- c(md,
  "### 2.2 same models on one region (comparison)",
  if (!is.null(ctx$best_region_note)) ctx$best_region_note else "no note available.",
  "",
  "## 3) classification (knn, label = region)"
)
# Section 3: one subsection per k-NN model with accuracy and the confusion
# matrix figure.
if (!is.null(ctx$knn) && length(ctx$knn)) {
  for (k in row_list(ctx$knn)) {
    md <- c(md,
      paste0("### ", k$tag),
      # NOTE(review): sprintf's %d errors on non-integer doubles; k$k and
      # k$n_test come back through jsonlite (which parses whole numbers as
      # integers), but confirm -- %s or format() would be safer here.
      sprintf("- **k**: %d | **accuracy**: %.4f | **test n**: %d",
              k$k, k$accuracy, k$n_test),
      paste0("variables: `", paste(k$vars, collapse = ", "), "`"),
      paste0("![](", k$confusion_fig, ")"),
      ""
    )
  }
}
# Make image links relative: the figures live under output/ and the report
# is written into output/ too, so strip the absolute prefix.
md <- gsub("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/", "", md)
md_file <- "output/report.md"
html_file <- "output/report.html"
pdf_file <- "output/report.pdf"  # not generated below; kept for a future export step
setwd("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/")
# BUG FIX: the markdown body was assembled but never written (the
# writeLines call had been commented out), so markdownToHTML() rendered a
# stale report.md from a previous run -- or failed outright on a fresh
# checkout. Write the report before converting it.
writeLines(md, md_file)
markdownToHTML(
  md_file,
  html_file
)
message("done")