This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignment II/R/04_ols_region.R
T
2025-10-31 17:55:13 -04:00

73 lines
2.6 KiB
R

source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
reg_df <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a)
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
d <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
for (xc in x_cols) d[[xc]] <- tf_pos(d[[xc]])
f <- as.formula(paste(y_col, "~", paste(x_cols, collapse = " + ")))
m <- lm(f, data = d)
res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
p_res <- tibble(fitted = fitted(m), resid = resid(m)) |>
ggplot(aes(fitted, resid)) +
geom_point(size = 1.6) +
geom_hline(yintercept = 0) +
labs(title = paste("residuals vs fitted:", name_tag),
x = "fitted", y = "residuals") +
theme_minimal()
save_plot(p_res, res_path)
first <- x_cols[1]
sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
p_sc <- d |>
ggplot(aes(.data[[first]], .data[[y_col]])) +
geom_point(size = 1.6) +
labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
theme_minimal()
save_plot(p_sc, sc_path)
summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
capture.output(summary(m), file = summ_path)
gl <- broom::glance(m)
list(
name = name_tag,
rsq = unname(gl$r.squared),
aic = unname(gl$AIC),
bic = unname(gl$BIC),
nobs = stats::nobs(m),
summary_file = summ_path,
residuals_fig = res_path,
scatter_fig = sc_path
)
}
region_models <- list()
if (length(ctx$predictors)) {
for (p in ctx$predictors) {
tag <- paste0("region ", ctx$region_a, ": ", ctx$response, " ~ ", paste(p, collapse = " + "))
region_models <- append(region_models, list(fit_ols(reg_df, ctx$response, p, tag, ctx$fig_dir, ctx$stats_dir)))
}
}
best_note <- "no region-level comparison available."
if (length(region_models) >= 1) {
ord <- order(
sapply(region_models, `[[`, "rsq"),
-sapply(region_models, `[[`, "aic"),
-sapply(region_models, `[[`, "bic"),
decreasing = TRUE
)
best <- region_models[[ord[1]]]
best_note <- sprintf(
"on region `%s`, the better model is **%s** (r²=%.3f, aic=%.1f, bic=%.1f).",
ctx$region_a, best$name, best$rsq, best$aic, best$bic
)
}
ctx$best_region_note <- best_note
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("ols (region) done")