This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
2025-10-31 17:55:13 -04:00

167 lines
5.3 KiB
R

# bleh
suppressPackageStartupMessages({
library(tidyverse)
library(broom)
library(caret)
library(class)
});
# helpers
find_col <- function(df, patterns) {
cols <- names(df)
for (pat in patterns) {
hit <- cols[str_detect(tolower(cols), regex(pat, ignore_case = TRUE))]
if (length(hit) > 0) return(hit[[1]])
}
return(NULL)
}
sanitize_filename <- function(x) {
gsub("[^A-Za-z0-9_.-]+", "_", x)
}
pick_regions <- function(df, region_col, a = NULL, b = NULL) {
if (!is.null(a) && !is.null(b)) return(c(a, b))
cnt <- df %>% filter(!is.na(.data[[region_col]])) %>%
count(.data[[region_col]], sort = TRUE)
stopifnot(nrow(cnt) >= 2)
c(cnt[[1,1]], cnt[[2,1]])
}
save_txt <- function(path, txt) {
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
writeLines(txt, path)
}
# args
args <- commandArgs(trailingOnly = TRUE)
opt <- list(
data = NULL, region_col = NULL, region_a = NULL, region_b = NULL,
response = NULL, predictors = NULL, knn1 = NULL, knn2 = NULL,
k = 5L, render = "none" # "none" | "html" | "pdf"
)
parse_flag <- function(flag) {
key <- sub("^--", "", flag[[1]])
val <- if (length(flag) > 1) flag[[2]] else TRUE
opt[[key]] <<- val
}
if (length(args)) {
kv <- split(args, cumsum(grepl("^--", args)))
lapply(kv, parse_flag)
}
stopifnot(!is.null(opt$data))
# load data
message("reading: ", opt$data)
df <- if (grepl("\\.csv(\\.gz)?$", opt$data, ignore.case = TRUE)) {
suppressMessages(readr::read_csv(opt$data, show_col_types = FALSE))
} else {
suppressMessages(readxl::read_excel(opt$data))
}
# auto-detect columns
region_col <- opt$region_col %||%
find_col(df, c("^region$", "regions?$", "world\\s*bank\\s*region"))
if (is.null(region_col)) stop("could not detect a 'region' column; use --region-col")
response_col <- opt$response %||%
find_col(df, c("^epi$", "overall\\s*score$", "index$", "score$"))
if (is.null(response_col)) {
nums <- df %>% select(where(is.numeric)) %>% names()
if (length(nums) == 0) stop("no numeric columns found; set --response explicitly")
response_col <- nums[[1]]
}
gdp_col <- find_col(df, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(df, c("^pop", "^population$"))
# predictors set(s)
pred_sets <- list()
if (!is.null(opt$predictors)) {
pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] %>% trimws())
} else {
ps <- c(na.omit(gdp_col), na.omit(pop_col))
if (length(ps) >= 1) pred_sets <- append(pred_sets, list(ps[1]))
if (length(ps) >= 2) pred_sets <- append(pred_sets, list(ps[1:2]))
}
# choose regions
regions <- pick_regions(df, region_col, opt$region_a, opt$region_b)
region_a <- regions[[1]]; region_b <- regions[[2]]
# dirs
fig_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures"; dir.create(fig_dir, showWarnings = FALSE)
stats_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"; dir.create(stats_dir, showWarnings = FALSE)
# distributions and qq
fa <- df %>% filter(.data[[region_col]] == region_a) %>% pull(all_of(response_col)) %>% as.numeric()
fb <- df %>% filter(.data[[region_col]] == region_b) %>% pull(all_of(response_col)) %>% as.numeric()
# le plot de box
p_box_a <- tibble(val = fa) %>%
ggplot(aes(x = "", y = val)) +
geom_boxplot() +
labs(title = paste0("boxplot: ", response_col, " (", region_a, ")"),
x = NULL, y = response_col)
ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
sanitize_filename(region_a), sanitize_filename(response_col))),
p_box_a, width = 5, height = 4, dpi = 160)
p_box_b <- tibble(val = fb) %>%
ggplot(aes(x = "", y = val)) +
geom_boxplot() +
labs(title = paste0("boxplot: ", response_col, " (", region_b, ")"),
x = NULL, y = response_col)
ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
sanitize_filename(region_b), sanitize_filename(response_col))),
p_box_b, width = 5, height = 4, dpi = 160)
# hist and density
p_hist_a <- tibble(val = fa) %>%
ggplot(aes(x = val)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
geom_density() +
labs(title = paste0("histogram + density: ", response_col, " (", region_a, ")"),
x = response_col, y = "density")
ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
sanitize_filename(region_a), sanitize_filename(response_col))),
p_hist_a, width = 6, height = 4, dpi = 160)
p_hist_b <- tibble(val = fb) %>%
ggplot(aes(x = val)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
geom_density() +
labs(title = paste0("histogram + density: ", response_col, " (", region_b, ")"),
x = response_col, y = "density")
ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
sanitize_filename(region_b), sanitize_filename(response_col))),
p_hist_b, width = 6, height = 4, dpi = 160)
# qq plot between the two regions (2 sample qq)
png(file.path(fig_dir, sprintf("qq_%s_%s_vs_%s.png",
sanitize_filename(response_col), sanitize_filename(region_a), sanitize_filename(region_b))),
width = 700, height = 500, res = 160)
n <- min(sum(is.finite(fa)), sum(is.finite(fb)))
q <- seq(0.01, 0.99, length.out = max(10, n))
xq <- quantile(fa, q, na.rm = TRUE); yq <- quantile(fb, q, na.rm = TRUE)
plot(sort(xq), sort(yq), pch = 19, cex = 0.6,
xlab = paste0("quantiles: ", region_a),
ylab = paste0("quantiles: ", region_b),
main = paste("qq plot:", region_a, "vs", region_b))
abline(0, 1)
dev.off()