This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignments/Assignment II/R/01_args_and_load.R
T
2025-10-13 12:44:58 -04:00

122 lines
4.2 KiB
R

source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
# TODO (original author's note: "hard-code me") -- presumably about the
# absolute paths used as defaults in this script; confirm intent.
# NOTE: the option definitions were generated by ChatGPT from the earlier
# hard-coded options.
#
# Command-line interface: --data has a concrete default path, --k defaults to
# 5, and every other option defaults to NA so the script can tell "not
# supplied" apart from a real value and fall back to auto-detection.
option_list <- c(
  list(
    optparse::make_option(
      "--data",
      type = "character",
      default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"
    )
  ),
  # All of these share the same shape: optional character flag, NA by default.
  lapply(
    c(
      "--region-col", "--region-a", "--region-b", "--response",
      "--predictors", "--knn1", "--knn2"
    ),
    function(flag) optparse::make_option(flag, type = "character", default = NA)
  ),
  list(optparse::make_option("--k", type = "integer", default = 5))
)
opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
if (is.na(opt[["data"]])) {
  stop("--data is required")
}
#' Read a tabular file, choosing the reader from the file extension.
#'
#' @param p Path to the input file. The extension is compared
#'   case-insensitively.
#' @return The object returned by `readr::read_csv()` for "csv"/"txt" files or
#'   by `readxl::read_excel()` for "xls"/"xlsx" files.
#' @details Stops with "unsupported extension: <ext>" for any other extension
#'   (including a missing one).
read_any <- function(p) {
  ext <- tolower(tools::file_ext(p))
  is_delimited <- ext %in% c("csv", "txt")
  is_excel <- ext %in% c("xls", "xlsx")

  # Reject unknown formats up front, before touching the file.
  if (!is_delimited && !is_excel) {
    stop("unsupported extension: ", ext)
  }
  if (is_excel) {
    return(readxl::read_excel(p))
  }
  # Note that "txt" files are parsed as comma-separated, same as "csv".
  suppressMessages(readr::read_csv(p, show_col_types = FALSE))
}
# Load the input table selected by --data and keep its column names around;
# the column auto-detection below works on `nms`.
df <- read_any(opt[["data"]])
nms <- colnames(df)
#' Find the first column name matching a prioritised list of regex patterns.
#'
#' Patterns are tried in the order given; matching is done against the
#' lower-cased names, so patterns should be written in lower case.
#'
#' @param nms Character vector of column names.
#' @param pats Character vector of regular expressions, highest priority first.
#' @return The original (not lower-cased) name of the first column matching the
#'   earliest pattern that matches anything, or `NA_character_` if no pattern
#'   matches.
find_col <- function(nms, pats) {
  lowered <- tolower(nms)
  for (pat in pats) {
    hits <- nms[which(stringr::str_detect(lowered, pat))]
    if (length(hits) > 0) {
      return(hits[1])
    }
  }
  # No pattern matched any column.
  NA_character_
}
# Region column: an explicit --region-col wins; otherwise guess from the names.
region_col <- opt$`region-col`
if (is.na(region_col)) {
  region_col <- find_col(
    nms,
    c("^region$", "regions?$", "world\\s*bank\\s*region")
  )
}
if (is.na(region_col)) {
  stop("could not detect a region column; pass --region-col")
}

# Response column: --response, then the literal "EPI.new" column, then a
# pattern-based guess.
response <- opt$response
if (is.na(response)) {
  response <- if ("EPI.new" %in% nms) {
    "EPI.new"
  } else {
    find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
  }
}
# Last resort: fall back to the first numeric column in the data.
if (is.na(response)) {
  numeric_cols <- names(dplyr::select(df, where(is.numeric)))
  if (length(numeric_cols) == 0) {
    stop("no numeric columns; pass --response")
  }
  response <- numeric_cols[1]
}

# Optional GDP / population columns; each is NA when nothing matches.
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(nms, c("^pop", "^population$"))
# Fail early with a clear message if the region column is not in the data
# (e.g. a mistyped --region-col) instead of failing inside table().
if (!region_col %in% nms) {
  stop("region column not found in data: ", region_col)
}

# Rows per region, most frequent first (table() ignores NA values by default).
counts <- sort(table(df[[region_col]]), decreasing = TRUE)

# Region A: explicit --region-a, else "Sub-Saharan Africa" when present, else
# the most frequent region. Any user-supplied --region-b is excluded from the
# fallback so the two regions can never coincide.
region_a <- if (!is.na(opt$`region-a`)) {
  opt$`region-a`
} else {
  local({
    avail <- setdiff(names(counts), opt$`region-b`)
    if ("Sub-Saharan Africa" %in% avail) "Sub-Saharan Africa" else avail[1]
  })
}

# Region B: explicit --region-b, else "Latin America & Caribbean" when present,
# else the most frequent region that is not region A. (Previously this took the
# second most frequent region unconditionally, which could equal region A.)
region_b <- if (!is.na(opt$`region-b`)) {
  opt$`region-b`
} else {
  local({
    avail <- setdiff(names(counts), region_a)
    if ("Latin America & Caribbean" %in% avail) {
      "Latin America & Caribbean"
    } else {
      avail[1]
    }
  })
}

# With fewer than two usable regions the fallbacks yield NA/NULL; keep going
# as before, but make the situation visible.
if (length(c(region_a, region_b)) < 2 || anyNA(c(region_a, region_b))) {
  warning(
    "could not determine two distinct regions; pass --region-a/--region-b",
    call. = FALSE
  )
}
# Predictor sets for the models. With --predictors, use exactly one set parsed
# from the comma-separated value. Otherwise build cumulative sets from the
# detected columns: first the leading predictor alone, then the first two.
pred_sets <- if (!is.na(opt$predictors)) {
  list(trimws(strsplit(opt$predictors, ",", fixed = TRUE)[[1]]))
} else {
  detected <- c(gdp_col, pop_col)
  detected <- detected[!is.na(detected)]
  lapply(seq_along(detected), function(i) detected[seq_len(i)])
}
# Drop empty sets (e.g. when --predictors is an empty string).
pred_sets <- pred_sets[lengths(pred_sets) > 0]
#' Pick default feature columns for kNN.
#'
#' Candidates are the numeric columns whose name ends in ".new", excluding
#' `exclude`. They are ranked by fraction of missing values (fewest first),
#' with ties broken by column name.
#'
#' @param df Data frame to choose columns from.
#' @param exclude Name of a column to leave out (the response).
#' @param k Maximum number of column names to return.
#' @return Character vector of at most `k` column names; `character(0)` when
#'   there are no candidates.
choose_knn_vars <- function(df, exclude, k = 3) {
  cols <- names(df)
  cands <- cols[endsWith(cols, ".new") & cols != exclude]

  # vapply() rather than sapply(): with zero candidates sapply() returns
  # list(), and indexing by a list is an error. vapply() always returns a
  # logical/numeric vector, so the empty case falls through to character(0).
  is_num <- vapply(
    cands,
    function(nm) is.numeric(df[[nm]]),
    logical(1),
    USE.NAMES = FALSE
  )
  cands <- cands[is_num]

  miss <- vapply(
    cands,
    function(nm) mean(is.na(df[[nm]])),
    numeric(1),
    USE.NAMES = FALSE
  )
  head(cands[order(miss, cands)], k)
}
# First kNN feature set: comma-separated --knn1 if given, otherwise the three
# best auto-selected candidates.
knn1 <- if (!is.na(opt$knn1)) {
  strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  choose_knn_vars(df, response, 3)
}

# Second kNN feature set: comma-separated --knn2 if given, otherwise up to
# three of the top six candidates that are not already in knn1.
knn2 <- if (!is.na(opt$knn2)) {
  strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws()
} else {
  # head() rather than [1:3]: indexing past the end pads the result with NA,
  # which would put NA "column names" into knn2 when fewer than three
  # alternatives remain.
  head(setdiff(choose_knn_vars(df, response, 6), knn1), 3)
}
# Collect every resolved setting in one list and persist it as JSON --
# presumably for later steps of the pipeline to read; confirm against callers.
ctx <- list(
  data = normalizePath(opt[["data"]]),
  region_col = region_col,
  response = response,
  region_a = region_a,
  region_b = region_b,
  predictors = pred_sets,
  knn1 = knn1,
  knn2 = knn2,
  k = opt[["k"]],
  fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
  stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
)

# auto_unbox = TRUE writes length-one vectors as JSON scalars, not arrays.
ctx |>
  jsonlite::toJSON(pretty = TRUE, auto_unbox = TRUE) |>
  writeLines("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
message("wrote ctx.json")