source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R") # TODO: hard-code me # NOTE: The options were generated by chatGPT from my horrendous hard-coded options option_list <- list( optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"), optparse::make_option("--region-col", type = "character", default = NA), optparse::make_option("--region-a", type = "character", default = NA), optparse::make_option("--region-b", type = "character", default = NA), optparse::make_option("--response", type = "character", default = NA), optparse::make_option("--predictors", type = "character", default = NA), optparse::make_option("--knn1", type = "character", default = NA), optparse::make_option("--knn2", type = "character", default = NA), optparse::make_option("--k", type = "integer", default = 5) ) opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list)) if (is.na(opt$data)) stop("--data is required") read_any <- function(p) { ext <- tolower(tools::file_ext(p)) if (ext %in% c("csv", "txt")) { suppressMessages(readr::read_csv(p, show_col_types = FALSE)) } else if (ext %in% c("xls", "xlsx")) { readxl::read_excel(p) } else { stop("unsupported extension: ", ext) } } df <- read_any(opt$data) nms <- names(df) find_col <- function(nms, pats) { for (pat in pats) { idx <- which(stringr::str_detect(tolower(nms), pat)) if (length(idx)) return(nms[idx[1]]) } # I hate it here NA_character_ } region_col <- if (!is.na(opt$`region-col`)) opt$`region-col` else find_col(nms, c("^region$", "regions?$", "world\\s*bank\\s*region")) if (is.na(region_col)) stop("could not detect a region column; pass --region-col") response <- if (!is.na(opt$response)) opt$response else if ("EPI.new" %in% nms) { "EPI.new" } else { find_col(nms, c("^epi", "epi.*score", "index$", "score$")) } if (is.na(response)) { num <- df |> dplyr::select(where(is.numeric)) |> names() if (!length(num)) stop("no numeric columns; pass --response") response <- num[1] } gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc")) pop_col <- find_col(nms, c("^pop", "^population$")) counts <- sort(table(df[[region_col]]), decreasing = TRUE) region_a <- if (!is.na(opt$`region-a`)) opt$`region-a` else if ("Sub-Saharan Africa" %in% names(counts)) "Sub-Saharan Africa" else names(counts)[1] region_b <- if (!is.na(opt$`region-b`)) opt$`region-b` else if ("Latin America & Caribbean" %in% names(counts)) "Latin America & Caribbean" else names(counts)[2] pred_sets <- list() if (!is.na(opt$predictors)) { pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] |> trimws()) } else { plist <- c() if (!is.na(gdp_col)) plist <- c(plist, gdp_col) if (!is.na(pop_col)) plist <- c(plist, pop_col) if (length(plist) >= 1) pred_sets <- append(pred_sets, list(plist[1])) if (length(plist) >= 2) pred_sets <- append(pred_sets, list(plist[1:2])) } pred_sets <- pred_sets[lengths(pred_sets) > 0] choose_knn_vars <- function(df, exclude, k = 3) { cands <- names(df)[endsWith(names(df), ".new") & names(df) != exclude] cands <- cands[sapply(cands, function(c) is.numeric(df[[c]]))] miss <- sapply(cands, function(c) mean(is.na(df[[c]]))) ord <- order(miss, cands) head(cands[ord], k) } knn1 <- if (!is.na(opt$knn1)) { strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws() } else { choose_knn_vars(df, response, 3) } knn2 <- if (!is.na(opt$knn2)) { strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws() } else { setdiff(choose_knn_vars(df, response, 6), knn1)[1:3] } ctx <- list( data = normalizePath(opt$data), region_col = region_col, response = response, region_a = region_a, region_b = region_b, predictors = pred_sets, knn1 = knn1, knn2 = knn2, k = opt$k, fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures", stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats" ) writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json") message("wrote ctx.json")