122 lines
4.2 KiB
R
122 lines
4.2 KiB
R
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
|
|
|
# TODO: hard-code me
|
|
|
|
# NOTE: The options were generated by ChatGPT from my horrendous hard-coded options
|
|
# Command-line interface.
# --data has a concrete default path; every other string flag defaults to NA,
# which the code below treats as "not supplied, auto-detect"; --k defaults to 5.
na_string_flags <- c(
  "--region-col", "--region-a", "--region-b",
  "--response", "--predictors", "--knn1", "--knn2"
)

option_list <- c(
  list(
    optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv")
  ),
  lapply(na_string_flags, function(flag) {
    optparse::make_option(flag, type = "character", default = NA)
  }),
  list(
    optparse::make_option("--k", type = "integer", default = 5)
  )
)

parser <- optparse::OptionParser(option_list = option_list)
opt <- optparse::parse_args(parser)

if (is.na(opt$data)) {
  stop("--data is required")
}
|
|
|
|
# Read a tabular file, choosing the reader from the (case-insensitive) file
# extension of `p`.
#   xls / xlsx -> readxl::read_excel()
#   csv / txt  -> readr::read_csv() with column-type messages silenced
# Any other extension raises an error naming the extension.
read_any <- function(p) {
  ext <- tolower(tools::file_ext(p))

  if (ext %in% c("xls", "xlsx")) {
    return(readxl::read_excel(p))
  }

  if (!(ext %in% c("csv", "txt"))) {
    stop("unsupported extension: ", ext)
  }

  suppressMessages(readr::read_csv(p, show_col_types = FALSE))
}
|
|
# Load the dataset named by --data and keep its column names at hand for the
# column-detection steps that follow.
df <- read_any(opt[["data"]])
nms <- colnames(df)
|
|
|
|
# Return the first name in `nms` that matches one of the regex patterns in
# `pats`. Patterns are tried in the order given and matched against the
# lower-cased names; the name is returned in its original casing.
# Returns NA_character_ when no pattern matches any name.
find_col <- function(nms, pats) {
  for (i in seq_along(pats)) {
    hits <- stringr::str_detect(tolower(nms), pats[[i]])
    first_hit <- which(hits)[1]
    if (!is.na(first_hit)) {
      return(nms[first_hit])
    }
  }

  NA_character_
}
|
|
|
|
# Region column: an explicit --region-col wins; otherwise guess it from the
# column names. Abort when neither yields a column.
region_col <- opt$`region-col`
if (is.na(region_col)) {
  region_col <- find_col(
    nms,
    c("^region$", "regions?$", "world\\s*bank\\s*region")
  )
}
if (is.na(region_col)) {
  stop("could not detect a region column; pass --region-col")
}

# Response column, in order of preference: --response, the literal column
# "EPI.new", a name-pattern search, and finally the first numeric column.
response <- opt$response
if (is.na(response)) {
  response <- if ("EPI.new" %in% nms) {
    "EPI.new"
  } else {
    find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
  }
}
if (is.na(response)) {
  num <- names(dplyr::select(df, where(is.numeric)))
  if (length(num) == 0) {
    stop("no numeric columns; pass --response")
  }
  response <- num[1]
}
|
|
|
|
# Best-effort detection of the GDP and population columns (NA when absent).
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(nms, c("^pop", "^population$"))

# Region labels ordered from most to least frequent.
counts <- sort(table(df[[region_col]]), decreasing = TRUE)
region_names <- names(counts)

# Resolve one side of the two-region comparison.
#   requested: value of the CLI flag (NA when not supplied); returned as-is
#              when present.
#   preferred: region used by default when it exists in the data.
#   fallbacks: region names to try, in order, when `preferred` is unusable.
#   taken:     region already assigned to the other side; never auto-picked,
#              so the two sides cannot silently end up identical.
# Returns NA_character_ when no usable region is left.
pick_region <- function(requested, preferred, fallbacks, taken) {
  if (!is.na(requested)) {
    return(requested)
  }
  ordered <- c(preferred[preferred %in% fallbacks], fallbacks)
  candidates <- setdiff(ordered, taken)
  if (length(candidates) > 0) candidates[1] else NA_character_
}

# Region A falls back to the most frequent region; it avoids an explicitly
# requested region B.
region_a <- pick_region(
  opt$`region-a`,
  "Sub-Saharan Africa",
  region_names,
  taken = opt$`region-b`
)

# Region B falls back to the second most frequent region, as before, but now
# skips whatever region A resolved to instead of duplicating it.
region_b <- pick_region(
  opt$`region-b`,
  "Latin America & Caribbean",
  c(region_names[-1], head(region_names, 1)),
  taken = region_a
)
|
|
|
|
# Predictor sets for the models.
# With --predictors: a single set made of the comma-separated, trimmed names.
# Without it: nested sets built from the detected GDP / population columns,
# i.e. {first}, then {first, second}, for as many as were detected.
if (is.na(opt$predictors)) {
  plist <- c(gdp_col, pop_col)
  plist <- plist[!is.na(plist)]
  pred_sets <- lapply(seq_along(plist), function(n) plist[seq_len(n)])
} else {
  pred_sets <- list(trimws(strsplit(opt$predictors, ",", fixed = TRUE)[[1]]))
}

# Discard empty sets (e.g. an empty --predictors string).
pred_sets <- pred_sets[lengths(pred_sets) > 0]
|
|
|
|
# Pick up to `k` numeric ".new" columns of `df` to use as kNN features.
#
# Candidates are the columns whose name ends in ".new", other than `exclude`,
# that hold numeric data. They are ranked by their share of missing values
# (fewest first), ties broken by column name, and the first `k` are returned.
#
#   df:      data frame to inspect.
#   exclude: column name(s) to leave out (the response column).
#   k:       maximum number of column names to return.
#
# Returns a character vector of at most `k` names; character(0) when no
# column qualifies.
choose_knn_vars <- function(df, exclude, k = 3) {
  col_names <- names(df)
  cands <- col_names[endsWith(col_names, ".new") & !(col_names %in% exclude)]

  # vapply() keeps the result a typed vector even when `cands` is empty;
  # sapply() returns list() in that case, which breaks the subsetting below.
  is_num <- vapply(cands, function(col) is.numeric(df[[col]]), logical(1))
  cands <- cands[is_num]

  miss <- vapply(cands, function(col) mean(is.na(df[[col]])), numeric(1))
  head(cands[order(miss, cands)], k)
}
|
|
|
|
# First kNN feature set: the comma-separated names from --knn1, or else the
# three ".new" columns with the fewest missing values.
knn1 <- if (!is.na(opt$knn1)) {
  trimws(strsplit(opt$knn1, ",", fixed = TRUE)[[1]])
} else {
  choose_knn_vars(df, response, 3)
}

# Second kNN feature set: the names from --knn2, or else up to three of the
# next-best candidates that are not already in knn1.
# head() is used instead of `[1:3]` so that fewer than three remaining
# candidates yield a shorter vector rather than one padded with NA.
knn2 <- if (!is.na(opt$knn2)) {
  trimws(strsplit(opt$knn2, ",", fixed = TRUE)[[1]])
} else {
  head(setdiff(choose_knn_vars(df, response, 6), knn1), 3)
}
|
|
|
|
# Bundle every resolved setting into one context object and persist it as
# pretty-printed JSON (length-one vectors are written as scalars).
data_path <- normalizePath(opt$data)

ctx <- list(
  data = data_path,
  region_col = region_col,
  response = response,
  region_a = region_a,
  region_b = region_b,
  predictors = pred_sets,
  knn1 = knn1,
  knn2 = knn2,
  k = opt$k,
  fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
  stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
)

ctx_json <- jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE)
writeLines(ctx_json, "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")

message("wrote ctx.json")
|