added lab 4
This commit is contained in:
@@ -0,0 +1,121 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
|
||||
# TODO: hard-code me
|
||||
|
||||
# NOTE: The options were generated by chatGPT from my horrendous hard-coded options
|
||||
option_list <- list(
|
||||
optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"),
|
||||
optparse::make_option("--region-col", type = "character", default = NA),
|
||||
optparse::make_option("--region-a", type = "character", default = NA),
|
||||
optparse::make_option("--region-b", type = "character", default = NA),
|
||||
optparse::make_option("--response", type = "character", default = NA),
|
||||
optparse::make_option("--predictors", type = "character", default = NA),
|
||||
optparse::make_option("--knn1", type = "character", default = NA),
|
||||
optparse::make_option("--knn2", type = "character", default = NA),
|
||||
optparse::make_option("--k", type = "integer", default = 5)
|
||||
)
|
||||
opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
|
||||
|
||||
if (is.na(opt$data)) stop("--data is required")
|
||||
|
||||
read_any <- function(p) {
|
||||
ext <- tolower(tools::file_ext(p))
|
||||
if (ext %in% c("csv", "txt")) {
|
||||
suppressMessages(readr::read_csv(p, show_col_types = FALSE))
|
||||
} else if (ext %in% c("xls", "xlsx")) {
|
||||
readxl::read_excel(p)
|
||||
} else {
|
||||
stop("unsupported extension: ", ext)
|
||||
}
|
||||
}
|
||||
df <- read_any(opt$data)
|
||||
|
||||
nms <- names(df)
|
||||
|
||||
find_col <- function(nms, pats) {
|
||||
for (pat in pats) {
|
||||
idx <- which(stringr::str_detect(tolower(nms), pat))
|
||||
if (length(idx)) return(nms[idx[1]])
|
||||
}
|
||||
|
||||
# I hate it here
|
||||
NA_character_
|
||||
}
|
||||
|
||||
region_col <- if (!is.na(opt$`region-col`)) opt$`region-col` else
|
||||
find_col(nms, c("^region$", "regions?$", "world\\s*bank\\s*region"))
|
||||
|
||||
if (is.na(region_col)) stop("could not detect a region column; pass --region-col")
|
||||
|
||||
response <- if (!is.na(opt$response)) opt$response else if ("EPI.new" %in% nms) {
|
||||
"EPI.new"
|
||||
} else {
|
||||
find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
|
||||
}
|
||||
|
||||
if (is.na(response)) {
|
||||
num <- df |> dplyr::select(where(is.numeric)) |> names()
|
||||
if (!length(num)) stop("no numeric columns; pass --response")
|
||||
response <- num[1]
|
||||
}
|
||||
|
||||
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
|
||||
pop_col <- find_col(nms, c("^pop", "^population$"))
|
||||
|
||||
counts <- sort(table(df[[region_col]]), decreasing = TRUE)
|
||||
region_a <- if (!is.na(opt$`region-a`)) opt$`region-a` else
|
||||
if ("Sub-Saharan Africa" %in% names(counts)) "Sub-Saharan Africa" else names(counts)[1]
|
||||
|
||||
region_b <- if (!is.na(opt$`region-b`)) opt$`region-b` else
|
||||
if ("Latin America & Caribbean" %in% names(counts)) "Latin America & Caribbean" else names(counts)[2]
|
||||
|
||||
pred_sets <- list()
|
||||
if (!is.na(opt$predictors)) {
|
||||
pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] |> trimws())
|
||||
} else {
|
||||
plist <- c()
|
||||
if (!is.na(gdp_col)) plist <- c(plist, gdp_col)
|
||||
if (!is.na(pop_col)) plist <- c(plist, pop_col)
|
||||
if (length(plist) >= 1) pred_sets <- append(pred_sets, list(plist[1]))
|
||||
if (length(plist) >= 2) pred_sets <- append(pred_sets, list(plist[1:2]))
|
||||
}
|
||||
|
||||
pred_sets <- pred_sets[lengths(pred_sets) > 0]
|
||||
|
||||
choose_knn_vars <- function(df, exclude, k = 3) {
|
||||
cands <- names(df)[endsWith(names(df), ".new") & names(df) != exclude]
|
||||
cands <- cands[sapply(cands, function(c) is.numeric(df[[c]]))]
|
||||
miss <- sapply(cands, function(c) mean(is.na(df[[c]])))
|
||||
ord <- order(miss, cands)
|
||||
head(cands[ord], k)
|
||||
}
|
||||
|
||||
knn1 <- if (!is.na(opt$knn1)) {
|
||||
strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws()
|
||||
} else {
|
||||
choose_knn_vars(df, response, 3)
|
||||
}
|
||||
|
||||
knn2 <- if (!is.na(opt$knn2)) {
|
||||
strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws()
|
||||
} else {
|
||||
setdiff(choose_knn_vars(df, response, 6), knn1)[1:3]
|
||||
}
|
||||
|
||||
ctx <- list(
|
||||
data = normalizePath(opt$data),
|
||||
region_col = region_col,
|
||||
response = response,
|
||||
region_a = region_a,
|
||||
region_b = region_b,
|
||||
predictors = pred_sets,
|
||||
knn1 = knn1,
|
||||
knn2 = knn2,
|
||||
k = opt$k,
|
||||
fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
|
||||
stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
|
||||
)
|
||||
|
||||
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("wrote ctx.json")
|
||||
Reference in New Issue
Block a user