167 lines
5.3 KiB
R
167 lines
5.3 KiB
R
# bleh
|
|
|
|
suppressPackageStartupMessages({
|
|
library(tidyverse)
|
|
library(broom)
|
|
library(caret)
|
|
library(class)
|
|
});
|
|
|
|
# helpers
|
|
find_col <- function(df, patterns) {
|
|
cols <- names(df)
|
|
for (pat in patterns) {
|
|
hit <- cols[str_detect(tolower(cols), regex(pat, ignore_case = TRUE))]
|
|
if (length(hit) > 0) return(hit[[1]])
|
|
}
|
|
return(NULL)
|
|
}
|
|
|
|
sanitize_filename <- function(x) {
|
|
gsub("[^A-Za-z0-9_.-]+", "_", x)
|
|
}
|
|
|
|
pick_regions <- function(df, region_col, a = NULL, b = NULL) {
|
|
if (!is.null(a) && !is.null(b)) return(c(a, b))
|
|
cnt <- df %>% filter(!is.na(.data[[region_col]])) %>%
|
|
count(.data[[region_col]], sort = TRUE)
|
|
|
|
stopifnot(nrow(cnt) >= 2)
|
|
c(cnt[[1,1]], cnt[[2,1]])
|
|
}
|
|
|
|
save_txt <- function(path, txt) {
|
|
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
|
|
writeLines(txt, path)
|
|
}
|
|
|
|
# args
|
|
args <- commandArgs(trailingOnly = TRUE)
|
|
opt <- list(
|
|
data = NULL, region_col = NULL, region_a = NULL, region_b = NULL,
|
|
response = NULL, predictors = NULL, knn1 = NULL, knn2 = NULL,
|
|
k = 5L, render = "none" # "none" | "html" | "pdf"
|
|
)
|
|
|
|
parse_flag <- function(flag) {
|
|
key <- sub("^--", "", flag[[1]])
|
|
val <- if (length(flag) > 1) flag[[2]] else TRUE
|
|
opt[[key]] <<- val
|
|
}
|
|
|
|
if (length(args)) {
|
|
kv <- split(args, cumsum(grepl("^--", args)))
|
|
lapply(kv, parse_flag)
|
|
}
|
|
|
|
stopifnot(!is.null(opt$data))
|
|
|
|
# load data
|
|
message("reading: ", opt$data)
|
|
|
|
df <- if (grepl("\\.csv(\\.gz)?$", opt$data, ignore.case = TRUE)) {
|
|
suppressMessages(readr::read_csv(opt$data, show_col_types = FALSE))
|
|
} else {
|
|
suppressMessages(readxl::read_excel(opt$data))
|
|
}
|
|
|
|
# auto-detect columns
|
|
region_col <- opt$region_col %||%
|
|
find_col(df, c("^region$", "regions?$", "world\\s*bank\\s*region"))
|
|
|
|
if (is.null(region_col)) stop("could not detect a 'region' column; use --region-col")
|
|
|
|
response_col <- opt$response %||%
|
|
find_col(df, c("^epi$", "overall\\s*score$", "index$", "score$"))
|
|
|
|
if (is.null(response_col)) {
|
|
nums <- df %>% select(where(is.numeric)) %>% names()
|
|
if (length(nums) == 0) stop("no numeric columns found; set --response explicitly")
|
|
response_col <- nums[[1]]
|
|
}
|
|
|
|
gdp_col <- find_col(df, c("^gdp", "gdp.*per.*cap", "gdppc"))
|
|
pop_col <- find_col(df, c("^pop", "^population$"))
|
|
|
|
# predictors set(s)
|
|
pred_sets <- list()
|
|
if (!is.null(opt$predictors)) {
|
|
pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] %>% trimws())
|
|
} else {
|
|
ps <- c(na.omit(gdp_col), na.omit(pop_col))
|
|
if (length(ps) >= 1) pred_sets <- append(pred_sets, list(ps[1]))
|
|
if (length(ps) >= 2) pred_sets <- append(pred_sets, list(ps[1:2]))
|
|
}
|
|
|
|
# choose regions
|
|
regions <- pick_regions(df, region_col, opt$region_a, opt$region_b)
|
|
region_a <- regions[[1]]; region_b <- regions[[2]]
|
|
|
|
# dirs
|
|
fig_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures"; dir.create(fig_dir, showWarnings = FALSE)
|
|
stats_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"; dir.create(stats_dir, showWarnings = FALSE)
|
|
|
|
# distributions and qq
|
|
fa <- df %>% filter(.data[[region_col]] == region_a) %>% pull(all_of(response_col)) %>% as.numeric()
|
|
fb <- df %>% filter(.data[[region_col]] == region_b) %>% pull(all_of(response_col)) %>% as.numeric()
|
|
|
|
# le plot de box
|
|
p_box_a <- tibble(val = fa) %>%
|
|
ggplot(aes(x = "", y = val)) +
|
|
geom_boxplot() +
|
|
labs(title = paste0("boxplot: ", response_col, " (", region_a, ")"),
|
|
x = NULL, y = response_col)
|
|
ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
|
|
sanitize_filename(region_a), sanitize_filename(response_col))),
|
|
p_box_a, width = 5, height = 4, dpi = 160)
|
|
|
|
p_box_b <- tibble(val = fb) %>%
|
|
ggplot(aes(x = "", y = val)) +
|
|
geom_boxplot() +
|
|
labs(title = paste0("boxplot: ", response_col, " (", region_b, ")"),
|
|
x = NULL, y = response_col)
|
|
|
|
ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
|
|
sanitize_filename(region_b), sanitize_filename(response_col))),
|
|
p_box_b, width = 5, height = 4, dpi = 160)
|
|
|
|
# hist and density
|
|
p_hist_a <- tibble(val = fa) %>%
|
|
ggplot(aes(x = val)) +
|
|
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
|
|
geom_density() +
|
|
labs(title = paste0("histogram + density: ", response_col, " (", region_a, ")"),
|
|
x = response_col, y = "density")
|
|
|
|
ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
|
|
sanitize_filename(region_a), sanitize_filename(response_col))),
|
|
p_hist_a, width = 6, height = 4, dpi = 160)
|
|
|
|
p_hist_b <- tibble(val = fb) %>%
|
|
ggplot(aes(x = val)) +
|
|
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
|
|
geom_density() +
|
|
labs(title = paste0("histogram + density: ", response_col, " (", region_b, ")"),
|
|
x = response_col, y = "density")
|
|
|
|
ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
|
|
sanitize_filename(region_b), sanitize_filename(response_col))),
|
|
p_hist_b, width = 6, height = 4, dpi = 160)
|
|
|
|
# qq plot between the two regions (2 sample qq)
|
|
png(file.path(fig_dir, sprintf("qq_%s_%s_vs_%s.png",
|
|
sanitize_filename(response_col), sanitize_filename(region_a), sanitize_filename(region_b))),
|
|
width = 700, height = 500, res = 160)
|
|
|
|
n <- min(sum(is.finite(fa)), sum(is.finite(fb)))
|
|
q <- seq(0.01, 0.99, length.out = max(10, n))
|
|
xq <- quantile(fa, q, na.rm = TRUE); yq <- quantile(fb, q, na.rm = TRUE)
|
|
|
|
plot(sort(xq), sort(yq), pch = 19, cex = 0.6,
|
|
xlab = paste0("quantiles: ", region_a),
|
|
ylab = paste0("quantiles: ", region_b),
|
|
main = paste("qq plot:", region_a, "vs", region_b))
|
|
|
|
abline(0, 1)
|
|
dev.off()
|