Data-Analytics/Assignments/Assignment II/OLD_2.r

# bleh

suppressPackageStartupMessages({
  library(tidyverse)
  library(broom)
  library(caret)
  library(class)
});

# helpers
find_col <- function(df, patterns) {
	cols <- names(df)
	for (pat in patterns) {
		hit <- cols[str_detect(tolower(cols), regex(pat, ignore_case = TRUE))]
		if (length(hit) > 0) return(hit[[1]])
	}
	return(NULL)
}

sanitize_filename <- function(x) {
	gsub("[^A-Za-z0-9_.-]+", "_", x)
}

pick_regions <- function(df, region_col, a = NULL, b = NULL) {
	if (!is.null(a) && !is.null(b)) return(c(a, b))
	cnt <- df %>% filter(!is.na(.data[[region_col]])) %>%
		count(.data[[region_col]], sort = TRUE)

	stopifnot(nrow(cnt) >= 2)
	c(cnt[[1,1]], cnt[[2,1]])
}

save_txt <- function(path, txt) {
	dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
	writeLines(txt, path)
}

# args
args <- commandArgs(trailingOnly = TRUE)
opt <- list(
	data = NULL, region_col = NULL, region_a = NULL, region_b = NULL,
	response = NULL, predictors = NULL, knn1 = NULL, knn2 = NULL,
	k = 5L, render = "none" # "none" | "html" | "pdf"
)

parse_flag <- function(flag) {
	key <- sub("^--", "", flag[[1]])
	val <- if (length(flag) > 1) flag[[2]] else TRUE
	opt[[key]] <<- val
}

if (length(args)) {
	kv <- split(args, cumsum(grepl("^--", args)))
	lapply(kv, parse_flag)
}

stopifnot(!is.null(opt$data))

# load data
message("reading: ", opt$data)

df <- if (grepl("\\.csv(\\.gz)?$", opt$data, ignore.case = TRUE)) {
	suppressMessages(readr::read_csv(opt$data, show_col_types = FALSE))
} else {
	suppressMessages(readxl::read_excel(opt$data))
}

# auto-detect columns
region_col <- opt$region_col %||%
	find_col(df, c("^region$", "regions?$", "world\\s*bank\\s*region"))

if (is.null(region_col)) stop("could not detect a 'region' column; use --region-col")

response_col <- opt$response %||%
	find_col(df, c("^epi$", "overall\\s*score$", "index$", "score$"))

if (is.null(response_col)) {
	nums <- df %>% select(where(is.numeric)) %>% names()
	if (length(nums) == 0) stop("no numeric columns found; set --response explicitly")
	response_col <- nums[[1]]
}

gdp_col <- find_col(df, c("^gdp", "gdp.*per.*cap", "gdppc"))
pop_col <- find_col(df, c("^pop", "^population$"))

# predictors set(s)
pred_sets <- list()
if (!is.null(opt$predictors)) {
	pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] %>% trimws())
} else {
	ps <- c(na.omit(gdp_col), na.omit(pop_col))
	if (length(ps) >= 1) pred_sets <- append(pred_sets, list(ps[1]))
	if (length(ps) >= 2) pred_sets <- append(pred_sets, list(ps[1:2]))
}

# choose regions
regions <- pick_regions(df, region_col, opt$region_a, opt$region_b)
region_a <- regions[[1]]; region_b <- regions[[2]]

# dirs
fig_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures"; dir.create(fig_dir, showWarnings = FALSE)
stats_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"; dir.create(stats_dir, showWarnings = FALSE)

# distributions and qq
fa <- df %>% filter(.data[[region_col]] == region_a) %>% pull(all_of(response_col)) %>% as.numeric()
fb <- df %>% filter(.data[[region_col]] == region_b) %>% pull(all_of(response_col)) %>% as.numeric()

# le plot de box
p_box_a <- tibble(val = fa) %>%
  ggplot(aes(x = "", y = val)) +
  geom_boxplot() +
  labs(title = paste0("boxplot: ", response_col, " (", region_a, ")"),
	   x = NULL, y = response_col)
ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
	   sanitize_filename(region_a), sanitize_filename(response_col))),
	   p_box_a, width = 5, height = 4, dpi = 160)

p_box_b <- tibble(val = fb) %>%
  ggplot(aes(x = "", y = val)) +
  geom_boxplot() +
  labs(title = paste0("boxplot: ", response_col, " (", region_b, ")"),
	   x = NULL, y = response_col)

ggsave(file.path(fig_dir, sprintf("box_%s_%s.png",
	   sanitize_filename(region_b), sanitize_filename(response_col))),
	   p_box_b, width = 5, height = 4, dpi = 160)

# hist and density
p_hist_a <- tibble(val = fa) %>%
  ggplot(aes(x = val)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
  geom_density() +
  labs(title = paste0("histogram + density: ", response_col, " (", region_a, ")"),
	   x = response_col, y = "density")

ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
	   sanitize_filename(region_a), sanitize_filename(response_col))),
	   p_hist_a, width = 6, height = 4, dpi = 160)

p_hist_b <- tibble(val = fb) %>%
  ggplot(aes(x = val)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
  geom_density() +
  labs(title = paste0("histogram + density: ", response_col, " (", region_b, ")"),
	   x = response_col, y = "density")

ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png",
	   sanitize_filename(region_b), sanitize_filename(response_col))),
	   p_hist_b, width = 6, height = 4, dpi = 160)

# qq plot between the two regions (2 sample qq)
png(file.path(fig_dir, sprintf("qq_%s_%s_vs_%s.png",
	sanitize_filename(response_col), sanitize_filename(region_a), sanitize_filename(region_b))),
	width = 700, height = 500, res = 160)

n <- min(sum(is.finite(fa)), sum(is.finite(fb)))
q <- seq(0.01, 0.99, length.out = max(10, n))
xq <- quantile(fa, q, na.rm = TRUE); yq <- quantile(fb, q, na.rm = TRUE)

plot(sort(xq), sort(yq), pch = 19, cex = 0.6,
	 xlab = paste0("quantiles: ", region_a),
	 ylab = paste0("quantiles: ", region_b),
	 main = paste("qq plot:", region_a, "vs", region_b))

abline(0, 1)
dev.off()