# bleh suppressPackageStartupMessages({ library(tidyverse) library(broom) library(caret) library(class) }); # helpers find_col <- function(df, patterns) { cols <- names(df) for (pat in patterns) { hit <- cols[str_detect(tolower(cols), regex(pat, ignore_case = TRUE))] if (length(hit) > 0) return(hit[[1]]) } return(NULL) } sanitize_filename <- function(x) { gsub("[^A-Za-z0-9_.-]+", "_", x) } pick_regions <- function(df, region_col, a = NULL, b = NULL) { if (!is.null(a) && !is.null(b)) return(c(a, b)) cnt <- df %>% filter(!is.na(.data[[region_col]])) %>% count(.data[[region_col]], sort = TRUE) stopifnot(nrow(cnt) >= 2) c(cnt[[1,1]], cnt[[2,1]]) } save_txt <- function(path, txt) { dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE) writeLines(txt, path) } # args args <- commandArgs(trailingOnly = TRUE) opt <- list( data = NULL, region_col = NULL, region_a = NULL, region_b = NULL, response = NULL, predictors = NULL, knn1 = NULL, knn2 = NULL, k = 5L, render = "none" # "none" | "html" | "pdf" ) parse_flag <- function(flag) { key <- sub("^--", "", flag[[1]]) val <- if (length(flag) > 1) flag[[2]] else TRUE opt[[key]] <<- val } if (length(args)) { kv <- split(args, cumsum(grepl("^--", args))) lapply(kv, parse_flag) } stopifnot(!is.null(opt$data)) # load data message("reading: ", opt$data) df <- if (grepl("\\.csv(\\.gz)?$", opt$data, ignore.case = TRUE)) { suppressMessages(readr::read_csv(opt$data, show_col_types = FALSE)) } else { suppressMessages(readxl::read_excel(opt$data)) } # auto-detect columns region_col <- opt$region_col %||% find_col(df, c("^region$", "regions?$", "world\\s*bank\\s*region")) if (is.null(region_col)) stop("could not detect a 'region' column; use --region-col") response_col <- opt$response %||% find_col(df, c("^epi$", "overall\\s*score$", "index$", "score$")) if (is.null(response_col)) { nums <- df %>% select(where(is.numeric)) %>% names() if (length(nums) == 0) stop("no numeric columns found; set --response explicitly") response_col <- nums[[1]] } gdp_col <- find_col(df, c("^gdp", "gdp.*per.*cap", "gdppc")) pop_col <- find_col(df, c("^pop", "^population$")) # predictors set(s) pred_sets <- list() if (!is.null(opt$predictors)) { pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] %>% trimws()) } else { ps <- c(na.omit(gdp_col), na.omit(pop_col)) if (length(ps) >= 1) pred_sets <- append(pred_sets, list(ps[1])) if (length(ps) >= 2) pred_sets <- append(pred_sets, list(ps[1:2])) } # choose regions regions <- pick_regions(df, region_col, opt$region_a, opt$region_b) region_a <- regions[[1]]; region_b <- regions[[2]] # dirs fig_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures"; dir.create(fig_dir, showWarnings = FALSE) stats_dir <- "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"; dir.create(stats_dir, showWarnings = FALSE) # distributions and qq fa <- df %>% filter(.data[[region_col]] == region_a) %>% pull(all_of(response_col)) %>% as.numeric() fb <- df %>% filter(.data[[region_col]] == region_b) %>% pull(all_of(response_col)) %>% as.numeric() # le plot de box p_box_a <- tibble(val = fa) %>% ggplot(aes(x = "", y = val)) + geom_boxplot() + labs(title = paste0("boxplot: ", response_col, " (", region_a, ")"), x = NULL, y = response_col) ggsave(file.path(fig_dir, sprintf("box_%s_%s.png", sanitize_filename(region_a), sanitize_filename(response_col))), p_box_a, width = 5, height = 4, dpi = 160) p_box_b <- tibble(val = fb) %>% ggplot(aes(x = "", y = val)) + geom_boxplot() + labs(title = paste0("boxplot: ", response_col, " (", region_b, ")"), x = NULL, y = response_col) ggsave(file.path(fig_dir, sprintf("box_%s_%s.png", sanitize_filename(region_b), sanitize_filename(response_col))), p_box_b, width = 5, height = 4, dpi = 160) # hist and density p_hist_a <- tibble(val = fa) %>% ggplot(aes(x = val)) + geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) + geom_density() + labs(title = paste0("histogram + density: ", response_col, " (", region_a, ")"), x = response_col, y = "density") ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png", sanitize_filename(region_a), sanitize_filename(response_col))), p_hist_a, width = 6, height = 4, dpi = 160) p_hist_b <- tibble(val = fb) %>% ggplot(aes(x = val)) + geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) + geom_density() + labs(title = paste0("histogram + density: ", response_col, " (", region_b, ")"), x = response_col, y = "density") ggsave(file.path(fig_dir, sprintf("hist_%s_%s.png", sanitize_filename(region_b), sanitize_filename(response_col))), p_hist_b, width = 6, height = 4, dpi = 160) # qq plot between the two regions (2 sample qq) png(file.path(fig_dir, sprintf("qq_%s_%s_vs_%s.png", sanitize_filename(response_col), sanitize_filename(region_a), sanitize_filename(region_b))), width = 700, height = 500, res = 160) n <- min(sum(is.finite(fa)), sum(is.finite(fb))) q <- seq(0.01, 0.99, length.out = max(10, n)) xq <- quantile(fa, q, na.rm = TRUE); yq <- quantile(fb, q, na.rm = TRUE) plot(sort(xq), sort(yq), pch = 19, cex = 0.6, xlab = paste0("quantiles: ", region_a), ylab = paste0("quantiles: ", region_b), main = paste("qq plot:", region_a, "vs", region_b)) abline(0, 1) dev.off()