added lab 4
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
# TODO: organize this file better bc I just kinda dumped everything in here
|
||||
|
||||
|
||||
suppressPackageStartupMessages({
|
||||
pkgs <- c("tidyverse", "readr", "readxl", "broom", "jsonlite", "ggplot2", "class", "optparse", "markdown")
|
||||
to_install <- pkgs[!pkgs %in% rownames(installed.packages())]
|
||||
if (length(to_install)) install.packages(to_install, repos = "https://cloud.r-project.org")
|
||||
lapply(pkgs, library, character.only = TRUE)
|
||||
})
|
||||
|
||||
sanitize <- function(x) {
|
||||
gsub("[^A-Za-z0-9_.-]+", "_", x)
|
||||
}
|
||||
|
||||
save_plot <- function(p, path, w = 7, h = 5, dpi = 160) {
|
||||
dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
|
||||
ggplot2::ggsave(path, p, width = w, height = h, dpi = dpi)
|
||||
}
|
||||
|
||||
hist_density_plot <- function(v, lbl) {
|
||||
tibble(x = v) |>
|
||||
ggplot(aes(x = x)) +
|
||||
geom_histogram(aes(y = after_stat(density)), bins = 30, alpha = 0.6) +
|
||||
geom_density() +
|
||||
labs(title = paste("histogram + density:", lbl), x = lbl, y = "density") +
|
||||
theme_minimal()
|
||||
}
|
||||
|
||||
box_plot <- function(v, lbl) {
|
||||
tibble(x = v) |>
|
||||
ggplot(aes(y = x)) +
|
||||
geom_boxplot(width = 0.3) +
|
||||
labs(title = paste("boxplot:", lbl), y = lbl, x = NULL) +
|
||||
theme_minimal()
|
||||
}
|
||||
|
||||
qq_two_sample <- function(a, b, title = "qq plot", n_q = NULL) {
|
||||
a <- a[is.finite(a)]
|
||||
b <- b[is.finite(b)]
|
||||
n <- min(length(a), length(b))
|
||||
|
||||
if (is.null(n_q)) n_q <- max(10, floor(n))
|
||||
if (n_q < 10) return(NULL)
|
||||
|
||||
probs <- seq(0.01, 0.99, length.out = n_q)
|
||||
qa <- quantile(a, probs, na.rm = TRUE, names = FALSE)
|
||||
qb <- quantile(b, probs, na.rm = TRUE, names = FALSE)
|
||||
d <- tibble(x = sort(qa), y = sort(qb))
|
||||
|
||||
ggplot(d, aes(x, y)) +
|
||||
geom_point(size = 1.6) +
|
||||
geom_abline(slope = 1, intercept = 0) +
|
||||
labs(title = title, x = "region a quantiles", y = "region b quantiles") +
|
||||
theme_minimal()
|
||||
}
|
||||
|
||||
tf_pos <- function(s) {
|
||||
s <- as.numeric(s)
|
||||
if (all(s[is.finite(s)] > 0)) log1p(s) else s
|
||||
}
|
||||
|
||||
strat_split <- function(d, label_col, test_prop = 0.25) {
|
||||
d <- d |> tidyr::drop_na({{label_col}})
|
||||
idx_tr <- integer(0)
|
||||
idx_te <- integer(0)
|
||||
|
||||
for (lev in unique(d[[label_col]])) {
|
||||
rows <- which(d[[label_col]] == lev)
|
||||
nte <- max(1, floor(length(rows) * test_prop))
|
||||
|
||||
te <- sample(rows, nte)
|
||||
tr <- setdiff(rows, te)
|
||||
|
||||
idx_tr <- c(idx_tr, tr)
|
||||
idx_te <- c(idx_te, te)
|
||||
}
|
||||
|
||||
list(train = d[idx_tr, , drop = FALSE], test = d[idx_te, , drop = FALSE])
|
||||
}
|
||||
|
||||
run_knn <- function(d, label_col, vars, k, tag, fig_dir) {
|
||||
use <- d |> dplyr::select(all_of(c(label_col, vars))) |> tidyr::drop_na()
|
||||
split <- strat_split(use, label_col, 0.25)
|
||||
tr <- split$train
|
||||
te <- split$test
|
||||
|
||||
mu <- sapply(tr[, vars, drop = FALSE], mean, na.rm = TRUE)
|
||||
sdv <- sapply(tr[, vars, drop = FALSE], sd, na.rm = TRUE)
|
||||
sdv[sdv == 0] <- 1
|
||||
|
||||
trX <- scale(as.matrix(tr[, vars, drop = FALSE]), center = mu, scale = sdv)
|
||||
teX <- scale(as.matrix(te[, vars, drop = FALSE]), center = mu, scale = sdv)
|
||||
|
||||
trY <- factor(tr[[label_col]])
|
||||
teY <- factor(te[[label_col]], levels = levels(trY))
|
||||
|
||||
pred <- class::knn(train = trX, test = teX, cl = trY, k = k)
|
||||
acc <- mean(pred == teY, na.rm = TRUE)
|
||||
|
||||
cm <- table(truth = teY, pred = pred)
|
||||
cm_df <- as.data.frame(cm)
|
||||
cm_fig <- file.path(fig_dir, paste0("knn_confusion_", sanitize(tag), ".png"))
|
||||
|
||||
p_cm <- ggplot(cm_df, aes(x = pred, y = truth, fill = Freq)) +
|
||||
geom_tile() + geom_text(aes(label = Freq)) +
|
||||
labs(
|
||||
title = paste0("confusion matrix (k=", k, ") – ", tag),
|
||||
x = "predicted", y = "true"
|
||||
) + theme_minimal() +
|
||||
theme(
|
||||
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
|
||||
)
|
||||
|
||||
save_plot(p_cm, cm_fig, w = 6, h = 5)
|
||||
|
||||
list(tag = tag,
|
||||
k = k,
|
||||
vars = vars,
|
||||
accuracy = unname(acc),
|
||||
confusion_fig = cm_fig,
|
||||
n_test = nrow(te))
|
||||
}
|
||||
@@ -0,0 +1,121 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
|
||||
# TODO: hard-code me
|
||||
|
||||
# NOTE: The options were generated by chatGPT from my horrendous hard-coded options
|
||||
option_list <- list(
|
||||
optparse::make_option("--data", type = "character", default = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/epi_results_2024_pop_gdp_v2.csv"),
|
||||
optparse::make_option("--region-col", type = "character", default = NA),
|
||||
optparse::make_option("--region-a", type = "character", default = NA),
|
||||
optparse::make_option("--region-b", type = "character", default = NA),
|
||||
optparse::make_option("--response", type = "character", default = NA),
|
||||
optparse::make_option("--predictors", type = "character", default = NA),
|
||||
optparse::make_option("--knn1", type = "character", default = NA),
|
||||
optparse::make_option("--knn2", type = "character", default = NA),
|
||||
optparse::make_option("--k", type = "integer", default = 5)
|
||||
)
|
||||
opt <- optparse::parse_args(optparse::OptionParser(option_list = option_list))
|
||||
|
||||
if (is.na(opt$data)) stop("--data is required")
|
||||
|
||||
read_any <- function(p) {
|
||||
ext <- tolower(tools::file_ext(p))
|
||||
if (ext %in% c("csv", "txt")) {
|
||||
suppressMessages(readr::read_csv(p, show_col_types = FALSE))
|
||||
} else if (ext %in% c("xls", "xlsx")) {
|
||||
readxl::read_excel(p)
|
||||
} else {
|
||||
stop("unsupported extension: ", ext)
|
||||
}
|
||||
}
|
||||
df <- read_any(opt$data)
|
||||
|
||||
nms <- names(df)
|
||||
|
||||
find_col <- function(nms, pats) {
|
||||
for (pat in pats) {
|
||||
idx <- which(stringr::str_detect(tolower(nms), pat))
|
||||
if (length(idx)) return(nms[idx[1]])
|
||||
}
|
||||
|
||||
# I hate it here
|
||||
NA_character_
|
||||
}
|
||||
|
||||
region_col <- if (!is.na(opt$`region-col`)) opt$`region-col` else
|
||||
find_col(nms, c("^region$", "regions?$", "world\\s*bank\\s*region"))
|
||||
|
||||
if (is.na(region_col)) stop("could not detect a region column; pass --region-col")
|
||||
|
||||
response <- if (!is.na(opt$response)) opt$response else if ("EPI.new" %in% nms) {
|
||||
"EPI.new"
|
||||
} else {
|
||||
find_col(nms, c("^epi", "epi.*score", "index$", "score$"))
|
||||
}
|
||||
|
||||
if (is.na(response)) {
|
||||
num <- df |> dplyr::select(where(is.numeric)) |> names()
|
||||
if (!length(num)) stop("no numeric columns; pass --response")
|
||||
response <- num[1]
|
||||
}
|
||||
|
||||
gdp_col <- find_col(nms, c("^gdp", "gdp.*per.*cap", "gdppc"))
|
||||
pop_col <- find_col(nms, c("^pop", "^population$"))
|
||||
|
||||
counts <- sort(table(df[[region_col]]), decreasing = TRUE)
|
||||
region_a <- if (!is.na(opt$`region-a`)) opt$`region-a` else
|
||||
if ("Sub-Saharan Africa" %in% names(counts)) "Sub-Saharan Africa" else names(counts)[1]
|
||||
|
||||
region_b <- if (!is.na(opt$`region-b`)) opt$`region-b` else
|
||||
if ("Latin America & Caribbean" %in% names(counts)) "Latin America & Caribbean" else names(counts)[2]
|
||||
|
||||
pred_sets <- list()
|
||||
if (!is.na(opt$predictors)) {
|
||||
pred_sets <- list(strsplit(opt$predictors, ",", fixed = TRUE)[[1]] |> trimws())
|
||||
} else {
|
||||
plist <- c()
|
||||
if (!is.na(gdp_col)) plist <- c(plist, gdp_col)
|
||||
if (!is.na(pop_col)) plist <- c(plist, pop_col)
|
||||
if (length(plist) >= 1) pred_sets <- append(pred_sets, list(plist[1]))
|
||||
if (length(plist) >= 2) pred_sets <- append(pred_sets, list(plist[1:2]))
|
||||
}
|
||||
|
||||
pred_sets <- pred_sets[lengths(pred_sets) > 0]
|
||||
|
||||
choose_knn_vars <- function(df, exclude, k = 3) {
|
||||
cands <- names(df)[endsWith(names(df), ".new") & names(df) != exclude]
|
||||
cands <- cands[sapply(cands, function(c) is.numeric(df[[c]]))]
|
||||
miss <- sapply(cands, function(c) mean(is.na(df[[c]])))
|
||||
ord <- order(miss, cands)
|
||||
head(cands[ord], k)
|
||||
}
|
||||
|
||||
knn1 <- if (!is.na(opt$knn1)) {
|
||||
strsplit(opt$knn1, ",", fixed = TRUE)[[1]] |> trimws()
|
||||
} else {
|
||||
choose_knn_vars(df, response, 3)
|
||||
}
|
||||
|
||||
knn2 <- if (!is.na(opt$knn2)) {
|
||||
strsplit(opt$knn2, ",", fixed = TRUE)[[1]] |> trimws()
|
||||
} else {
|
||||
setdiff(choose_knn_vars(df, response, 6), knn1)[1:3]
|
||||
}
|
||||
|
||||
ctx <- list(
|
||||
data = normalizePath(opt$data),
|
||||
region_col = region_col,
|
||||
response = response,
|
||||
region_a = region_a,
|
||||
region_b = region_b,
|
||||
predictors = pred_sets,
|
||||
knn1 = knn1,
|
||||
knn2 = knn2,
|
||||
k = opt$k,
|
||||
fig_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/figures",
|
||||
stats_dir = "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/stats"
|
||||
)
|
||||
|
||||
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("wrote ctx.json")
|
||||
@@ -0,0 +1,27 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
|
||||
|
||||
va <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
|
||||
vb <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_b) |> dplyr::pull(all_of(ctx$response)) |> as.numeric()
|
||||
|
||||
box_a_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
|
||||
box_b_path <- file.path(ctx$fig_dir, paste0("box_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
|
||||
hist_a_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_a), "_", sanitize(ctx$response), ".png"))
|
||||
hist_b_path <- file.path(ctx$fig_dir, paste0("hist_", sanitize(ctx$region_b), "_", sanitize(ctx$response), ".png"))
|
||||
qq_path <- file.path(ctx$fig_dir, paste0("qq_", sanitize(ctx$response), "_", sanitize(ctx$region_a), "_vs_", sanitize(ctx$region_b), ".png"))
|
||||
|
||||
save_plot(box_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), box_a_path)
|
||||
save_plot(box_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), box_b_path)
|
||||
save_plot(hist_density_plot(va, paste0(ctx$response, " (", ctx$region_a, ")")), hist_a_path)
|
||||
save_plot(hist_density_plot(vb, paste0(ctx$response, " (", ctx$region_b, ")")), hist_b_path)
|
||||
qp <- qq_two_sample(va, vb, paste0("qq plot: ", ctx$region_a, " vs ", ctx$region_b))
|
||||
if (!is.null(qp)) save_plot(qp, qq_path)
|
||||
|
||||
ctx$box_a <- box_a_path
|
||||
ctx$box_b <- box_b_path
|
||||
ctx$hist_a <- hist_a_path
|
||||
ctx$hist_b <- hist_b_path
|
||||
ctx$qq_fig <- qq_path
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("plots done")
|
||||
@@ -0,0 +1,59 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
|
||||
|
||||
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
|
||||
d <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
|
||||
for (xc in x_cols) d[[xc]] <- tf_pos(d[[xc]])
|
||||
f <- as.formula(paste(y_col, "~", paste(x_cols, collapse = " + ")))
|
||||
m <- lm(f, data = d)
|
||||
|
||||
# residuals vs fitted
|
||||
res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
|
||||
p_res <- tibble(fitted = fitted(m), resid = resid(m)) |>
|
||||
ggplot(aes(fitted, resid)) +
|
||||
geom_point(size = 1.6) +
|
||||
geom_hline(yintercept = 0) +
|
||||
labs(title = paste("residuals vs fitted:", name_tag),
|
||||
x = "fitted", y = "residuals") +
|
||||
theme_minimal()
|
||||
save_plot(p_res, res_path)
|
||||
|
||||
# scatter first predictor vs response
|
||||
first <- x_cols[1]
|
||||
sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
|
||||
p_sc <- d |>
|
||||
ggplot(aes(.data[[first]], .data[[y_col]])) +
|
||||
geom_point(size = 1.6) +
|
||||
labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
|
||||
theme_minimal()
|
||||
save_plot(p_sc, sc_path)
|
||||
|
||||
# summary to text
|
||||
summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
|
||||
capture.output(summary(m), file = summ_path)
|
||||
|
||||
gl <- broom::glance(m)
|
||||
list(
|
||||
name = name_tag,
|
||||
rsq = unname(gl$r.squared),
|
||||
aic = unname(gl$AIC),
|
||||
bic = unname(gl$BIC),
|
||||
nobs = stats::nobs(m),
|
||||
summary_file = summ_path,
|
||||
residuals_fig = res_path,
|
||||
scatter_fig = sc_path
|
||||
)
|
||||
}
|
||||
|
||||
ols <- list()
|
||||
if (length(ctx$predictors)) {
|
||||
for (p in ctx$predictors) {
|
||||
tag <- paste0("full: ", ctx$response, " ~ ", paste(p, collapse = " + "))
|
||||
ols <- append(ols, list(fit_ols(df, ctx$response, p, tag, ctx$fig_dir, ctx$stats_dir)))
|
||||
}
|
||||
}
|
||||
|
||||
ctx$ols <- ols
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("ols (full) done")
|
||||
@@ -0,0 +1,72 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
|
||||
reg_df <- df |> dplyr::filter(.data[[ctx$region_col]] == ctx$region_a)
|
||||
|
||||
fit_ols <- function(data, y_col, x_cols, name_tag, fig_dir, stats_dir) {
|
||||
d <- data |> dplyr::select(all_of(c(y_col, x_cols))) |> tidyr::drop_na()
|
||||
for (xc in x_cols) d[[xc]] <- tf_pos(d[[xc]])
|
||||
f <- as.formula(paste(y_col, "~", paste(x_cols, collapse = " + ")))
|
||||
m <- lm(f, data = d)
|
||||
|
||||
res_path <- file.path(fig_dir, paste0("residuals_", sanitize(name_tag), ".png"))
|
||||
p_res <- tibble(fitted = fitted(m), resid = resid(m)) |>
|
||||
ggplot(aes(fitted, resid)) +
|
||||
geom_point(size = 1.6) +
|
||||
geom_hline(yintercept = 0) +
|
||||
labs(title = paste("residuals vs fitted:", name_tag),
|
||||
x = "fitted", y = "residuals") +
|
||||
theme_minimal()
|
||||
save_plot(p_res, res_path)
|
||||
|
||||
first <- x_cols[1]
|
||||
sc_path <- file.path(fig_dir, paste0("scatter_", sanitize(name_tag), "_", sanitize(first), ".png"))
|
||||
p_sc <- d |>
|
||||
ggplot(aes(.data[[first]], .data[[y_col]])) +
|
||||
geom_point(size = 1.6) +
|
||||
labs(title = paste(first, "vs", y_col), x = first, y = y_col) +
|
||||
theme_minimal()
|
||||
save_plot(p_sc, sc_path)
|
||||
|
||||
summ_path <- file.path(stats_dir, paste0("ols_", sanitize(name_tag), ".txt"))
|
||||
capture.output(summary(m), file = summ_path)
|
||||
|
||||
gl <- broom::glance(m)
|
||||
list(
|
||||
name = name_tag,
|
||||
rsq = unname(gl$r.squared),
|
||||
aic = unname(gl$AIC),
|
||||
bic = unname(gl$BIC),
|
||||
nobs = stats::nobs(m),
|
||||
summary_file = summ_path,
|
||||
residuals_fig = res_path,
|
||||
scatter_fig = sc_path
|
||||
)
|
||||
}
|
||||
|
||||
region_models <- list()
|
||||
if (length(ctx$predictors)) {
|
||||
for (p in ctx$predictors) {
|
||||
tag <- paste0("region ", ctx$region_a, ": ", ctx$response, " ~ ", paste(p, collapse = " + "))
|
||||
region_models <- append(region_models, list(fit_ols(reg_df, ctx$response, p, tag, ctx$fig_dir, ctx$stats_dir)))
|
||||
}
|
||||
}
|
||||
|
||||
best_note <- "no region-level comparison available."
|
||||
if (length(region_models) >= 1) {
|
||||
ord <- order(
|
||||
sapply(region_models, `[[`, "rsq"),
|
||||
-sapply(region_models, `[[`, "aic"),
|
||||
-sapply(region_models, `[[`, "bic"),
|
||||
decreasing = TRUE
|
||||
)
|
||||
best <- region_models[[ord[1]]]
|
||||
best_note <- sprintf(
|
||||
"on region `%s`, the better model is **%s** (r²=%.3f, aic=%.1f, bic=%.1f).",
|
||||
ctx$region_a, best$name, best$rsq, best$aic, best$bic
|
||||
)
|
||||
}
|
||||
|
||||
ctx$best_region_note <- best_note
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("ols (region) done")
|
||||
@@ -0,0 +1,11 @@
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
df <- suppressMessages(readr::read_csv(ctx$data, show_col_types = FALSE))
|
||||
|
||||
set.seed(42)
|
||||
knn1 <- run_knn(df, ctx$region_col, ctx$knn1, ctx$k, "model A", ctx$fig_dir)
|
||||
knn2 <- run_knn(df, ctx$region_col, ctx$knn2, ctx$k, "model B", ctx$fig_dir)
|
||||
|
||||
ctx$knn <- list(knn1, knn2)
|
||||
writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE), "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
message("knn done")
|
||||
@@ -0,0 +1,89 @@
|
||||
library(markdown)
|
||||
|
||||
source("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/R/00_utils.R")
|
||||
ctx <- jsonlite::fromJSON("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/ctx.json")
|
||||
|
||||
md <- c(
|
||||
"# exploratory data analysis and models on the epi dataset",
|
||||
paste0("date: ", as.character(Sys.Date()), " "),
|
||||
"",
|
||||
"## dataset and choices",
|
||||
# I regret my life choices
|
||||
paste0("- **file**: `", basename(ctx$data), "`"),
|
||||
paste0("- **region column**: `", ctx$region_col, "`"),
|
||||
paste0("- **response var**: `", ctx$response, "`"),
|
||||
paste0("- **regions**: `", ctx$region_a, "` vs `", ctx$region_b, "`"),
|
||||
"",
|
||||
"## 1) variable distributions",
|
||||
"### 1.1 boxplots and histograms (with density!)",
|
||||
paste0(""),
|
||||
paste0(""),
|
||||
paste0(""),
|
||||
paste0(""),
|
||||
"",
|
||||
"### 1.2 qq plot (two-sample)",
|
||||
paste0(""),
|
||||
"",
|
||||
"## 2) linear models"
|
||||
)
|
||||
|
||||
# Normalize data.frame because NOTHING WORKS
|
||||
row_list <- function(x) {
|
||||
if (is.null(x)) return(list())
|
||||
if (is.data.frame(x)) {
|
||||
lapply(seq_len(nrow(x)), function(i) as.list(x[i, , drop = FALSE]))
|
||||
} else if (is.list(x)) {
|
||||
x
|
||||
} else {
|
||||
list()
|
||||
}
|
||||
}
|
||||
|
||||
if (!is.null(ctx$ols) && length(ctx$ols)) {
|
||||
for (m in row_list(ctx$ols)) {
|
||||
md <- c(md,
|
||||
paste0("### ", m$name),
|
||||
sprintf("- **r²**: %.4f | **aic**: %.2f | **bic**: %.2f", m$r2, m$aic, m$bic),
|
||||
""
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
md <- c(md,
|
||||
"### 2.2 same models on one region (comparison)",
|
||||
if (!is.null(ctx$best_region_note)) ctx$best_region_note else "no note available.",
|
||||
"",
|
||||
"## 3) classification (knn, label = region)"
|
||||
)
|
||||
|
||||
if (!is.null(ctx$knn) && length(ctx$knn)) {
|
||||
for (k in row_list(ctx$knn)) {
|
||||
md <- c(md,
|
||||
paste0("### ", k$tag),
|
||||
sprintf("- **k**: %d | **accuracy**: %.4f | **test n**: %d",
|
||||
k$k, k$accuracy, k$n_test),
|
||||
paste0("variables: `", paste(k$vars, collapse = ", "), "`"),
|
||||
paste0(""),
|
||||
""
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
# I hate markdown sometimes man
|
||||
md <- gsub("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/", "", md)
|
||||
|
||||
# writeLines(md, "/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/output/report.md")
|
||||
# writeLines(jsonlite::toJSON(ctx, pretty = TRUE, auto_unbox = TRUE),
|
||||
# file.path(ctx$stats_dir, "summary.json"))
|
||||
|
||||
md_file <- "output/report.md"
|
||||
html_file <- "output/report.html"
|
||||
pdf_file <- "output/report.pdf"
|
||||
|
||||
setwd("/home/ion606/Desktop/Homework/Data Analytics/Assignments/Assignment II/")
|
||||
markdownToHTML(
|
||||
md_file,
|
||||
html_file
|
||||
)
|
||||
|
||||
message("done")
|
||||
Reference in New Issue
Block a user