This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignment IV/analysis.r
T
2025-11-22 15:25:17 -05:00

684 lines
22 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# install.packages(
#   c("tidyverse", "lubridate", "broom", "forecast", "stringr", "dplyr"),
#   repos = "http://cran.us.r-project.org"
# )
library(tidyverse)
library(lubridate)
library(broom)
library(forecast)
library(stringr)
library(dplyr)

# output directories: data files and saved plots (adjust if desired);
# created on first run
data_dir <- "data"
imgs_dir <- "imgs"
for (out_dir in c(data_dir, imgs_dir)) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
}

# constants: key event dates related to chatgpt and so policy
# chatgpt public research preview launch (openai "introducing chatgpt" blog)
chatgpt_launch_date <- as.Date("2022-11-30")
# stack overflow generative ai ban policy (meta so, 5 dec 2022)
so_ai_policy_date <- as.Date("2022-12-05")
# moderation strike on stack exchange (june-aug 2023), from meta posts
so_mod_strike_start <- as.Date("2023-06-05")
so_mod_strike_end <- as.Date("2023-08-07")
# helper: safe downloader.
# downloads `url` to `destfile` only when the file is not already present.
# bug fix vs the original: if download.file() fails partway, the partial
# file is removed, so a later run does not mistake a corrupt fragment for
# a complete download.
download_if_missing <- function(url, destfile) {
  if (file.exists(destfile)) {
    message("file already exists: ", destfile)
    return(invisible(destfile))
  }
  message("downloading ", basename(destfile), " ...")
  tryCatch(
    {
      download.file(url, destfile, mode = "wb")
      message("saved to ", destfile)
    },
    error = function(e) {
      # clean up a partially-written file before re-raising
      if (file.exists(destfile)) {
        unlink(destfile)
      }
      stop("download failed for ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
  invisible(destfile)
}
# coerce a "month" column of unknown type to Date.
# handles Date, POSIXct/POSIXlt, numeric (days since the unix epoch), and
# character input (tries datetime parsing, then date-only, then default
# as.Date parsing). bug fix vs the original: character values parsed by
# lubridate::ymd_hms() were returned as POSIXct instead of Date; the
# result is now always a Date vector.
coerce_month_to_date <- function(x) {
  if (inherits(x, "Date")) {
    return(x)
  }
  if (inherits(x, "POSIXct")) {
    return(lubridate::as_date(x))
  }
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  if (is.numeric(x)) {
    # numeric is assumed to be days since 1970-01-01
    return(as.Date(x, origin = "1970-01-01"))
  }
  if (is.character(x)) {
    parsed <- suppressWarnings(lubridate::ymd_hms(x))
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(lubridate::ymd(x))
    }
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(as.Date(x))
    }
    # ymd_hms() yields POSIXct; normalise so callers always get Date
    return(lubridate::as_date(parsed))
  }
  # fallback for unexpected types (e.g. factor)
  suppressWarnings(as.Date(x))
}
# 1) load stack overflow monthly answers (dataset 1)
# expected sede export columns: "Month" (or "Date"), "Status", "NewAnswers"
answers_csv_path <- file.path(data_dir, "so_new_answers_per_month_2018_2025.csv")
if (!file.exists(answers_csv_path)) {
  stop(
    "missing ", answers_csv_path,
    "\nrun the sede query in this script and download the csv to that path first."
  )
}
# normalise column names to lower snake case in a single pass.
# (the original script renamed twice — once before and once after the
# mutate — which was redundant and risked re-matching renamed columns.)
answers_raw <- readr::read_csv(answers_csv_path, show_col_types = FALSE) |>
  rename(
    month = matches("^Date$|Month", ignore.case = TRUE),
    status = matches("^Status$", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  )
answers_raw <- answers_raw |>
  mutate(
    month = coerce_month_to_date(month),
    status = tolower(status)
  )
# inspect column names so you can adjust if sede changes them
print(names(answers_raw))
print(head(answers_raw))
# aggregate deleted vs non-deleted into separate columns per month,
# then add calendar and event-window features used by the models below.
# fix vs the original: the per-status sums now use na.rm = TRUE like
# answers_total does (so the three columns stay consistent when counts
# are missing), and avoid if_else()'s strict 0L-vs-double typing by
# summing a logical subset instead.
answers_monthly <- answers_raw |>
  mutate(
    month = as.Date(month),
    status = tolower(status)
  ) |>
  group_by(month) |>
  summarise(
    answers_total = sum(new_answers, na.rm = TRUE),
    answers_non_deleted = sum(new_answers[status == "non-deleted"], na.rm = TRUE),
    answers_deleted = sum(new_answers[status == "deleted"], na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(month) |>
  mutate(
    year = year(month),
    month_num = month(month),
    time_index = row_number(),
    # event flags relative to the dates defined at the top of the script
    post_chatgpt = month >= chatgpt_launch_date,
    post_ai_policy = month >= so_ai_policy_date,
    during_mod_strike = month >= so_mod_strike_start & month <= so_mod_strike_end,
    # two-arm case_when simplified to if_else
    period = if_else(month < chatgpt_launch_date, "pre_chatgpt", "post_chatgpt")
  )
glimpse(answers_monthly)
# 2) download and load stack overflow developer survey 2023/2024 (dataset 2)
# official survey zip files as exposed on survey.stackoverflow.co
# these urls are the same ones behind the "download full data set (csv)" links
# see: https://survey.stackoverflow.co/
survey_2023_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2023.zip"
survey_2023_zip <- file.path(data_dir, "stack-overflow-developer-survey-2023.zip")
survey_2024_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2024.zip"
survey_2024_zip <- file.path(data_dir, "stack-overflow-developer-survey-2024.zip")
# fetch each archive once; subsequent runs reuse the cached zip on disk
download_if_missing(survey_2023_url, survey_2023_zip)
download_if_missing(survey_2024_url, survey_2024_zip)
# helper to read the "survey_results_public.csv" inside each zip.
# reads the csv straight out of the archive via an unz() connection,
# without extracting anything to disk. `csv_pattern` is matched
# case-insensitively against member names, so the csv may live in a
# subfolder; the first match wins.
read_so_survey_from_zip <- function(zip_path, csv_pattern = "survey_results_public.csv") {
  if (!file.exists(zip_path)) {
    stop("zip file not found: ", zip_path)
  }
  # list archive members (works even when the csv is in a subfolder)
  members <- utils::unzip(zip_path, list = TRUE)$Name
  hits <- members[stringr::str_detect(members, regex(csv_pattern, ignore_case = TRUE))]
  if (length(hits) == 0) {
    stop("could not find a csv matching ", csv_pattern, " inside ", zip_path)
  }
  # read the first match without extracting to disk
  readr::read_csv(
    unz(zip_path, hits[[1]]),
    show_col_types = FALSE
    # col_types = cols(.default = col_character()) # uncomment & customize for explicit types
  )
}
survey2023_raw <- read_so_survey_from_zip(survey_2023_zip)
survey2024_raw <- read_so_survey_from_zip(survey_2024_zip)
# look at column names to locate ai + stackoverflow usage questions.
# fixes vs the original: head(..., 80) avoids NA padding when the frame
# has fewer than 80 columns (names(x)[1:80] pads with NA), and print()
# makes the output visible when the script is run via source().
print(head(names(survey2023_raw), 80))
############################################################
# create a harmonised survey subset focusing on:
# - so visit frequency (column like SOVisitFreq)
# - ai tool usage (column like AISelect or SOAI)
############################################################
# return the name of the first column of `df` whose name matches
# `pattern` (case-insensitive regex), or NA_character_ when none match.
find_first_col <- function(df, pattern) {
  rx <- regex(pattern, ignore_case = TRUE)
  hits <- stringr::str_subset(names(df), rx)
  if (length(hits) == 0) NA_character_ else hits[[1]]
}
# fetch column `col_name` from `df`. when the upstream column lookup
# failed (col_name is NA), return one `default` per row instead, so the
# downstream frame-building code keeps working with a constant column.
pull_col_or_default <- function(df, col_name, default = NA_character_) {
  if (!is.na(col_name)) {
    return(df[[col_name]])
  }
  rep(default, nrow(df))
}
# extract an age column as numeric.
# survey age answers are often categorical bands (e.g. "25-34 years old"):
# parse_number() pulls the leading number out of such strings, while
# blanks and "prefer not to say" style answers become NA.
pull_age_numeric <- function(df, col_name) {
  raw <- pull_col_or_default(df, col_name, default = NA_real_)
  if (is.numeric(raw)) {
    return(raw)
  }
  if (is.factor(raw)) {
    raw <- as.character(raw)
  }
  if (!is.character(raw)) {
    # unexpected type: best-effort numeric coercion
    return(suppressWarnings(as.numeric(raw)))
  }
  cleaned <- stringr::str_trim(raw)
  cleaned[cleaned == ""] <- NA_character_
  refused <- stringr::str_detect(cleaned, regex("prefer not to say", ignore_case = TRUE))
  cleaned[refused] <- NA_character_
  # parse_number extracts the leading numeric value (e.g., 25 from "25-34 years old")
  suppressWarnings(readr::parse_number(cleaned))
}
# locate the relevant survey columns by pattern in each year's frame.
# patterns like "^Age$|Age" prefer the exact column name but fall back to
# any column containing the word; find_first_col() returns NA_character_
# when nothing matches, which pull_col_or_default() handles downstream.
main_branch_col_2023 <- find_first_col(survey2023_raw, "^MainBranch$|MainBranch")
country_col_2023 <- find_first_col(survey2023_raw, "^Country$|Country")
age_col_2023 <- find_first_col(survey2023_raw, "^Age$|Age")
gender_col_2023 <- find_first_col(survey2023_raw, "^Gender$|Gender")
so_visit_col_2023 <- find_first_col(survey2023_raw, "SOVisitFreq")
ai_select_col_2023 <- find_first_col(survey2023_raw, "AISelect|SOAI")
# same lookups against the 2024 schema
main_branch_col_2024 <- find_first_col(survey2024_raw, "^MainBranch$|MainBranch")
country_col_2024 <- find_first_col(survey2024_raw, "^Country$|Country")
age_col_2024 <- find_first_col(survey2024_raw, "^Age$|Age")
gender_col_2024 <- find_first_col(survey2024_raw, "^Gender$|Gender")
so_visit_col_2024 <- find_first_col(survey2024_raw, "SOVisitFreq")
ai_select_col_2024 <- find_first_col(survey2024_raw, "AISelect|SOAI")
# report which columns were found (NA here means the pattern matched nothing)
message("2023 so visit col: ", so_visit_col_2023)
message("2023 ai col : ", ai_select_col_2023)
message("2024 so visit col: ", so_visit_col_2024)
message("2024 ai col : ", ai_select_col_2024)
# build a clean survey frame per year.
# the 2023 and 2024 schemas are near-identical, so one helper harmonises
# both (previously the same transmute() was duplicated verbatim).
# returns a tibble with one row per respondent and a fixed column set.
harmonise_survey <- function(df, survey_year, main_branch_col, country_col,
                             age_col, gender_col, so_visit_col, ai_col) {
  tibble(
    year = survey_year,
    main_branch = pull_col_or_default(df, main_branch_col),
    country = pull_col_or_default(df, country_col),
    age = pull_age_numeric(df, age_col),
    gender = pull_col_or_default(df, gender_col),
    so_visit = pull_col_or_default(df, so_visit_col),
    ai_select = pull_col_or_default(df, ai_col)
  )
}
survey2023 <- harmonise_survey(
  survey2023_raw, 2023L,
  main_branch_col_2023, country_col_2023, age_col_2023,
  gender_col_2023, so_visit_col_2023, ai_select_col_2023
)
survey2024 <- harmonise_survey(
  survey2024_raw, 2024L,
  main_branch_col_2024, country_col_2024, age_col_2024,
  gender_col_2024, so_visit_col_2024, ai_select_col_2024
)
survey_all <- bind_rows(survey2023, survey2024)
# engineer modelling features:
# - frequent_so: binary flag, visits so daily or multiple times per day
# - uses_chatgpt: binary flag, chatgpt appears in the ai_select multi-select
# - age_group: coarser age bands for modelling
# simplification vs the original: the follow-up filter(!is.na(frequent_so))
# was dead code (the case_when TRUE arm makes NA impossible), and the
# second mutate() re-coerced columns that were already integer; both the
# redundant filter and the duplicate coercions are folded into one pass.
survey_model <- survey_all |>
  filter(!is.na(so_visit)) |>
  mutate(
    so_visit = as.character(so_visit),
    ai_select = as.character(ai_select),
    # frequent so visitor: daily or multiple times per day etc.
    frequent_so = dplyr::case_when(
      stringr::str_detect(so_visit, regex("multiple times per day", ignore_case = TRUE)) ~ 1L,
      stringr::str_detect(so_visit, regex("daily|almost every day", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    # treat missing ai_select as "does not use chatgpt"
    uses_chatgpt = dplyr::case_when(
      is.na(ai_select) ~ 0L,
      stringr::str_detect(ai_select, regex("chatgpt", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    age_group = factor(dplyr::case_when(
      !is.na(age) & age < 25 ~ "<25",
      !is.na(age) & age >= 25 & age < 35 ~ "25-34",
      !is.na(age) & age >= 35 & age < 45 ~ "35-44",
      !is.na(age) & age >= 45 ~ "45+",
      TRUE ~ "unknown"
    )),
    gender = factor(if_else(is.na(gender) | gender == "", "Unknown", gender)),
    year = factor(year)
  )
glimpse(survey_model)
# SECTION 2: data description + preliminary plots (dataset 1)
# basic time series plot of answers over time (for section 2);
# dashed vertical line = chatgpt launch, dotted = so generative-ai ban policy
p_answers_ts <- ggplot(answers_monthly, aes(x = month, y = answers_total)) +
  geom_line() +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly new answers on stack overflow",
    x = "month",
    y = "number of answers"
  )
print(p_answers_ts)
# save at report resolution
ggsave(
  filename = file.path(imgs_dir, "01_answers_ts.png"),
  plot = p_answers_ts,
  width = 10, height = 6, units = "in", dpi = 300
)
# boxplot of monthly answer counts, pre vs post chatgpt launch
# (period is assigned per month in answers_monthly)
p_box_pre_post <- ggplot(answers_monthly, aes(x = period, y = answers_total)) +
  geom_boxplot() +
  labs(
    title = "distribution of monthly answers: pre vs post chatgpt launch",
    x = "period",
    y = "monthly answers"
  )
print(p_box_pre_post)
ggsave(file.path(imgs_dir, "02_box_pre_post.png"), plot = p_box_pre_post, width = 8, height = 6, units = "in", dpi = 300)
# basic summary table: descriptive statistics of monthly answer counts
# per period (n months, mean/median/sd/min/max) for the report
answers_summary_period <- answers_monthly |>
  group_by(period) |>
  summarise(
    n_months = n(),
    mean_answers = mean(answers_total),
    median_answers = median(answers_total),
    sd_answers = sd(answers_total),
    min_answers = min(answers_total),
    max_answers = max(answers_total),
    .groups = "drop"
  )
print(answers_summary_period)
# SECTION 3: exploratory analysis
# seasonal pattern: answers by calendar month across years.
# fix vs the original: levels = 1:12 is supplied explicitly —
# factor(x, labels = month.abb) errors whenever the data does not contain
# all 12 calendar months (labels must match the number of levels).
p_seasonal <- answers_monthly |>
  mutate(month_label = factor(month_num, levels = 1:12, labels = month.abb)) |>
  ggplot(aes(x = month_label, y = answers_total, group = year, color = period)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "seasonality of answers by calendar month and year",
    x = "calendar month",
    y = "monthly answers"
  )
print(p_seasonal)
ggsave(file.path(imgs_dir, "03_seasonal.png"), plot = p_seasonal, width = 10, height = 6, units = "in", dpi = 300)
# rolling 3-month moving average to smooth noise.
# fix vs the original: stats::filter() replaces zoo::rollmean(), since
# zoo was never installed or loaded by this script. sides = 1 averages
# the current and previous two months — the same right-aligned window,
# with NA for the first two months.
answers_monthly <- answers_monthly |>
  arrange(month) |>
  mutate(
    answers_ma3 = as.numeric(stats::filter(answers_total, rep(1 / 3, 3), sides = 1))
  )
p_ma3 <- ggplot(answers_monthly, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = answers_ma3)) +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "monthly answers with 3-month moving average",
    x = "month",
    y = "answers"
  )
print(p_ma3)
ggsave(file.path(imgs_dir, "04_ma3.png"), plot = p_ma3, width = 10, height = 6, units = "in", dpi = 300)
# simple percentage change around chatgpt launch
pre_window <- answers_monthly |>
filter(
month >= chatgpt_launch_date - months(6),
month < chatgpt_launch_date
)
post_window <- answers_monthly |>
filter(
month >= chatgpt_launch_date,
month < chatgpt_launch_date + months(6)
)
pre_mean <- mean(pre_window$answers_total)
post_mean <- mean(post_window$answers_total)
pct_change <- (post_mean - pre_mean) / pre_mean * 100
pct_change
# survey exploratory: relation between ai usage and so visit frequency.
# proportions are computed within each (year, chatgpt-usage) segment, so
# the stacked bars show the visit-frequency mix per segment.
survey_counts <- survey_model |>
  mutate(
    uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt"),
    freq_label = if_else(frequent_so == 1L, "visits so daily", "visits so less often")
  ) |>
  count(year, uses_chatgpt_label, freq_label) |>
  group_by(year, uses_chatgpt_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup()
p_survey_bar <- ggplot(survey_counts, aes(x = uses_chatgpt_label, y = prop, fill = freq_label)) +
  geom_col(position = "fill") +
  facet_wrap(~year) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "relationship between chatgpt use and stack overflow visit frequency (survey)",
    x = "ai usage segment",
    y = "share of respondents",
    fill = "so visit frequency"
  )
print(p_survey_bar)
ggsave(file.path(imgs_dir, "08_survey_bar.png"), plot = p_survey_bar, width = 10, height = 6, units = "in", dpi = 300)
# SECTION 4: model development (four different model types)
# MODEL 1: interrupted time series (ITS) linear regression
# outcome: monthly answers_total
# predictors:
# - time: overall linear trend (1, 2, ...)
# - post_chatgpt: level (intercept) shift at the launch month
# - chatgpt_time: months elapsed since launch (0 before launch), i.e.
#   the change in slope after the launch
its_data <- answers_monthly |>
  mutate(
    time = time_index,
    # chatgpt_time counts 1, 2, ... from the first post-launch month and
    # stays 0L for every pre-launch month
    chatgpt_time = if_else(month >= chatgpt_launch_date,
      time_index - min(time_index[month >= chatgpt_launch_date]) + 1L,
      0L
    )
  )
model_lm <- lm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data
)
summary(model_lm)
tidy(model_lm)
glance(model_lm)
# predictions and plot: observed series vs the fitted segmented trend
its_data <- its_data |>
  mutate(
    lm_fitted = predict(model_lm)
  )
p_lm_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.4) +
  geom_line(aes(y = lm_fitted), color = "blue") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "interrupted time series regression: observed vs fitted answers",
    x = "month",
    y = "answers"
  )
print(p_lm_fit)
ggsave(file.path(imgs_dir, "05_lm_fit.png"), plot = p_lm_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 2: poisson regression for count data (log link), with the same
# ITS predictors as model 1; exp(coef) is a multiplicative rate effect.
# NOTE(review): monthly answer counts are large and plausibly
# overdispersed; a quasipoisson or negative binomial fit would be a
# worthwhile robustness check — confirm with a dispersion test.
model_pois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = poisson(link = "log")
)
summary(model_pois)
tidy(model_pois, exponentiate = TRUE) # exp(coef) ~ multiplicative effect
# compare predicted counts on the response (count) scale
its_data <- its_data |>
  mutate(
    pois_fitted = predict(model_pois, type = "response")
  )
p_pois_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = pois_fitted), color = "red") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "poisson regression: observed vs predicted monthly answers",
    x = "month",
    y = "answers"
  )
print(p_pois_fit)
ggsave(file.path(imgs_dir, "06_pois_fit.png"), plot = p_pois_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 3: arima counterfactual forecast (pre-chatgpt vs actual).
# fit on the pre-chatgpt months only, forecast the post-launch period,
# and compare against what actually happened.
# construct monthly ts object (frequency = 12)
start_year <- year(min(answers_monthly$month))
start_month <- month(min(answers_monthly$month))
answers_ts <- ts(
  answers_monthly$answers_total,
  start = c(start_year, start_month),
  frequency = 12
)
# train on pre-chatgpt data (up to oct 2022) and forecast forward;
# november 2022 onwards is the holdout
train_end <- c(2022, 10) # october 2022
train_ts <- window(answers_ts, end = train_end)
test_ts <- window(answers_ts, start = c(2022, 11))
arima_fit <- auto.arima(train_ts)
summary(arima_fit)
# forecast horizon = length of the holdout
h <- length(test_ts)
fc <- forecast(arima_fit, h = h)
# compare forecast vs actual on the holdout period.
# NOTE(review): this assumes answers_monthly has exactly one row per
# month with no gaps, so the filtered months line up 1:1 with test_ts —
# verify against the upstream aggregation.
fc_df <- data.frame(
  month = answers_monthly$month[answers_monthly$month >= as.Date("2022-11-01")],
  actual = as.numeric(test_ts),
  forecast = as.numeric(fc$mean),
  lower_80 = as.numeric(fc$lower[, "80%"]),
  upper_80 = as.numeric(fc$upper[, "80%"])
)
p_arima <- ggplot(fc_df, aes(x = month)) +
  geom_line(aes(y = actual), alpha = 0.6) +
  geom_line(aes(y = forecast), linetype = "dashed") +
  geom_ribbon(aes(ymin = lower_80, ymax = upper_80), alpha = 0.2) +
  labs(
    title = "arima forecast (trained on pre-chatgpt) vs actual answers",
    x = "month",
    y = "answers"
  )
print(p_arima)
ggsave(file.path(imgs_dir, "07_arima_forecast.png"), plot = p_arima, width = 10, height = 6, units = "in", dpi = 300)
# simple accuracy metrics (RMSE/MAE/MAPE etc.) on the holdout
fc_accuracy <- accuracy(fc, test_ts)
print(fc_accuracy)
# MODEL 4: logistic regression — does using chatgpt predict being a
# frequent stack overflow visitor?
set.seed(123)
survey_model_complete <- survey_model |>
  filter(!is.na(uses_chatgpt), !is.na(frequent_so))
# keep only predictors with at least two observed levels; glm() fails on
# constant factors. vapply() replaces sapply() for a type-stable result.
candidate_predictors <- c("uses_chatgpt", "age_group", "gender", "year")
valid_predictors <- candidate_predictors[vapply(
  candidate_predictors,
  function(col) dplyr::n_distinct(survey_model_complete[[col]], na.rm = TRUE) > 1,
  logical(1)
)]
drop_predictors <- setdiff(candidate_predictors, valid_predictors)
if (length(drop_predictors) > 0) {
  message("dropping predictors with <2 levels: ", paste(drop_predictors, collapse = ", "))
}
# intercept-only fallback when every candidate predictor is degenerate
logit_formula <- if (length(valid_predictors) == 0) {
  frequent_so ~ 1
} else {
  as.formula(paste("frequent_so ~", paste(valid_predictors, collapse = " + ")))
}
# 80/20 train/test split
n <- nrow(survey_model_complete)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))
survey_train <- survey_model_complete[train_idx, ]
survey_test <- survey_model_complete[-train_idx, ]
# classification threshold = training prevalence of the positive class,
# falling back to 0.5 for degenerate training sets. a plain if/else
# replaces the original scalar dplyr::case_when(): case_when is a
# vectorised tool and is unidiomatic (and slower) for a single value.
positive_rate <- mean(survey_train$frequent_so, na.rm = TRUE)
classification_threshold <- if (
  is.na(positive_rate) || positive_rate <= 0 || positive_rate >= 1
) {
  0.5
} else {
  positive_rate
}
message(
  "classification threshold (training frequent_so share): ",
  round(classification_threshold, 3)
)
logit_model <- glm(
  formula = logit_formula,
  family = binomial(link = "logit"),
  data = survey_train
)
summary(logit_model)
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
# predict on the held-out 20% and evaluate at the chosen threshold
survey_test <- survey_test |>
  mutate(
    pred_prob = predict(logit_model, newdata = survey_test, type = "response"),
    pred_class = if_else(pred_prob >= classification_threshold, 1L, 0L)
  )
# confusion matrix; explicit levels guarantee a full 2x2 table even when
# one class is absent from the test truth or predictions
conf_mat <- table(
  truth = factor(survey_test$frequent_so, levels = c(0, 1)),
  pred = factor(survey_test$pred_class, levels = c(0, 1))
)
conf_mat
tp <- conf_mat["1", "1"]
tn <- conf_mat["0", "0"]
fp <- conf_mat["0", "1"]
fn <- conf_mat["1", "0"]
# fix vs the original: the metric variable is named logit_accuracy, not
# `accuracy`, so it does not mask forecast::accuracy(), which this
# script calls earlier for the arima holdout metrics.
logit_accuracy <- (tp + tn) / sum(conf_mat)
logit_precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
logit_recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
list(
  accuracy = logit_accuracy,
  precision = logit_precision,
  recall = logit_recall
)
# visual: distribution of predicted probabilities on the test set,
# split by whether the respondent reported using chatgpt
p_logit_probs <- survey_test |>
  mutate(uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt")) |>
  ggplot(aes(x = uses_chatgpt_label, y = pred_prob)) +
  geom_boxplot() +
  labs(
    title = "predicted probability of being a frequent so visitor by chatgpt use",
    x = "ai usage segment",
    y = "predicted probability (logistic model)"
  )
print(p_logit_probs)
ggsave(file.path(imgs_dir, "09_logit_probs.png"), plot = p_logit_probs, width = 8, height = 6, units = "in", dpi = 300)
# save key tables (csv) and fitted model objects (rds) to disk so the
# report can reuse them without rerunning the full analysis
write_csv(answers_monthly, file.path(data_dir, "answers_monthly_clean.csv"))
write_csv(answers_summary_period, file.path(data_dir, "answers_summary_period.csv"))
write_csv(survey_counts, file.path(data_dir, "survey_ai_vs_so_visit.csv"))
saveRDS(model_lm, file.path(data_dir, "model_lm_its.rds"))
saveRDS(model_pois, file.path(data_dir, "model_pois.rds"))
saveRDS(arima_fit, file.path(data_dir, "model_arima_prechatgpt.rds"))
saveRDS(logit_model, file.path(data_dir, "model_logit_survey.rds"))