# NOTE: the lines below are artifacts of the git web interface this file was
# scraped from (archive banner, file listing, view controls, commit
# timestamp); they are not R code. They are kept for provenance but
# commented out so the script parses.
# This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
# Files
#
# 684 lines
# 22 KiB
# R
# Raw Permalink Normal View History
#
# 2025-11-22 15:25:17 -05:00
# install.packages(
# c("tidyverse", "lubridate", "broom", "forecast", "stringr", "dplyr"),
# repos = "http://cran.us.r-project.org"
# )
library(tidyverse)
library(lubridate)
library(broom)
library(forecast)
library(stringr)
library(dplyr)
# Section: output locations and key event dates ----

# Where derived data files and generated plots are written (adjust if
# desired); both directories are created on demand.
data_dir <- "data"
imgs_dir <- "imgs"
for (output_dir in c(data_dir, imgs_dir)) {
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
}

# Key event dates related to ChatGPT and Stack Overflow policy.
# ChatGPT public research preview launch (OpenAI "introducing chatgpt" blog).
chatgpt_launch_date <- as.Date("2022-11-30")
# Stack Overflow generative-AI ban policy (Meta SO, 5 Dec 2022).
so_ai_policy_date <- as.Date("2022-12-05")
# Moderation strike on Stack Exchange (June-Aug 2023), per Meta posts.
so_mod_strike_start <- as.Date("2023-06-05")
so_mod_strike_end <- as.Date("2023-08-07")
# helper: safe downloader
# Download `url` to `destfile` unless the file already exists.
#
# Fix: a failed download.file() call can leave a zero-byte or truncated
# destfile behind, which a later run would mistake for a completed
# download.  The partial file is now removed before the error is
# re-raised.
#
# url:      character scalar, source URL
# destfile: character scalar, local path to write to
# Returns destfile invisibly.
download_if_missing <- function(url, destfile) {
  if (file.exists(destfile)) {
    message("file already exists: ", destfile)
    return(invisible(destfile))
  }
  message("downloading ", basename(destfile), " ...")
  tryCatch(
    {
      # mode = "wb" keeps binary payloads (zip files) intact on Windows
      download.file(url, destfile, mode = "wb")
      message("saved to ", destfile)
    },
    error = function(e) {
      # drop the partially written file so retries start clean
      if (file.exists(destfile)) {
        unlink(destfile)
      }
      stop("download failed for ", url, ": ", conditionMessage(e), call. = FALSE)
    }
  )
  invisible(destfile)
}
# Coerce a "month" column of unknown type to Date.
#
# Handles Date (returned as-is), POSIXct/POSIXlt, numeric (days since
# 1970-01-01), and character input.  Character values are tried as
# datetimes first, then plain dates, then whatever as.Date() accepts;
# unparseable entries become NA.
#
# Fix: lubridate::ymd_hms() returns POSIXct, so the character branch used
# to return a datetime instead of a Date.  The parsed result is now
# always converted to Date, matching every other branch.
coerce_month_to_date <- function(x) {
  if (inherits(x, "Date")) {
    return(x)
  }
  if (inherits(x, "POSIXct")) {
    return(lubridate::as_date(x))
  }
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  if (is.numeric(x)) {
    return(as.Date(x, origin = "1970-01-01"))
  }
  if (is.character(x)) {
    parsed <- suppressWarnings(lubridate::ymd_hms(x))
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(lubridate::ymd(x))
    }
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(as.Date(x))
    }
    # ymd_hms() yields POSIXct; normalise so callers always get Date
    return(lubridate::as_date(parsed))
  }
  # fallback (e.g. factors): let as.Date() try, NA on failure
  suppressWarnings(as.Date(x))
}
# 1) load stack overflow monthly answers (dataset 1)
# Path to the SEDE query export; the script refuses to run without it.
answers_csv_path <- file.path(data_dir, "so_new_answers_per_month_2018_2025.csv")
if (!file.exists(answers_csv_path)) {
  stop(
    "missing ", answers_csv_path,
    "\nrun the sede query in this script and download the csv to that path first."
  )
}
# Normalise SEDE's column headers to snake_case on read.
# NOTE(review): rename(new = matches(...)) is strict tidyselect -- if a
# pattern matches no column this errors; confirm the export headers
# before changing the patterns.
answers_raw <- readr::read_csv(answers_csv_path, show_col_types = FALSE) |>
  rename(
    month = matches("^Date$|Month", ignore.case = TRUE),
    status = matches("^Status$", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  )
answers_raw <- answers_raw |>
  mutate(
    month = coerce_month_to_date(month),
    status = tolower(status)
  )
# inspect column names so you can adjust if sede changes them
print(names(answers_raw))
# expected columns: "Month", "Status", "NewAnswers"
# normalise to lower snake case just in case
# NOTE(review): the columns were already renamed above, so this second
# rename is effectively a no-op (the case-insensitive patterns match the
# already-lowercase names) -- it is redundant.
answers_raw <- answers_raw |>
  rename(
    month = matches("Month", ignore.case = TRUE),
    status = matches("Status", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  )
print(head(answers_raw))
# aggregate deleted vs non-deleted into separate columns per month,
# then add calendar/event features used by the models below
answers_monthly <- answers_raw |>
  mutate(
    month = as.Date(month),
    status = tolower(status)
  ) |>
  group_by(month) |>
  summarise(
    # Fix: subset-and-sum instead of sum(if_else(cond, new_answers, 0L)).
    # The old form errored under strict if_else() type rules when
    # read_csv parsed counts as double (double vs 0L), and it returned NA
    # for any month containing an NA count -- inconsistent with
    # answers_total, which already used na.rm = TRUE.
    answers_total = sum(new_answers, na.rm = TRUE),
    answers_non_deleted = sum(new_answers[status == "non-deleted"], na.rm = TRUE),
    answers_deleted = sum(new_answers[status == "deleted"], na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(month) |>
  mutate(
    year = year(month),
    month_num = month(month),
    time_index = row_number(),
    # event flags relative to the dates defined at the top of the script
    post_chatgpt = month >= chatgpt_launch_date,
    post_ai_policy = month >= so_ai_policy_date,
    during_mod_strike = month >= so_mod_strike_start & month <= so_mod_strike_end,
    period = case_when(
      month < chatgpt_launch_date ~ "pre_chatgpt",
      TRUE ~ "post_chatgpt"
    )
  )
glimpse(answers_monthly)
# 2) download and load stack overflow developer survey 2023/2024 (dataset 2)
# official survey zip files as exposed on survey.stackoverflow.co
# these urls are the same ones behind the "download full data set (csv)" links
# see: https://survey.stackoverflow.co/
survey_2023_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2023.zip"
survey_2023_zip <- file.path(data_dir, "stack-overflow-developer-survey-2023.zip")
download_if_missing(survey_2023_url, survey_2023_zip)

survey_2024_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2024.zip"
survey_2024_zip <- file.path(data_dir, "stack-overflow-developer-survey-2024.zip")
download_if_missing(survey_2024_url, survey_2024_zip)
# Read "survey_results_public.csv" straight out of a survey zip without
# extracting it to disk.
#
# zip_path:    path to a downloaded survey zip
# csv_pattern: case-insensitive regex identifying the results csv inside
#              the archive (first match wins)
# Returns a tibble with the raw survey responses.
read_so_survey_from_zip <- function(zip_path, csv_pattern = "survey_results_public.csv") {
  if (!file.exists(zip_path)) {
    stop("zip file not found: ", zip_path)
  }
  # listing the archive handles csvs that sit inside a subfolder
  contents <- utils::unzip(zip_path, list = TRUE)
  matched_names <- contents$Name[
    stringr::str_detect(contents$Name, regex(csv_pattern, ignore_case = TRUE))
  ]
  if (length(matched_names) == 0) {
    stop("could not find a csv matching ", csv_pattern, " inside ", zip_path)
  }
  # read the first match through an unz() connection (no extraction);
  # supply col_types here if explicit parsing is ever needed
  readr::read_csv(
    unz(zip_path, matched_names[1]),
    show_col_types = FALSE
  )
}
# Load both survey years' raw responses from the downloaded zips.
survey2023_raw <- read_so_survey_from_zip(survey_2023_zip)
survey2024_raw <- read_so_survey_from_zip(survey_2024_zip)
# look at column names to locate ai + stackoverflow usage questions
# NOTE(review): this bare expression auto-prints when run interactively
# or via Rscript, but is silent under source().
names(survey2023_raw)[1:80]
############################################################
# create a harmonised survey subset focusing on:
# - so visit frequency (column like SOVisitFreq)
# - ai tool usage (column like AISelect or SOAI)
############################################################
# Return the first column name of `df` whose name matches `pattern`
# (case-insensitive regex), or NA_character_ when nothing matches.
find_first_col <- function(df, pattern) {
  hits <- names(df)[stringr::str_detect(names(df), regex(pattern, ignore_case = TRUE))]
  if (length(hits) > 0) {
    return(hits[1])
  }
  NA_character_
}
# Extract column `col_name` from `df`, or a default-filled vector of the
# right length when the column was not found.
#
# Fix: find_first_col() returns NA when nothing matches, but a NULL or
# zero-length col_name made `if (is.na(col_name))` error (zero-length
# condition); those cases now also fall back to the default.
pull_col_or_default <- function(df, col_name, default = NA_character_) {
  if (is.null(col_name) || length(col_name) == 0 || is.na(col_name)) {
    return(rep(default, nrow(df)))
  }
  df[[col_name]]
}
# Parse an age column into numbers, tolerating the survey's banded
# answers (e.g. "25-34 years old" -> 25) and "prefer not to say".
#
# Fix: blanks and refusal answers were blanked via logical-mask
# assignment (`vec[vec == ""] <- NA`, `vec[str_detect(...)] <- NA`).
# Both masks contain NA wherever `vec` is already NA, and R errors on NA
# in a subscripted assignment.  Wrapping the masks in which() drops the
# NAs and makes the assignments safe.
pull_age_numeric <- function(df, col_name) {
  vec <- pull_col_or_default(df, col_name, default = NA_real_)
  if (is.numeric(vec)) {
    return(vec)
  }
  if (is.factor(vec)) {
    vec <- as.character(vec)
  }
  if (is.character(vec)) {
    vec <- stringr::str_trim(vec)
    vec[which(vec == "")] <- NA_character_
    refusal_idx <- which(
      stringr::str_detect(vec, regex("prefer not to say", ignore_case = TRUE))
    )
    vec[refusal_idx] <- NA_character_
    # parse_number extracts the leading numeric value (e.g., 25 from "25-34 years old")
    return(suppressWarnings(readr::parse_number(vec)))
  }
  # non-character, non-numeric (e.g. logical): best-effort coercion
  suppressWarnings(as.numeric(vec))
}
# Locate the relevant question columns in each survey year.  The broad
# alternations (e.g. "^Age$|Age") prefer an exact header but fall back to
# any column containing the word -- at the risk of picking a similarly
# named column, since find_first_col() takes the first match.
main_branch_col_2023 <- find_first_col(survey2023_raw, "^MainBranch$|MainBranch")
country_col_2023 <- find_first_col(survey2023_raw, "^Country$|Country")
age_col_2023 <- find_first_col(survey2023_raw, "^Age$|Age")
gender_col_2023 <- find_first_col(survey2023_raw, "^Gender$|Gender")
so_visit_col_2023 <- find_first_col(survey2023_raw, "SOVisitFreq")
ai_select_col_2023 <- find_first_col(survey2023_raw, "AISelect|SOAI")
main_branch_col_2024 <- find_first_col(survey2024_raw, "^MainBranch$|MainBranch")
country_col_2024 <- find_first_col(survey2024_raw, "^Country$|Country")
age_col_2024 <- find_first_col(survey2024_raw, "^Age$|Age")
gender_col_2024 <- find_first_col(survey2024_raw, "^Gender$|Gender")
so_visit_col_2024 <- find_first_col(survey2024_raw, "SOVisitFreq")
ai_select_col_2024 <- find_first_col(survey2024_raw, "AISelect|SOAI")
# Report which columns were picked so a schema change shows up in the log.
message("2023 so visit col: ", so_visit_col_2023)
message("2023 ai col : ", ai_select_col_2023)
message("2024 so visit col: ", so_visit_col_2024)
message("2024 ai col : ", ai_select_col_2024)
# build a clean survey frame for 2023
# NOTE(review): the pull_* helpers are called on survey2023_raw directly
# rather than on transmute()'s piped columns -- it is the same frame, so
# the result is identical, and the explicit reference is what lets the
# dynamically detected column names work.
survey2023 <- survey2023_raw |>
  transmute(
    year = 2023L,
    main_branch = pull_col_or_default(survey2023_raw, main_branch_col_2023),
    country = pull_col_or_default(survey2023_raw, country_col_2023),
    age = pull_age_numeric(survey2023_raw, age_col_2023),
    gender = pull_col_or_default(survey2023_raw, gender_col_2023),
    so_visit = pull_col_or_default(survey2023_raw, so_visit_col_2023),
    ai_select = pull_col_or_default(survey2023_raw, ai_select_col_2023)
  )
# same idea for 2024 (schema is very similar)
survey2024 <- survey2024_raw |>
  transmute(
    year = 2024L,
    main_branch = pull_col_or_default(survey2024_raw, main_branch_col_2024),
    country = pull_col_or_default(survey2024_raw, country_col_2024),
    age = pull_age_numeric(survey2024_raw, age_col_2024),
    gender = pull_col_or_default(survey2024_raw, gender_col_2024),
    so_visit = pull_col_or_default(survey2024_raw, so_visit_col_2024),
    ai_select = pull_col_or_default(survey2024_raw, ai_select_col_2024)
  )
# stack both years; `year` distinguishes them downstream
survey_all <- bind_rows(survey2023, survey2024)
# engineer features:
# - binary flag: frequent so visitor
# - binary flag: uses chatgpt as ai tool (from ai_select free text / semicolon list)
# - coarser age groups
survey_model <- survey_all |>
  filter(!is.na(so_visit)) |>
  mutate(
    so_visit = as.character(so_visit),
    ai_select = as.character(ai_select),
    # frequent so visitor: daily or multiple times per day etc.
    # NA so_visit rows were filtered above; case_when treats an NA
    # condition as not-matched, so anything else falls to the 0L default
    frequent_so = dplyr::case_when(
      stringr::str_detect(so_visit, regex("multiple times per day", ignore_case = TRUE)) ~ 1L,
      stringr::str_detect(so_visit, regex("daily|almost every day", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    # ai_select is a semicolon-separated multi-select; substring match suffices
    uses_chatgpt = dplyr::case_when(
      is.na(ai_select) ~ 0L,
      stringr::str_detect(ai_select, regex("chatgpt", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    # coarse bands; unparseable ages land in "unknown" rather than NA
    age_group = dplyr::case_when(
      !is.na(age) & age < 25 ~ "<25",
      !is.na(age) & age >= 25 & age < 35 ~ "25-34",
      !is.na(age) & age >= 35 & age < 45 ~ "35-44",
      !is.na(age) & age >= 45 ~ "45+",
      TRUE ~ "unknown"
    ),
    gender = if_else(is.na(gender) | gender == "", "Unknown", gender)
  ) |>
  # frequent_so always receives 0L or 1L from case_when, so this filter
  # is a no-op safeguard
  filter(!is.na(frequent_so)) |>
  mutate(
    frequent_so = as.integer(frequent_so),
    uses_chatgpt = as.integer(uses_chatgpt),
    age_group = factor(age_group),
    gender = factor(gender),
    year = factor(year)
  )
glimpse(survey_model)
# SECTION 2: data description + preliminary plots (dataset 1)
# basic time series plot of answers over time (for section 2)
# dashed line = chatgpt launch, dotted line = so generative-ai ban
p_answers_ts <- ggplot(answers_monthly, aes(x = month, y = answers_total)) +
  geom_line() +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly new answers on stack overflow",
    x = "month",
    y = "number of answers"
  )
print(p_answers_ts)
ggsave(
  filename = file.path(imgs_dir, "01_answers_ts.png"),
  plot = p_answers_ts,
  width = 10, height = 6, units = "in", dpi = 300
)
# boxplot pre vs post chatgpt (period computed in answers_monthly above)
p_box_pre_post <- ggplot(answers_monthly, aes(x = period, y = answers_total)) +
  geom_boxplot() +
  labs(
    title = "distribution of monthly answers: pre vs post chatgpt launch",
    x = "period",
    y = "monthly answers"
  )
print(p_box_pre_post)
ggsave(file.path(imgs_dir, "02_box_pre_post.png"), plot = p_box_pre_post, width = 8, height = 6, units = "in", dpi = 300)
# basic summary table: descriptive statistics per period for the report
answers_summary_period <- answers_monthly |>
  group_by(period) |>
  summarise(
    n_months = n(),
    mean_answers = mean(answers_total),
    median_answers = median(answers_total),
    sd_answers = sd(answers_total),
    min_answers = min(answers_total),
    max_answers = max(answers_total),
    .groups = "drop"
  )
print(answers_summary_period)
# SECTION 3: exploratory analysis
# seasonal pattern: answers by calendar month across years
# NOTE(review): factor(month_num, labels = month.abb) assumes all 12
# calendar months occur in the data; with fewer distinct months the 12
# labels would not match the levels and factor() errors -- confirm the
# series always spans full years.
p_seasonal <- answers_monthly |>
  mutate(month_label = factor(month_num, labels = month.abb)) |>
  ggplot(aes(x = month_label, y = answers_total, group = year, color = period)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "seasonality of answers by calendar month and year",
    x = "calendar month",
    y = "monthly answers"
  )
print(p_seasonal)
ggsave(file.path(imgs_dir, "03_seasonal.png"), plot = p_seasonal, width = 10, height = 6, units = "in", dpi = 300)
# rolling 3-month moving average to smooth noise
# Fix: this previously called zoo::rollmean(), but zoo is neither loaded
# nor listed in the install.packages() hint, so the script failed when
# zoo was absent.  stats::filter() with sides = 1 computes the same
# trailing 3-month mean (NA for the first two months, matching
# align = "right", fill = NA) using base R only.  stats:: must be
# qualified because dplyr masks filter().
answers_monthly <- answers_monthly |>
  arrange(month) |>
  mutate(
    answers_ma3 = as.numeric(stats::filter(answers_total, rep(1 / 3, 3), sides = 1))
  )
# raw series (faint) overlaid with the smoothed 3-month average
p_ma3 <- ggplot(answers_monthly, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = answers_ma3)) +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "monthly answers with 3-month moving average",
    x = "month",
    y = "answers"
  )
print(p_ma3)
ggsave(file.path(imgs_dir, "04_ma3.png"), plot = p_ma3, width = 10, height = 6, units = "in", dpi = 300)
# simple percentage change around chatgpt launch
# Compare mean monthly answers in the 6 months before vs after launch.
pre_start <- chatgpt_launch_date - months(6)
post_end <- chatgpt_launch_date + months(6)
pre_window <- answers_monthly |>
  filter(month >= pre_start, month < chatgpt_launch_date)
post_window <- answers_monthly |>
  filter(month >= chatgpt_launch_date, month < post_end)
pre_mean <- mean(pre_window$answers_total)
post_mean <- mean(post_window$answers_total)
pct_change <- (post_mean - pre_mean) / pre_mean * 100
pct_change
# survey exploratory: relation between ai usage and so visit frequency
# proportions are within year x chatgpt-usage segment (each bar sums to 1)
survey_counts <- survey_model |>
  mutate(
    uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt"),
    freq_label = if_else(frequent_so == 1L, "visits so daily", "visits so less often")
  ) |>
  count(year, uses_chatgpt_label, freq_label) |>
  group_by(year, uses_chatgpt_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup()
# NOTE(review): scales:: is used here but never library()'d or listed in
# the install hint; it normally ships as a ggplot2 dependency -- confirm
# before running on a minimal install.
p_survey_bar <- ggplot(survey_counts, aes(x = uses_chatgpt_label, y = prop, fill = freq_label)) +
  geom_col(position = "fill") +
  facet_wrap(~year) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "relationship between chatgpt use and stack overflow visit frequency (survey)",
    x = "ai usage segment",
    y = "share of respondents",
    fill = "so visit frequency"
  )
print(p_survey_bar)
ggsave(file.path(imgs_dir, "08_survey_bar.png"), plot = p_survey_bar, width = 10, height = 6, units = "in", dpi = 300)
# SECTION 4: model development (four different model types)
# MODEL 1: interrupted time series linear regression
# outcome: monthly answers_total
# predictors: time trend, post_chatgpt level change, slope change after chatgpt
its_data <- answers_monthly |>
  mutate(
    time = time_index,
    # months since launch (1, 2, ...) for post-launch rows, 0 before;
    # its coefficient captures the post-launch change in slope
    chatgpt_time = if_else(month >= chatgpt_launch_date,
      time_index - min(time_index[month >= chatgpt_launch_date]) + 1L,
      0L
    )
  )
# post_chatgpt is logical; lm() treats it as a 0/1 level-shift dummy
model_lm <- lm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data
)
summary(model_lm)
tidy(model_lm)
glance(model_lm)
# predictions and plot
its_data <- its_data |>
  mutate(
    lm_fitted = predict(model_lm)
  )
p_lm_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.4) +
  geom_line(aes(y = lm_fitted), color = "blue") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "interrupted time series regression: observed vs fitted answers",
    x = "month",
    y = "answers"
  )
print(p_lm_fit)
ggsave(file.path(imgs_dir, "05_lm_fit.png"), plot = p_lm_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 2: poisson regression for count data
# same interrupted-time-series design as model 1, with a log link
# NOTE(review): monthly counts of this magnitude are typically
# overdispersed relative to poisson; a quasipoisson or negative binomial
# family may be worth checking before relying on these standard errors.
model_pois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = poisson(link = "log")
)
summary(model_pois)
tidy(model_pois, exponentiate = TRUE) # exp(coef) ~ multiplicative effect
# compare predicted counts
its_data <- its_data |>
  mutate(
    pois_fitted = predict(model_pois, type = "response")
  )
p_pois_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = pois_fitted), color = "red") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "poisson regression: observed vs predicted monthly answers",
    x = "month",
    y = "answers"
  )
print(p_pois_fit)
ggsave(file.path(imgs_dir, "06_pois_fit.png"), plot = p_pois_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 3: arima time series forecast (pre-chatgpt vs actual)
# construct monthly ts object (frequency = 12)
start_year <- year(min(answers_monthly$month))
start_month <- month(min(answers_monthly$month))
answers_ts <- ts(
  answers_monthly$answers_total,
  start = c(start_year, start_month),
  frequency = 12
)
# train on pre-chatgpt data (up to oct 2022) and forecast forward
# NOTE(review): the split is hard-coded; nov 2022 is mostly pre-launch
# (chatgpt shipped 30 nov) but is assigned to the holdout here.
train_end <- c(2022, 10) # october 2022
train_ts <- window(answers_ts, end = train_end)
test_ts <- window(answers_ts, start = c(2022, 11))
arima_fit <- auto.arima(train_ts)
summary(arima_fit)
h <- length(test_ts)
fc <- forecast(arima_fit, h = h)
# compare forecast vs actual on the holdout period
# NOTE(review): assumes the months >= 2022-11-01 in answers_monthly line
# up one-to-one with test_ts -- holds as long as the series has no gaps.
fc_df <- data.frame(
  month = answers_monthly$month[answers_monthly$month >= as.Date("2022-11-01")],
  actual = as.numeric(test_ts),
  forecast = as.numeric(fc$mean),
  lower_80 = as.numeric(fc$lower[, "80%"]),
  upper_80 = as.numeric(fc$upper[, "80%"])
)
p_arima <- ggplot(fc_df, aes(x = month)) +
  geom_line(aes(y = actual), alpha = 0.6) +
  geom_line(aes(y = forecast), linetype = "dashed") +
  geom_ribbon(aes(ymin = lower_80, ymax = upper_80), alpha = 0.2) +
  labs(
    title = "arima forecast (trained on pre-chatgpt) vs actual answers",
    x = "month",
    y = "answers"
  )
print(p_arima)
ggsave(file.path(imgs_dir, "07_arima_forecast.png"), plot = p_arima, width = 10, height = 6, units = "in", dpi = 300)
# simple accuracy metrics on the holdout
fc_accuracy <- accuracy(fc, test_ts)
print(fc_accuracy)
# MODEL 4: logistic regression does using chatgpt predict being a frequent stack overflow visitor?
set.seed(123)
# drop rows missing either the outcome or the key predictor
survey_model_complete <- survey_model |>
  filter(!is.na(uses_chatgpt), !is.na(frequent_so))
# keep only predictors with at least two observed levels, otherwise
# glm() fails with a contrasts error
candidate_predictors <- c("uses_chatgpt", "age_group", "gender", "year")
# vapply (not sapply) guarantees a logical vector even in edge cases
has_variation <- vapply(
  candidate_predictors,
  function(col) dplyr::n_distinct(survey_model_complete[[col]], na.rm = TRUE) > 1,
  logical(1)
)
valid_predictors <- candidate_predictors[has_variation]
drop_predictors <- setdiff(candidate_predictors, valid_predictors)
if (length(drop_predictors) > 0) {
  message("dropping predictors with <2 levels: ", paste(drop_predictors, collapse = ", "))
}
# fall back to an intercept-only model if nothing varies
logit_formula <- if (length(valid_predictors) == 0) {
  frequent_so ~ 1
} else {
  as.formula(paste("frequent_so ~", paste(valid_predictors, collapse = " + ")))
}
# 80/20 train/test split of complete cases
n <- nrow(survey_model_complete)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))
survey_train <- survey_model_complete[train_idx, ]
survey_test <- survey_model_complete[-train_idx, ]
# classification threshold = training base rate (better suited than 0.5
# for an imbalanced outcome); fall back to 0.5 when degenerate.
# plain if/else instead of dplyr::case_when: the input is a scalar, and
# case_when on scalars is slower and less direct
positive_rate <- mean(survey_train$frequent_so, na.rm = TRUE)
classification_threshold <- if (
  is.na(positive_rate) || positive_rate <= 0 || positive_rate >= 1
) {
  0.5
} else {
  positive_rate
}
message(
  "classification threshold (training frequent_so share): ",
  round(classification_threshold, 3)
)
# fit the logistic regression on the training split only
logit_model <- glm(
  formula = logit_formula,
  family = binomial(link = "logit"),
  data = survey_train
)
summary(logit_model)
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
# predict on test set
survey_test <- survey_test |>
  mutate(
    pred_prob = predict(logit_model, newdata = survey_test, type = "response"),
    pred_class = if_else(pred_prob >= classification_threshold, 1L, 0L)
  )
# confusion matrix and simple metrics
# explicit factor levels guarantee both rows/cols exist even when a class
# is absent from the test split
conf_mat <- table(
  truth = factor(survey_test$frequent_so, levels = c(0, 1)),
  pred = factor(survey_test$pred_class, levels = c(0, 1))
)
conf_mat
tp <- conf_mat["1", "1"]
tn <- conf_mat["0", "0"]
fp <- conf_mat["0", "1"]
fn <- conf_mat["1", "0"]
# named test_* so the metric variables do not mask forecast::accuracy(),
# which this script also uses; guards avoid 0/0 -> NaN
test_accuracy <- (tp + tn) / sum(conf_mat)
test_precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
test_recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
list(
  accuracy = test_accuracy,
  precision = test_precision,
  recall = test_recall
)
# visual: predicted probability vs ai usage
# boxplots of model-predicted probabilities, split by chatgpt usage
p_logit_probs <- survey_test |>
  mutate(uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt")) |>
  ggplot(aes(x = uses_chatgpt_label, y = pred_prob)) +
  geom_boxplot() +
  labs(
    title = "predicted probability of being a frequent so visitor by chatgpt use",
    x = "ai usage segment",
    y = "predicted probability (logistic model)"
  )
print(p_logit_probs)
ggsave(file.path(imgs_dir, "09_logit_probs.png"), plot = p_logit_probs, width = 8, height = 6, units = "in", dpi = 300)
# save key tables and model outputs to disk for report
# csv for tables referenced in the write-up, rds for the fitted models
write_csv(answers_monthly, file.path(data_dir, "answers_monthly_clean.csv"))
write_csv(answers_summary_period, file.path(data_dir, "answers_summary_period.csv"))
write_csv(survey_counts, file.path(data_dir, "survey_ai_vs_so_visit.csv"))
saveRDS(model_lm, file.path(data_dir, "model_lm_its.rds"))
saveRDS(model_pois, file.path(data_dir, "model_pois.rds"))
saveRDS(arima_fit, file.path(data_dir, "model_arima_prechatgpt.rds"))
saveRDS(logit_model, file.path(data_dir, "model_logit_survey.rds"))