# install.packages(
#   c("tidyverse", "lubridate", "broom", "forecast", "stringr", "dplyr", "zoo"),
#   repos = "http://cran.us.r-project.org"
# )
library(tidyverse)
library(lubridate)
library(broom)
library(forecast)
library(stringr)
library(dplyr)

# directory for data files (adjust if desired)
data_dir <- "data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

# directory for plots
imgs_dir <- "imgs"
if (!dir.exists(imgs_dir)) {
  dir.create(imgs_dir, recursive = TRUE)
}

# constants: key event dates related to chatgpt and so policy
# chatgpt public research preview launch
chatgpt_launch_date <- as.Date("2022-11-30") # openai "introducing chatgpt" blog
# stack overflow generative ai ban policy (meta so, 5 dec 2022)
so_ai_policy_date <- as.Date("2022-12-05")
# moderation strike on stack exchange (june–aug 2023) from meta posts
so_mod_strike_start <- as.Date("2023-06-05")
so_mod_strike_end <- as.Date("2023-08-07")

# helper: safe downloader (skips the download if the file already exists)
download_if_missing <- function(url, destfile) {
  if (!file.exists(destfile)) {
    message("downloading ", basename(destfile), " ...")
    download.file(url, destfile, mode = "wb")
    message("saved to ", destfile)
  } else {
    message("file already exists: ", destfile)
  }
}

# helper: coerce whatever the month column arrives as (Date, POSIXct, numeric,
# or character in several formats) to a plain Date
coerce_month_to_date <- function(x) {
  if (inherits(x, "Date")) {
    return(x)
  }
  if (inherits(x, "POSIXct")) {
    return(lubridate::as_date(x))
  }
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  if (is.numeric(x)) {
    return(as.Date(x, origin = "1970-01-01"))
  }
  if (is.character(x)) {
    parsed <- suppressWarnings(lubridate::ymd_hms(x))
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(lubridate::ymd(x))
    }
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(as.Date(x))
    }
    # as_date() so date-time strings also come back as plain Dates
    return(lubridate::as_date(parsed))
  }
  suppressWarnings(as.Date(x))
}

# 1) load stack overflow monthly answers (dataset 1)
answers_csv_path <- file.path(data_dir, "so_new_answers_per_month_2018_2025.csv")
if (!file.exists(answers_csv_path)) {
  stop(
    "missing ", answers_csv_path,
    "\nrun the sede query in this script and download the csv to that path first."
  )
}
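# optional quick check of the coercion helper on the kinds of strings a sede
# export tends to contain (these example values are made up, not from the data).
coerce_month_to_date("2022-11-01 00:00:00")
coerce_month_to_date("2022-11-01")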
answers_raw <- readr::read_csv(answers_csv_path, show_col_types = FALSE)

# inspect column names so you can adjust if sede changes them
# expected columns: "Month", "Status", "NewAnswers"
print(names(answers_raw))

# normalise to lower snake case and coerce types
answers_raw <- answers_raw |>
  rename(
    month = matches("^Date$|Month", ignore.case = TRUE),
    status = matches("^Status$", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  ) |>
  mutate(
    month = coerce_month_to_date(month),
    status = tolower(status)
  )

print(head(answers_raw))

# aggregate deleted vs non-deleted into separate columns per month
answers_monthly <- answers_raw |>
  group_by(month) |>
  summarise(
    answers_total = sum(new_answers, na.rm = TRUE),
    answers_non_deleted = sum(if_else(status == "non-deleted", new_answers, 0)),
    answers_deleted = sum(if_else(status == "deleted", new_answers, 0)),
    .groups = "drop"
  ) |>
  arrange(month) |>
  mutate(
    year = year(month),
    month_num = month(month),
    time_index = row_number(),
    post_chatgpt = month >= chatgpt_launch_date,
    post_ai_policy = month >= so_ai_policy_date,
    during_mod_strike = month >= so_mod_strike_start & month <= so_mod_strike_end,
    period = case_when(
      month < chatgpt_launch_date ~ "pre_chatgpt",
      TRUE ~ "post_chatgpt"
    )
  )

glimpse(answers_monthly)

# 2) download and load stack overflow developer survey 2023/2024 (dataset 2)
# official survey zip files as exposed on survey.stackoverflow.co
# these urls are the same ones behind the "download full data set (csv)" links
# see: https://survey.stackoverflow.co/
survey_2023_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2023.zip"
survey_2024_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2024.zip"
survey_2023_zip <- file.path(data_dir, "stack-overflow-developer-survey-2023.zip")
survey_2024_zip <- file.path(data_dir, "stack-overflow-developer-survey-2024.zip")

download_if_missing(survey_2023_url, survey_2023_zip)
download_if_missing(survey_2024_url, survey_2024_zip)

# helper to read the "survey_results_public.csv" inside each zip
read_so_survey_from_zip <- function(zip_path, csv_pattern = "survey_results_public.csv") {
  if (!file.exists(zip_path)) {
    stop("zip file not found: ", zip_path)
  }
  # list files inside zip (works even when the csv is in a subfolder)
  zlist <- utils::unzip(zip_path, list = TRUE)
  # try to find the csv by exact name or by pattern
  csv_name <- zlist$Name[stringr::str_detect(zlist$Name, regex(csv_pattern, ignore_case = TRUE))]
  if (length(csv_name) == 0) {
    stop("could not find a csv matching ", csv_pattern, " inside ", zip_path)
  }
  csv_name <- csv_name[1] # take first match
  # read it without extracting to disk using an unz() connection
  # optionally supply col_types to speed parsing
  df <- readr::read_csv(
    unz(zip_path, csv_name),
    show_col_types = FALSE
    # col_types = cols(.default = col_character()) # uncomment & customise for explicit types
  )
  df
}

survey2023_raw <- read_so_survey_from_zip(survey_2023_zip)
survey2024_raw <- read_so_survey_from_zip(survey_2024_zip)

# look at column names to locate ai + stackoverflow usage questions
names(survey2023_raw)[1:80]
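# optional: quickly list the ai-related columns in each survey year, since the
# exact question names shift between 2023 and 2024 (this simply greps for "AI").
print(grep("AI", names(survey2023_raw), value = TRUE))
print(grep("AI", names(survey2024_raw), value = TRUE))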
############################################################
# create a harmonised survey subset focusing on:
# - so visit frequency (column like SOVisitFreq)
# - ai tool usage (column like AISelect or SOAI)
############################################################

find_first_col <- function(df, pattern) {
  cols <- names(df)[stringr::str_detect(names(df), regex(pattern, ignore_case = TRUE))]
  if (length(cols) == 0) {
    return(NA_character_)
  }
  cols[1]
}

pull_col_or_default <- function(df, col_name, default = NA_character_) {
  if (is.na(col_name)) {
    return(rep(default, nrow(df)))
  }
  df[[col_name]]
}

pull_age_numeric <- function(df, col_name) {
  vec <- pull_col_or_default(df, col_name, default = NA_real_)
  if (is.numeric(vec)) {
    return(vec)
  }
  if (is.factor(vec)) {
    vec <- as.character(vec)
  }
  if (is.character(vec)) {
    vec <- stringr::str_trim(vec)
    vec[vec == ""] <- NA_character_
    vec[stringr::str_detect(vec, regex("prefer not to say", ignore_case = TRUE))] <- NA_character_
    # parse_number extracts the leading numeric value (e.g., 25 from "25-34 years old")
    return(suppressWarnings(readr::parse_number(vec)))
  }
  suppressWarnings(as.numeric(vec))
}

main_branch_col_2023 <- find_first_col(survey2023_raw, "^MainBranch$|MainBranch")
country_col_2023 <- find_first_col(survey2023_raw, "^Country$|Country")
age_col_2023 <- find_first_col(survey2023_raw, "^Age$|Age")
gender_col_2023 <- find_first_col(survey2023_raw, "^Gender$|Gender")
so_visit_col_2023 <- find_first_col(survey2023_raw, "SOVisitFreq")
ai_select_col_2023 <- find_first_col(survey2023_raw, "AISelect|SOAI")

main_branch_col_2024 <- find_first_col(survey2024_raw, "^MainBranch$|MainBranch")
country_col_2024 <- find_first_col(survey2024_raw, "^Country$|Country")
age_col_2024 <- find_first_col(survey2024_raw, "^Age$|Age")
gender_col_2024 <- find_first_col(survey2024_raw, "^Gender$|Gender")
so_visit_col_2024 <- find_first_col(survey2024_raw, "SOVisitFreq")
ai_select_col_2024 <- find_first_col(survey2024_raw, "AISelect|SOAI")

message("2023 so visit col: ", so_visit_col_2023)
message("2023 ai col      : ", ai_select_col_2023)
message("2024 so visit col: ", so_visit_col_2024)
message("2024 ai col      : ", ai_select_col_2024)

# build a clean survey frame for 2023
survey2023 <- survey2023_raw |>
  transmute(
    year = 2023L,
    main_branch = pull_col_or_default(survey2023_raw, main_branch_col_2023),
    country = pull_col_or_default(survey2023_raw, country_col_2023),
    age = pull_age_numeric(survey2023_raw, age_col_2023),
    gender = pull_col_or_default(survey2023_raw, gender_col_2023),
    so_visit = pull_col_or_default(survey2023_raw, so_visit_col_2023),
    ai_select = pull_col_or_default(survey2023_raw, ai_select_col_2023)
  )

# same idea for 2024 (schema is very similar)
survey2024 <- survey2024_raw |>
  transmute(
    year = 2024L,
    main_branch = pull_col_or_default(survey2024_raw, main_branch_col_2024),
    country = pull_col_or_default(survey2024_raw, country_col_2024),
    age = pull_age_numeric(survey2024_raw, age_col_2024),
    gender = pull_col_or_default(survey2024_raw, gender_col_2024),
    so_visit = pull_col_or_default(survey2024_raw, so_visit_col_2024),
    ai_select = pull_col_or_default(survey2024_raw, ai_select_col_2024)
  )

survey_all <- bind_rows(survey2023, survey2024)
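# optional sanity check: look at the raw visit-frequency wording per year before
# collapsing it into a binary flag (category labels can differ between surveys).
survey_all |>
  count(year, so_visit, sort = TRUE) |>
  print(n = 30)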
# engineer features:
# - binary flag: frequent so visitor
# - binary flag: uses chatgpt as ai tool (from ai_select free text / semicolon list)
# - coarser age groups
survey_model <- survey_all |>
  filter(!is.na(so_visit)) |>
  mutate(
    so_visit = as.character(so_visit),
    ai_select = as.character(ai_select),
    # frequent so visitor: daily or multiple times per day etc.
    frequent_so = dplyr::case_when(
      stringr::str_detect(so_visit, regex("multiple times per day", ignore_case = TRUE)) ~ 1L,
      stringr::str_detect(so_visit, regex("daily|almost every day", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    uses_chatgpt = dplyr::case_when(
      is.na(ai_select) ~ 0L,
      stringr::str_detect(ai_select, regex("chatgpt", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    age_group = dplyr::case_when(
      !is.na(age) & age < 25 ~ "<25",
      !is.na(age) & age >= 25 & age < 35 ~ "25-34",
      !is.na(age) & age >= 35 & age < 45 ~ "35-44",
      !is.na(age) & age >= 45 ~ "45+",
      TRUE ~ "unknown"
    ),
    gender = if_else(is.na(gender) | gender == "", "Unknown", gender)
  ) |>
  filter(!is.na(frequent_so)) |>
  mutate(
    frequent_so = as.integer(frequent_so),
    uses_chatgpt = as.integer(uses_chatgpt),
    age_group = factor(age_group),
    gender = factor(gender),
    year = factor(year)
  )

glimpse(survey_model)

# SECTION 2: data description + preliminary plots (dataset 1)

# basic time series plot of answers over time (for section 2)
p_answers_ts <- ggplot(answers_monthly, aes(x = month, y = answers_total)) +
  geom_line() +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly new answers on stack overflow",
    x = "month",
    y = "number of answers"
  )
print(p_answers_ts)
ggsave(
  filename = file.path(imgs_dir, "01_answers_ts.png"),
  plot = p_answers_ts,
  width = 10, height = 6, units = "in", dpi = 300
)

# boxplot pre vs post chatgpt
p_box_pre_post <- ggplot(answers_monthly, aes(x = period, y = answers_total)) +
  geom_boxplot() +
  labs(
    title = "distribution of monthly answers: pre vs post chatgpt launch",
    x = "period",
    y = "monthly answers"
  )
print(p_box_pre_post)
ggsave(file.path(imgs_dir, "02_box_pre_post.png"),
  plot = p_box_pre_post,
  width = 8, height = 6, units = "in", dpi = 300
)

# basic summary table
answers_summary_period <- answers_monthly |>
  group_by(period) |>
  summarise(
    n_months = n(),
    mean_answers = mean(answers_total),
    median_answers = median(answers_total),
    sd_answers = sd(answers_total),
    min_answers = min(answers_total),
    max_answers = max(answers_total),
    .groups = "drop"
  )
print(answers_summary_period)

# SECTION 3: exploratory analysis

# seasonal pattern: answers by calendar month across years
p_seasonal <- answers_monthly |>
  mutate(month_label = factor(month_num, levels = 1:12, labels = month.abb)) |>
  ggplot(aes(x = month_label, y = answers_total, group = year, color = period)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "seasonality of answers by calendar month and year",
    x = "calendar month",
    y = "monthly answers"
  )
print(p_seasonal)
ggsave(file.path(imgs_dir, "03_seasonal.png"),
  plot = p_seasonal,
  width = 10, height = 6, units = "in", dpi = 300
)

# rolling 3-month moving average to smooth noise (uses zoo::rollmean)
answers_monthly <- answers_monthly |>
  arrange(month) |>
  mutate(
    answers_ma3 = zoo::rollmean(answers_total, k = 3, fill = NA, align = "right")
  )

p_ma3 <- ggplot(answers_monthly, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = answers_ma3)) +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "monthly answers with 3-month moving average",
    x = "month",
    y = "answers"
  )
print(p_ma3)
ggsave(file.path(imgs_dir, "04_ma3.png"),
  plot = p_ma3,
  width = 10, height = 6, units = "in", dpi = 300
)
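# optional extra view: the same series split by deletion status (both columns are
# built into answers_monthly above). a rise in deleted answers after the ai policy
# date would hint that moderation removals, not only posting behaviour, move the
# totals. illustrative only; not used by the models below.
p_deleted_split <- answers_monthly |>
  pivot_longer(c(answers_non_deleted, answers_deleted),
    names_to = "status", values_to = "n_answers"
  ) |>
  ggplot(aes(x = month, y = n_answers, color = status)) +
  geom_line() +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly answers by deletion status",
    x = "month", y = "answers", color = "status"
  )
print(p_deleted_split)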
# simple percentage change around chatgpt launch
pre_window <- answers_monthly |>
  filter(
    month >= chatgpt_launch_date - months(6),
    month < chatgpt_launch_date
  )
post_window <- answers_monthly |>
  filter(
    month >= chatgpt_launch_date,
    month < chatgpt_launch_date + months(6)
  )

pre_mean <- mean(pre_window$answers_total)
post_mean <- mean(post_window$answers_total)
pct_change <- (post_mean - pre_mean) / pre_mean * 100
pct_change

# survey exploratory: relation between ai usage and so visit frequency
survey_counts <- survey_model |>
  mutate(
    uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt"),
    freq_label = if_else(frequent_so == 1L, "visits so daily", "visits so less often")
  ) |>
  count(year, uses_chatgpt_label, freq_label) |>
  group_by(year, uses_chatgpt_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

p_survey_bar <- ggplot(survey_counts, aes(x = uses_chatgpt_label, y = prop, fill = freq_label)) +
  geom_col(position = "fill") +
  facet_wrap(~year) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "relationship between chatgpt use and stack overflow visit frequency (survey)",
    x = "ai usage segment",
    y = "share of respondents",
    fill = "so visit frequency"
  )
print(p_survey_bar)
ggsave(file.path(imgs_dir, "08_survey_bar.png"),
  plot = p_survey_bar,
  width = 10, height = 6, units = "in", dpi = 300
)

# SECTION 4: model development (four different model types)

# MODEL 1: interrupted time series linear regression
# outcome: monthly answers_total
# predictors: time trend, post_chatgpt level change, slope change after chatgpt
its_data <- answers_monthly |>
  mutate(
    time = time_index,
    chatgpt_time = if_else(
      month >= chatgpt_launch_date,
      time_index - min(time_index[month >= chatgpt_launch_date]) + 1L,
      0L
    )
  )

model_lm <- lm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data
)
summary(model_lm)
tidy(model_lm)
glance(model_lm)

# predictions and plot
its_data <- its_data |>
  mutate(
    lm_fitted = predict(model_lm)
  )

p_lm_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.4) +
  geom_line(aes(y = lm_fitted), color = "blue") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "interrupted time series regression: observed vs fitted answers",
    x = "month",
    y = "answers"
  )
print(p_lm_fit)
ggsave(file.path(imgs_dir, "05_lm_fit.png"),
  plot = p_lm_fit,
  width = 10, height = 6, units = "in", dpi = 300
)

# MODEL 2: poisson regression for count data
model_pois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = poisson(link = "log")
)
summary(model_pois)
tidy(model_pois, exponentiate = TRUE) # exp(coef) ~ multiplicative effect

# compare predicted counts
its_data <- its_data |>
  mutate(
    pois_fitted = predict(model_pois, type = "response")
  )

p_pois_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = pois_fitted), color = "red") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "poisson regression: observed vs predicted monthly answers",
    x = "month",
    y = "answers"
  )
print(p_pois_fit)
ggsave(file.path(imgs_dir, "06_pois_fit.png"),
  plot = p_pois_fit,
  width = 10, height = 6, units = "in", dpi = 300
)
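# optional diagnostics for the two regression models above; these are suggestions,
# not part of the reported results.
# (a) ITS residual autocorrelation: monthly series usually violate the independence
#     assumption. a small ljung-box p-value flags this, in which case robust
#     standard errors (e.g., sandwich::NeweyWest) or the arima model below are safer.
Box.test(residuals(model_lm), lag = 12, type = "Ljung-Box")
# (b) poisson overdispersion: a deviance / df ratio well above 1 suggests refitting
#     with family = quasipoisson() or a negative binomial model (MASS::glm.nb).
deviance(model_pois) / df.residual(model_pois)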
# MODEL 3: arima time series forecast (pre-chatgpt vs actual)
# construct monthly ts object (frequency = 12)
start_year <- year(min(answers_monthly$month))
start_month <- month(min(answers_monthly$month))
answers_ts <- ts(
  answers_monthly$answers_total,
  start = c(start_year, start_month),
  frequency = 12
)

# train on pre-chatgpt data (up to oct 2022) and forecast forward
train_end <- c(2022, 10) # october 2022
train_ts <- window(answers_ts, end = train_end)
test_ts <- window(answers_ts, start = c(2022, 11))

arima_fit <- auto.arima(train_ts)
summary(arima_fit)

h <- length(test_ts)
fc <- forecast(arima_fit, h = h)

# compare forecast vs actual on the holdout period
fc_df <- data.frame(
  month = answers_monthly$month[answers_monthly$month >= as.Date("2022-11-01")],
  actual = as.numeric(test_ts),
  forecast = as.numeric(fc$mean),
  lower_80 = as.numeric(fc$lower[, "80%"]),
  upper_80 = as.numeric(fc$upper[, "80%"])
)

p_arima <- ggplot(fc_df, aes(x = month)) +
  geom_line(aes(y = actual), alpha = 0.6) +
  geom_line(aes(y = forecast), linetype = "dashed") +
  geom_ribbon(aes(ymin = lower_80, ymax = upper_80), alpha = 0.2) +
  labs(
    title = "arima forecast (trained on pre-chatgpt) vs actual answers",
    x = "month",
    y = "answers"
  )
print(p_arima)
ggsave(file.path(imgs_dir, "07_arima_forecast.png"),
  plot = p_arima,
  width = 10, height = 6, units = "in", dpi = 300
)

# simple accuracy metrics on the holdout
fc_accuracy <- accuracy(fc, test_ts)
print(fc_accuracy)

# MODEL 4: logistic regression – does using chatgpt predict being a frequent stack overflow visitor?
set.seed(123)

survey_model_complete <- survey_model |>
  filter(!is.na(uses_chatgpt), !is.na(frequent_so))

candidate_predictors <- c("uses_chatgpt", "age_group", "gender", "year")
valid_predictors <- candidate_predictors[sapply(
  candidate_predictors,
  function(col) dplyr::n_distinct(survey_model_complete[[col]], na.rm = TRUE) > 1
)]
drop_predictors <- setdiff(candidate_predictors, valid_predictors)
if (length(drop_predictors) > 0) {
  message("dropping predictors with <2 levels: ", paste(drop_predictors, collapse = ", "))
}

logit_formula <- if (length(valid_predictors) == 0) {
  frequent_so ~ 1
} else {
  as.formula(paste("frequent_so ~", paste(valid_predictors, collapse = " + ")))
}

n <- nrow(survey_model_complete)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))
survey_train <- survey_model_complete[train_idx, ]
survey_test <- survey_model_complete[-train_idx, ]

# classification threshold: the training-set share of frequent visitors,
# falling back to 0.5 if that share is missing or degenerate
positive_rate <- mean(survey_train$frequent_so, na.rm = TRUE)
classification_threshold <- dplyr::case_when(
  is.na(positive_rate) ~ 0.5,
  positive_rate <= 0 ~ 0.5,
  positive_rate >= 1 ~ 0.5,
  TRUE ~ positive_rate
)
message(
  "classification threshold (training frequent_so share): ",
  round(classification_threshold, 3)
)

logit_model <- glm(
  formula = logit_formula,
  family = binomial(link = "logit"),
  data = survey_train
)
summary(logit_model)
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)

# predict on test set
survey_test <- survey_test |>
  mutate(
    pred_prob = predict(logit_model, newdata = survey_test, type = "response"),
    pred_class = if_else(pred_prob >= classification_threshold, 1L, 0L)
  )

# confusion matrix and simple metrics
conf_mat <- table(
  truth = factor(survey_test$frequent_so, levels = c(0, 1)),
  pred = factor(survey_test$pred_class, levels = c(0, 1))
)
conf_mat

tp <- conf_mat["1", "1"]
tn <- conf_mat["0", "0"]
fp <- conf_mat["0", "1"]
fn <- conf_mat["1", "0"]
accuracy <- (tp + tn) / sum(conf_mat)
precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
list(
  accuracy = accuracy,
  precision = precision,
  recall = recall
)
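# optional: threshold-free evaluation of the logistic model via AUC, if the pROC
# package happens to be installed (it is not among the packages loaded above).
if (requireNamespace("pROC", quietly = TRUE)) {
  roc_obj <- pROC::roc(survey_test$frequent_so, survey_test$pred_prob)
  message("test-set AUC: ", round(as.numeric(pROC::auc(roc_obj)), 3))
}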
# visual: predicted probability vs ai usage
p_logit_probs <- survey_test |>
  mutate(uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt")) |>
  ggplot(aes(x = uses_chatgpt_label, y = pred_prob)) +
  geom_boxplot() +
  labs(
    title = "predicted probability of being a frequent so visitor by chatgpt use",
    x = "ai usage segment",
    y = "predicted probability (logistic model)"
  )
print(p_logit_probs)
ggsave(file.path(imgs_dir, "09_logit_probs.png"),
  plot = p_logit_probs,
  width = 8, height = 6, units = "in", dpi = 300
)

# save key tables and model outputs to disk for report
write_csv(answers_monthly, file.path(data_dir, "answers_monthly_clean.csv"))
write_csv(answers_summary_period, file.path(data_dir, "answers_summary_period.csv"))
write_csv(survey_counts, file.path(data_dir, "survey_ai_vs_so_visit.csv"))

saveRDS(model_lm, file.path(data_dir, "model_lm_its.rds"))
saveRDS(model_pois, file.path(data_dir, "model_pois.rds"))
saveRDS(arima_fit, file.path(data_dir, "model_arima_prechatgpt.rds"))
saveRDS(logit_model, file.path(data_dir, "model_logit_survey.rds"))
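# optional: record the package versions used, so the report's environment can be
# reproduced later; writes a plain-text file alongside the other outputs.
writeLines(capture.output(sessionInfo()), file.path(data_dir, "session_info.txt"))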