# install.packages(
#   c("tidyverse", "lubridate", "broom", "forecast", "stringr", "dplyr",
#     "zoo", "scales"), # zoo and scales are used below via zoo:: / scales::
#   repos = "http://cran.us.r-project.org"
# )

library(tidyverse)
library(lubridate)
library(broom)
library(forecast)
library(stringr) # attached explicitly for clarity; also part of the tidyverse
library(dplyr)   # attached explicitly for clarity; also part of the tidyverse

# directory for data files (adjust if desired)
data_dir <- "data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

# directory for plots
imgs_dir <- "imgs"
if (!dir.exists(imgs_dir)) {
  dir.create(imgs_dir, recursive = TRUE)
}

# constants: key event dates related to chatgpt and so policy

# chatgpt public research preview launch
chatgpt_launch_date <- as.Date("2022-11-30") # openai "introducing chatgpt" blog

# stack overflow generative ai ban policy (meta so, 5 dec 2022)
so_ai_policy_date <- as.Date("2022-12-05")

# moderation strike on stack exchange (june–aug 2023) from meta posts
so_mod_strike_start <- as.Date("2023-06-05")
so_mod_strike_end <- as.Date("2023-08-07")

# helper: safe downloader
download_if_missing <- function(url, destfile) {
  if (!file.exists(destfile)) {
    message("downloading ", basename(destfile), " ...")
    download.file(url, destfile, mode = "wb")
    message("saved to ", destfile)
  } else {
    message("file already exists: ", destfile)
  }
}
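
# downloads occasionally fail on flaky networks; a minimal wrapper sketch
# around the helper above that reports the error instead of aborting the
# whole script (safe_download is a name introduced here for illustration)
safe_download <- function(url, destfile) {
  tryCatch(
    download_if_missing(url, destfile),
    error = function(e) {
      message("download failed for ", url, ": ", conditionMessage(e))
    }
  )
}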

coerce_month_to_date <- function(x) {
  if (inherits(x, "Date")) {
    return(x)
  }
  if (inherits(x, "POSIXct")) {
    return(lubridate::as_date(x))
  }
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  if (is.numeric(x)) {
    return(as.Date(x, origin = "1970-01-01"))
  }
  if (is.character(x)) {
    parsed <- suppressWarnings(lubridate::ymd_hms(x))
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(lubridate::ymd(x))
    }
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(as.Date(x))
    }
    # ymd_hms() returns POSIXct, so normalise every branch to Date
    return(lubridate::as_date(parsed))
  }
  # fallback for any other input type
  suppressWarnings(as.Date(x))
}
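
# quick illustrative check of the coercion helper on the formats sede
# exports tend to use (iso dates and iso datetimes)
coerce_month_to_date(c("2022-11-01", "2022-12-01"))
coerce_month_to_date(c("2022-11-01 00:00:00", "2022-12-01 00:00:00"))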

# 1) load stack overflow monthly answers (dataset 1)

answers_csv_path <- file.path(data_dir, "so_new_answers_per_month_2018_2025.csv")

if (!file.exists(answers_csv_path)) {
  stop(
    "missing ", answers_csv_path,
    "\nrun the sede query in this script and download the csv to that path first."
  )
}
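
# for reference, a minimal sede (data.stackexchange.com) query sketch that
# would yield a Month/Status/NewAnswers shape; the actual query behind the
# csv may differ (sede's PostsWithDeleted table also covers deleted posts):
# SELECT DATEADD(month, DATEDIFF(month, 0, CreationDate), 0) AS Month,
#        CASE WHEN DeletionDate IS NULL THEN 'non-deleted' ELSE 'deleted' END AS Status,
#        COUNT(*) AS NewAnswers
# FROM PostsWithDeleted
# WHERE PostTypeId = 2 AND CreationDate >= '2018-01-01'
# GROUP BY DATEADD(month, DATEDIFF(month, 0, CreationDate), 0),
#          CASE WHEN DeletionDate IS NULL THEN 'non-deleted' ELSE 'deleted' END
# ORDER BY Month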

answers_raw <- readr::read_csv(answers_csv_path, show_col_types = FALSE)

# inspect column names so you can adjust if sede changes them
print(names(answers_raw))

# expected columns: "Month", "Status", "NewAnswers"
# normalise to lower snake case in a single pass; rename() accepts
# tidyselect helpers such as matches(), which tolerates header variations
answers_raw <- answers_raw |>
  rename(
    month = matches("^Date$|Month", ignore.case = TRUE),
    status = matches("^Status$", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  ) |>
  mutate(
    month = coerce_month_to_date(month),
    status = tolower(status)
  )

print(head(answers_raw))

# aggregate deleted vs non-deleted into separate columns per month
answers_monthly <- answers_raw |>
  group_by(month) |>
  summarise(
    answers_total = sum(new_answers, na.rm = TRUE),
    # subset sums avoid if_else() type clashes between integer and double
    answers_non_deleted = sum(new_answers[status == "non-deleted"], na.rm = TRUE),
    answers_deleted = sum(new_answers[status == "deleted"], na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(month) |>
  mutate(
    year = year(month),
    month_num = month(month),
    time_index = row_number(),
    post_chatgpt = month >= chatgpt_launch_date,
    post_ai_policy = month >= so_ai_policy_date,
    during_mod_strike = month >= so_mod_strike_start & month <= so_mod_strike_end,
    # explicit levels keep "pre" before "post" in plots and tables
    period = factor(
      if_else(month < chatgpt_launch_date, "pre_chatgpt", "post_chatgpt"),
      levels = c("pre_chatgpt", "post_chatgpt")
    )
  )

glimpse(answers_monthly)

# 2) download and load stack overflow developer survey 2023/2024 (dataset 2)

# official survey zip files as exposed on survey.stackoverflow.co
# these urls are the same ones behind the "download full data set (csv)" links
# see: https://survey.stackoverflow.co/
survey_2023_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2023.zip"
survey_2024_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2024.zip"

survey_2023_zip <- file.path(data_dir, "stack-overflow-developer-survey-2023.zip")
survey_2024_zip <- file.path(data_dir, "stack-overflow-developer-survey-2024.zip")

download_if_missing(survey_2023_url, survey_2023_zip)
download_if_missing(survey_2024_url, survey_2024_zip)

# helper to read the "survey_results_public.csv" inside each zip
read_so_survey_from_zip <- function(zip_path, csv_pattern = "survey_results_public.csv") {
  if (!file.exists(zip_path)) {
    stop("zip file not found: ", zip_path)
  }

  # list files inside zip (works even when the CSV is in a subfolder)
  zlist <- utils::unzip(zip_path, list = TRUE)
  # try to find the csv by exact name or by pattern
  csv_name <- zlist$Name[stringr::str_detect(zlist$Name, regex(csv_pattern, ignore_case = TRUE))]
  if (length(csv_name) == 0) {
    stop("could not find a csv matching ", csv_pattern, " inside ", zip_path)
  }
  csv_name <- csv_name[1] # take first match

  # read it without extracting to disk using an unz() connection;
  # optionally supply col_types to speed parsing, e.g.
  # col_types = cols(.default = col_character())
  df <- readr::read_csv(
    unz(zip_path, csv_name),
    show_col_types = FALSE
  )
  df
}

survey2023_raw <- read_so_survey_from_zip(survey_2023_zip)
survey2024_raw <- read_so_survey_from_zip(survey_2024_zip)

# look at column names to locate ai + stackoverflow usage questions
names(survey2023_raw)[1:80]

############################################################
# create a harmonised survey subset focusing on:
# - so visit frequency (column like SOVisitFreq)
# - ai tool usage (column like AISelect or SOAI)
############################################################

find_first_col <- function(df, pattern) {
  cols <- names(df)[stringr::str_detect(names(df), regex(pattern, ignore_case = TRUE))]
  if (length(cols) == 0) {
    return(NA_character_)
  }
  cols[1]
}

pull_col_or_default <- function(df, col_name, default = NA_character_) {
  if (is.na(col_name)) {
    return(rep(default, nrow(df)))
  }
  df[[col_name]]
}

pull_age_numeric <- function(df, col_name) {
  vec <- pull_col_or_default(df, col_name, default = NA_real_)
  if (is.numeric(vec)) {
    return(vec)
  }
  if (is.factor(vec)) {
    vec <- as.character(vec)
  }
  if (is.character(vec)) {
    vec <- stringr::str_trim(vec)
    vec[vec == ""] <- NA_character_
    vec[stringr::str_detect(vec, regex("prefer not to say", ignore_case = TRUE))] <- NA_character_
    # parse_number extracts the leading numeric value (e.g. 25 from "25-34 years old")
    return(suppressWarnings(readr::parse_number(vec)))
  }
  suppressWarnings(as.numeric(vec))
}

# locate the relevant columns by pattern; find_first_col() returns the first
# match in column order, so alternations like "^Age$|Age" add no priority --
# anchored patterns are used where a substring match could hit other columns
# (e.g. "Age" also matches the Language* columns)
main_branch_col_2023 <- find_first_col(survey2023_raw, "MainBranch")
country_col_2023 <- find_first_col(survey2023_raw, "Country")
age_col_2023 <- find_first_col(survey2023_raw, "^Age$")
gender_col_2023 <- find_first_col(survey2023_raw, "Gender")
so_visit_col_2023 <- find_first_col(survey2023_raw, "SOVisitFreq")
ai_select_col_2023 <- find_first_col(survey2023_raw, "AISelect|SOAI")

main_branch_col_2024 <- find_first_col(survey2024_raw, "MainBranch")
country_col_2024 <- find_first_col(survey2024_raw, "Country")
age_col_2024 <- find_first_col(survey2024_raw, "^Age$")
gender_col_2024 <- find_first_col(survey2024_raw, "Gender")
so_visit_col_2024 <- find_first_col(survey2024_raw, "SOVisitFreq")
ai_select_col_2024 <- find_first_col(survey2024_raw, "AISelect|SOAI")

message("2023 so visit col: ", so_visit_col_2023)
message("2023 ai col : ", ai_select_col_2023)
message("2024 so visit col: ", so_visit_col_2024)
message("2024 ai col : ", ai_select_col_2024)

# build a clean survey frame for 2023
survey2023 <- survey2023_raw |>
  transmute(
    year = 2023L,
    main_branch = pull_col_or_default(survey2023_raw, main_branch_col_2023),
    country = pull_col_or_default(survey2023_raw, country_col_2023),
    age = pull_age_numeric(survey2023_raw, age_col_2023),
    gender = pull_col_or_default(survey2023_raw, gender_col_2023),
    so_visit = pull_col_or_default(survey2023_raw, so_visit_col_2023),
    ai_select = pull_col_or_default(survey2023_raw, ai_select_col_2023)
  )

# same idea for 2024 (the schema is very similar)
survey2024 <- survey2024_raw |>
  transmute(
    year = 2024L,
    main_branch = pull_col_or_default(survey2024_raw, main_branch_col_2024),
    country = pull_col_or_default(survey2024_raw, country_col_2024),
    age = pull_age_numeric(survey2024_raw, age_col_2024),
    gender = pull_col_or_default(survey2024_raw, gender_col_2024),
    so_visit = pull_col_or_default(survey2024_raw, so_visit_col_2024),
    ai_select = pull_col_or_default(survey2024_raw, ai_select_col_2024)
  )

survey_all <- bind_rows(survey2023, survey2024)

# engineer features:
# - binary flag: frequent so visitor
# - binary flag: uses chatgpt as ai tool (from ai_select free text / semicolon list)
# - coarser age groups
survey_model <- survey_all |>
  filter(!is.na(so_visit)) |>
  mutate(
    so_visit = as.character(so_visit),
    ai_select = as.character(ai_select),

    # frequent so visitor: daily or multiple times per day etc.
    frequent_so = dplyr::case_when(
      stringr::str_detect(so_visit, regex("multiple times per day", ignore_case = TRUE)) ~ 1L,
      stringr::str_detect(so_visit, regex("daily|almost every day", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    uses_chatgpt = dplyr::case_when(
      is.na(ai_select) ~ 0L,
      stringr::str_detect(ai_select, regex("chatgpt", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    age_group = dplyr::case_when(
      !is.na(age) & age < 25 ~ "<25",
      !is.na(age) & age >= 25 & age < 35 ~ "25-34",
      !is.na(age) & age >= 35 & age < 45 ~ "35-44",
      !is.na(age) & age >= 45 ~ "45+",
      TRUE ~ "unknown"
    ),
    gender = if_else(is.na(gender) | gender == "", "Unknown", gender)
  ) |>
  filter(!is.na(frequent_so)) |>
  mutate(
    frequent_so = as.integer(frequent_so),
    uses_chatgpt = as.integer(uses_chatgpt),
    age_group = factor(age_group),
    gender = factor(gender),
    year = factor(year)
  )

glimpse(survey_model)
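
# quick illustrative sanity check of the engineered flags; the counts
# themselves depend on the downloaded survey data
survey_model |>
  count(year, uses_chatgpt, frequent_so)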

# SECTION 2: data description + preliminary plots (dataset 1)
# basic time series plot of answers over time (for section 2)

p_answers_ts <- ggplot(answers_monthly, aes(x = month, y = answers_total)) +
  geom_line() +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly new answers on stack overflow",
    x = "month",
    y = "number of answers"
  )

print(p_answers_ts)
ggsave(
  filename = file.path(imgs_dir, "01_answers_ts.png"),
  plot = p_answers_ts,
  width = 10, height = 6, units = "in", dpi = 300
)

# boxplot pre vs post chatgpt
p_box_pre_post <- ggplot(answers_monthly, aes(x = period, y = answers_total)) +
  geom_boxplot() +
  labs(
    title = "distribution of monthly answers: pre vs post chatgpt launch",
    x = "period",
    y = "monthly answers"
  )

print(p_box_pre_post)
ggsave(file.path(imgs_dir, "02_box_pre_post.png"), plot = p_box_pre_post, width = 8, height = 6, units = "in", dpi = 300)

# basic summary table
answers_summary_period <- answers_monthly |>
  group_by(period) |>
  summarise(
    n_months = n(),
    mean_answers = mean(answers_total),
    median_answers = median(answers_total),
    sd_answers = sd(answers_total),
    min_answers = min(answers_total),
    max_answers = max(answers_total),
    .groups = "drop"
  )

print(answers_summary_period)

# SECTION 3: exploratory analysis
# seasonal pattern: answers by calendar month across years

p_seasonal <- answers_monthly |>
  # explicit levels keep the labels correct even if a month is absent
  mutate(month_label = factor(month_num, levels = 1:12, labels = month.abb)) |>
  ggplot(aes(x = month_label, y = answers_total, group = year, color = period)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "seasonality of answers by calendar month and year",
    x = "calendar month",
    y = "monthly answers"
  )

print(p_seasonal)
ggsave(file.path(imgs_dir, "03_seasonal.png"), plot = p_seasonal, width = 10, height = 6, units = "in", dpi = 300)

# rolling 3-month moving average to smooth noise
answers_monthly <- answers_monthly |>
  arrange(month) |>
  mutate(
    answers_ma3 = zoo::rollmean(answers_total, k = 3, fill = NA, align = "right")
  )

p_ma3 <- ggplot(answers_monthly, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = answers_ma3)) +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "monthly answers with 3-month moving average",
    x = "month",
    y = "answers"
  )

print(p_ma3)
ggsave(file.path(imgs_dir, "04_ma3.png"), plot = p_ma3, width = 10, height = 6, units = "in", dpi = 300)

# simple percentage change around chatgpt launch
pre_window <- answers_monthly |>
  filter(
    month >= chatgpt_launch_date - months(6),
    month < chatgpt_launch_date
  )

post_window <- answers_monthly |>
  filter(
    month >= chatgpt_launch_date,
    month < chatgpt_launch_date + months(6)
  )

pre_mean <- mean(pre_window$answers_total)
post_mean <- mean(post_window$answers_total)
pct_change <- (post_mean - pre_mean) / pre_mean * 100

pct_change
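
# the naive window comparison above ignores trend and seasonality; comparing
# the post window against the same calendar months one year earlier is a
# slightly fairer illustrative check (yoy_window is a name introduced here)
yoy_window <- answers_monthly |>
  filter(
    month >= chatgpt_launch_date - years(1),
    month < chatgpt_launch_date + months(6) - years(1)
  )
(post_mean - mean(yoy_window$answers_total)) / mean(yoy_window$answers_total) * 100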

# survey exploratory: relation between ai usage and so visit frequency
survey_counts <- survey_model |>
  mutate(
    uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt"),
    freq_label = if_else(frequent_so == 1L, "visits so daily", "visits so less often")
  ) |>
  count(year, uses_chatgpt_label, freq_label) |>
  group_by(year, uses_chatgpt_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup()

p_survey_bar <- ggplot(survey_counts, aes(x = uses_chatgpt_label, y = prop, fill = freq_label)) +
  geom_col(position = "fill") +
  facet_wrap(~year) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "relationship between chatgpt use and stack overflow visit frequency (survey)",
    x = "ai usage segment",
    y = "share of respondents",
    fill = "so visit frequency"
  )

print(p_survey_bar)
ggsave(file.path(imgs_dir, "08_survey_bar.png"), plot = p_survey_bar, width = 10, height = 6, units = "in", dpi = 300)

# SECTION 4: model development (four different model types)

# MODEL 1: interrupted time series linear regression
# outcome: monthly answers_total
# predictors: time trend, post_chatgpt level change, slope change after chatgpt
its_data <- answers_monthly |>
  mutate(
    time = time_index,
    chatgpt_time = if_else(month >= chatgpt_launch_date,
      time_index - min(time_index[month >= chatgpt_launch_date]) + 1L,
      0L
    )
  )

model_lm <- lm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data
)

summary(model_lm)
tidy(model_lm)
glance(model_lm)
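
# illustrative counterfactual: what the pre-chatgpt trend alone would predict,
# obtained by switching off the intervention terms (a sketch, not a causal
# estimate; its_counterfactual / lm_counterfactual are names introduced here)
its_counterfactual <- its_data |>
  mutate(post_chatgpt = FALSE, chatgpt_time = 0L)
its_data <- its_data |>
  mutate(lm_counterfactual = predict(model_lm, newdata = its_counterfactual))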

# predictions and plot
its_data <- its_data |>
  mutate(
    lm_fitted = predict(model_lm)
  )

p_lm_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.4) +
  geom_line(aes(y = lm_fitted), color = "blue") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "interrupted time series regression: observed vs fitted answers",
    x = "month",
    y = "answers"
  )

print(p_lm_fit)
ggsave(file.path(imgs_dir, "05_lm_fit.png"), plot = p_lm_fit, width = 10, height = 6, units = "in", dpi = 300)

# MODEL 2: poisson regression for count data
model_pois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = poisson(link = "log")
)

summary(model_pois)
tidy(model_pois, exponentiate = TRUE) # exp(coef) ~ multiplicative effect
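
# monthly answer counts are large and very likely overdispersed for a poisson
# model; a quick dispersion check and quasipoisson refit (a sketch --
# dispersion_ratio and model_qpois are names introduced here)
dispersion_ratio <- sum(residuals(model_pois, type = "pearson")^2) /
  df.residual(model_pois)
dispersion_ratio # values well above 1 indicate overdispersion
model_qpois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = quasipoisson(link = "log")
)
summary(model_qpois)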

# compare predicted counts
its_data <- its_data |>
  mutate(
    pois_fitted = predict(model_pois, type = "response")
  )

p_pois_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = pois_fitted), color = "red") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "poisson regression: observed vs predicted monthly answers",
    x = "month",
    y = "answers"
  )

print(p_pois_fit)
ggsave(file.path(imgs_dir, "06_pois_fit.png"), plot = p_pois_fit, width = 10, height = 6, units = "in", dpi = 300)

# MODEL 3: arima time series forecast (pre-chatgpt vs actual)

# construct monthly ts object (frequency = 12)
start_year <- year(min(answers_monthly$month))
start_month <- month(min(answers_monthly$month))

answers_ts <- ts(
  answers_monthly$answers_total,
  start = c(start_year, start_month),
  frequency = 12
)

# train on pre-chatgpt data (up to oct 2022) and forecast forward
train_end <- c(2022, 10) # october 2022
train_ts <- window(answers_ts, end = train_end)
test_ts <- window(answers_ts, start = c(2022, 11))

arima_fit <- auto.arima(train_ts)

summary(arima_fit)

h <- length(test_ts)
fc <- forecast(arima_fit, h = h)

# compare forecast vs actual on the holdout period
fc_df <- data.frame(
  month = answers_monthly$month[answers_monthly$month >= as.Date("2022-11-01")],
  actual = as.numeric(test_ts),
  forecast = as.numeric(fc$mean),
  lower_80 = as.numeric(fc$lower[, "80%"]),
  upper_80 = as.numeric(fc$upper[, "80%"])
)

p_arima <- ggplot(fc_df, aes(x = month)) +
  geom_line(aes(y = actual), alpha = 0.6) +
  geom_line(aes(y = forecast), linetype = "dashed") +
  geom_ribbon(aes(ymin = lower_80, ymax = upper_80), alpha = 0.2) +
  labs(
    title = "arima forecast (trained on pre-chatgpt) vs actual answers",
    x = "month",
    y = "answers"
  )

print(p_arima)
ggsave(file.path(imgs_dir, "07_arima_forecast.png"), plot = p_arima, width = 10, height = 6, units = "in", dpi = 300)

# simple accuracy metrics on the holdout
fc_accuracy <- accuracy(fc, test_ts)
print(fc_accuracy)
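
# residual diagnostics for the fitted arima: forecast::checkresiduals() plots
# the residuals and their acf, and runs a ljung-box test
checkresiduals(arima_fit)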

# MODEL 4: logistic regression – does using chatgpt predict being a frequent
# stack overflow visitor?

set.seed(123)

survey_model_complete <- survey_model |>
  filter(!is.na(uses_chatgpt), !is.na(frequent_so))

candidate_predictors <- c("uses_chatgpt", "age_group", "gender", "year")
valid_predictors <- candidate_predictors[sapply(
  candidate_predictors,
  function(col) dplyr::n_distinct(survey_model_complete[[col]], na.rm = TRUE) > 1
)]
drop_predictors <- setdiff(candidate_predictors, valid_predictors)
if (length(drop_predictors) > 0) {
  message("dropping predictors with <2 levels: ", paste(drop_predictors, collapse = ", "))
}

logit_formula <- if (length(valid_predictors) == 0) {
  frequent_so ~ 1
} else {
  as.formula(paste("frequent_so ~", paste(valid_predictors, collapse = " + ")))
}

n <- nrow(survey_model_complete)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))

survey_train <- survey_model_complete[train_idx, ]
survey_test <- survey_model_complete[-train_idx, ]

# use the training base rate as the classification threshold, falling back
# to 0.5 when the rate is missing or degenerate
positive_rate <- mean(survey_train$frequent_so, na.rm = TRUE)
classification_threshold <- dplyr::case_when(
  is.na(positive_rate) ~ 0.5,
  positive_rate <= 0 ~ 0.5,
  positive_rate >= 1 ~ 0.5,
  TRUE ~ positive_rate
)

message(
  "classification threshold (training frequent_so share): ",
  round(classification_threshold, 3)
)

logit_model <- glm(
  formula = logit_formula,
  family = binomial(link = "logit"),
  data = survey_train
)

summary(logit_model)
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)

# predict on test set
survey_test <- survey_test |>
  mutate(
    pred_prob = predict(logit_model, newdata = survey_test, type = "response"),
    pred_class = if_else(pred_prob >= classification_threshold, 1L, 0L)
  )

# confusion matrix and simple metrics
conf_mat <- table(
  truth = factor(survey_test$frequent_so, levels = c(0, 1)),
  pred = factor(survey_test$pred_class, levels = c(0, 1))
)

conf_mat

tp <- conf_mat["1", "1"]
tn <- conf_mat["0", "0"]
fp <- conf_mat["0", "1"]
fn <- conf_mat["1", "0"]

# named test_accuracy to avoid shadowing forecast::accuracy()
test_accuracy <- (tp + tn) / sum(conf_mat)
precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_

list(
  accuracy = test_accuracy,
  precision = precision,
  recall = recall
)
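
# threshold-free check: auc via the rank (wilcoxon/mann-whitney) formulation,
# base r only; auc_manual is a name introduced here for illustration
auc_manual <- {
  pos <- survey_test$pred_prob[survey_test$frequent_so == 1L]
  neg <- survey_test$pred_prob[survey_test$frequent_so == 0L]
  r <- rank(c(pos, neg))
  (sum(r[seq_along(pos)]) - length(pos) * (length(pos) + 1) / 2) /
    (length(pos) * length(neg))
}
auc_manual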

# visual: predicted probability vs ai usage
p_logit_probs <- survey_test |>
  mutate(uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt")) |>
  ggplot(aes(x = uses_chatgpt_label, y = pred_prob)) +
  geom_boxplot() +
  labs(
    title = "predicted probability of being a frequent so visitor by chatgpt use",
    x = "ai usage segment",
    y = "predicted probability (logistic model)"
  )

print(p_logit_probs)
ggsave(file.path(imgs_dir, "09_logit_probs.png"), plot = p_logit_probs, width = 8, height = 6, units = "in", dpi = 300)

# save key tables and model outputs to disk for report
write_csv(answers_monthly, file.path(data_dir, "answers_monthly_clean.csv"))
write_csv(answers_summary_period, file.path(data_dir, "answers_summary_period.csv"))
write_csv(survey_counts, file.path(data_dir, "survey_ai_vs_so_visit.csv"))

saveRDS(model_lm, file.path(data_dir, "model_lm_its.rds"))
saveRDS(model_pois, file.path(data_dir, "model_pois.rds"))
saveRDS(arima_fit, file.path(data_dir, "model_arima_prechatgpt.rds"))
saveRDS(logit_model, file.path(data_dir, "model_logit_survey.rds"))