This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignment IV/analysis.r
T
2025-11-22 15:25:17 -05:00

684 lines
22 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# install.packages(
#   c("tidyverse", "lubridate", "broom", "forecast", "stringr", "dplyr"),
#   repos = "http://cran.us.r-project.org"
# )
library(tidyverse)
library(lubridate)
library(broom)
library(forecast)
library(stringr)
library(dplyr)

# output directories: data files and saved plots (adjust if desired);
# created on first run
data_dir <- "data"
imgs_dir <- "imgs"
for (out_dir in c(data_dir, imgs_dir)) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
}

# constants: key event dates related to chatgpt and so policy
# chatgpt public research preview launch (openai "introducing chatgpt" blog)
chatgpt_launch_date <- as.Date("2022-11-30")
# stack overflow generative ai ban policy (meta so, 5 dec 2022)
so_ai_policy_date <- as.Date("2022-12-05")
# moderation strike on stack exchange (june-aug 2023), from meta posts
so_mod_strike_start <- as.Date("2023-06-05")
so_mod_strike_end <- as.Date("2023-08-07")
# helper: safe downloader.
# downloads `url` to `destfile` only when the file is not already present.
# bug fix vs the original: if download.file() fails partway, the partial
# file is removed, so a later run does not mistake a corrupt fragment for
# a complete download.
download_if_missing <- function(url, destfile) {
  if (file.exists(destfile)) {
    message("file already exists: ", destfile)
    return(invisible(destfile))
  }
  message("downloading ", basename(destfile), " ...")
  tryCatch(
    {
      download.file(url, destfile, mode = "wb")
      message("saved to ", destfile)
    },
    error = function(e) {
      # clean up a partially-written file before re-raising
      if (file.exists(destfile)) {
        unlink(destfile)
      }
      stop("download failed for ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
  invisible(destfile)
}
# coerce a "month" column of unknown type to Date.
# handles Date, POSIXct/POSIXlt, numeric (days since the unix epoch), and
# character input (tries datetime parsing, then date-only, then default
# as.Date parsing). bug fix vs the original: character values parsed by
# lubridate::ymd_hms() were returned as POSIXct instead of Date; the
# result is now always a Date vector.
coerce_month_to_date <- function(x) {
  if (inherits(x, "Date")) {
    return(x)
  }
  if (inherits(x, "POSIXct")) {
    return(lubridate::as_date(x))
  }
  if (inherits(x, "POSIXlt")) {
    return(as.Date(x))
  }
  if (is.numeric(x)) {
    # numeric is assumed to be days since 1970-01-01
    return(as.Date(x, origin = "1970-01-01"))
  }
  if (is.character(x)) {
    parsed <- suppressWarnings(lubridate::ymd_hms(x))
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(lubridate::ymd(x))
    }
    if (all(is.na(parsed))) {
      parsed <- suppressWarnings(as.Date(x))
    }
    # ymd_hms() yields POSIXct; normalise so callers always get Date
    return(lubridate::as_date(parsed))
  }
  # fallback for unexpected types (e.g. factor)
  suppressWarnings(as.Date(x))
}
# 1) load stack overflow monthly answers (dataset 1)
# expected sede export columns: "Month" (or "Date"), "Status", "NewAnswers"
answers_csv_path <- file.path(data_dir, "so_new_answers_per_month_2018_2025.csv")
if (!file.exists(answers_csv_path)) {
  stop(
    "missing ", answers_csv_path,
    "\nrun the sede query in this script and download the csv to that path first."
  )
}
# normalise column names to lower snake case in a single pass.
# (the original script renamed twice — once before and once after the
# mutate — which was redundant and risked re-matching renamed columns.)
answers_raw <- readr::read_csv(answers_csv_path, show_col_types = FALSE) |>
  rename(
    month = matches("^Date$|Month", ignore.case = TRUE),
    status = matches("^Status$", ignore.case = TRUE),
    new_answers = matches("NewAnswers|Count", ignore.case = TRUE)
  )
answers_raw <- answers_raw |>
  mutate(
    month = coerce_month_to_date(month),
    status = tolower(status)
  )
# inspect column names so you can adjust if sede changes them
print(names(answers_raw))
print(head(answers_raw))
# aggregate deleted vs non-deleted into separate columns per month,
# then add calendar and event-window features used by the models below.
# fix vs the original: the per-status sums now use na.rm = TRUE like
# answers_total does (so the three columns stay consistent when counts
# are missing), and avoid if_else()'s strict 0L-vs-double typing by
# summing a logical subset instead.
answers_monthly <- answers_raw |>
  mutate(
    month = as.Date(month),
    status = tolower(status)
  ) |>
  group_by(month) |>
  summarise(
    answers_total = sum(new_answers, na.rm = TRUE),
    answers_non_deleted = sum(new_answers[status == "non-deleted"], na.rm = TRUE),
    answers_deleted = sum(new_answers[status == "deleted"], na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(month) |>
  mutate(
    year = year(month),
    month_num = month(month),
    time_index = row_number(),
    # event flags relative to the dates defined at the top of the script
    post_chatgpt = month >= chatgpt_launch_date,
    post_ai_policy = month >= so_ai_policy_date,
    during_mod_strike = month >= so_mod_strike_start & month <= so_mod_strike_end,
    # two-arm case_when simplified to if_else
    period = if_else(month < chatgpt_launch_date, "pre_chatgpt", "post_chatgpt")
  )
glimpse(answers_monthly)
# 2) download and load stack overflow developer survey 2023/2024 (dataset 2)
# official survey zip files as exposed on survey.stackoverflow.co
# these urls are the same ones behind the "download full data set (csv)" links
# see: https://survey.stackoverflow.co/
survey_2023_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2023.zip"
survey_2023_zip <- file.path(data_dir, "stack-overflow-developer-survey-2023.zip")
survey_2024_url <- "https://survey.stackoverflow.co/datasets/stack-overflow-developer-survey-2024.zip"
survey_2024_zip <- file.path(data_dir, "stack-overflow-developer-survey-2024.zip")
# fetch each archive once; subsequent runs reuse the cached zip on disk
download_if_missing(survey_2023_url, survey_2023_zip)
download_if_missing(survey_2024_url, survey_2024_zip)
# helper to read the "survey_results_public.csv" inside each zip.
# reads the csv straight out of the archive via an unz() connection,
# without extracting anything to disk. `csv_pattern` is matched
# case-insensitively against member names, so the csv may live in a
# subfolder; the first match wins.
read_so_survey_from_zip <- function(zip_path, csv_pattern = "survey_results_public.csv") {
  if (!file.exists(zip_path)) {
    stop("zip file not found: ", zip_path)
  }
  # list archive members (works even when the csv is in a subfolder)
  members <- utils::unzip(zip_path, list = TRUE)$Name
  hits <- members[stringr::str_detect(members, regex(csv_pattern, ignore_case = TRUE))]
  if (length(hits) == 0) {
    stop("could not find a csv matching ", csv_pattern, " inside ", zip_path)
  }
  # read the first match without extracting to disk
  readr::read_csv(
    unz(zip_path, hits[[1]]),
    show_col_types = FALSE
    # col_types = cols(.default = col_character()) # uncomment & customize for explicit types
  )
}
survey2023_raw <- read_so_survey_from_zip(survey_2023_zip)
survey2024_raw <- read_so_survey_from_zip(survey_2024_zip)
# look at column names to locate ai + stackoverflow usage questions.
# fixes vs the original: head(..., 80) avoids NA padding when the frame
# has fewer than 80 columns (names(x)[1:80] pads with NA), and print()
# makes the output visible when the script is run via source().
print(head(names(survey2023_raw), 80))
############################################################
# create a harmonised survey subset focusing on:
# - so visit frequency (column like SOVisitFreq)
# - ai tool usage (column like AISelect or SOAI)
############################################################
# return the name of the first column of `df` whose name matches
# `pattern` (case-insensitive regex), or NA_character_ when none match.
find_first_col <- function(df, pattern) {
  rx <- regex(pattern, ignore_case = TRUE)
  hits <- stringr::str_subset(names(df), rx)
  if (length(hits) == 0) NA_character_ else hits[[1]]
}
# fetch column `col_name` from `df`. when the upstream column lookup
# failed (col_name is NA), return one `default` per row instead, so the
# downstream frame-building code keeps working with a constant column.
pull_col_or_default <- function(df, col_name, default = NA_character_) {
  if (!is.na(col_name)) {
    return(df[[col_name]])
  }
  rep(default, nrow(df))
}
# extract an age column as numeric.
# survey age answers are often categorical bands (e.g. "25-34 years old"):
# parse_number() pulls the leading number out of such strings, while
# blanks and "prefer not to say" style answers become NA.
pull_age_numeric <- function(df, col_name) {
  raw <- pull_col_or_default(df, col_name, default = NA_real_)
  if (is.numeric(raw)) {
    return(raw)
  }
  if (is.factor(raw)) {
    raw <- as.character(raw)
  }
  if (!is.character(raw)) {
    # unexpected type: best-effort numeric coercion
    return(suppressWarnings(as.numeric(raw)))
  }
  cleaned <- stringr::str_trim(raw)
  cleaned[cleaned == ""] <- NA_character_
  refused <- stringr::str_detect(cleaned, regex("prefer not to say", ignore_case = TRUE))
  cleaned[refused] <- NA_character_
  # parse_number extracts the leading numeric value (e.g., 25 from "25-34 years old")
  suppressWarnings(readr::parse_number(cleaned))
}
# locate the relevant survey columns by pattern in each year's frame.
# patterns like "^Age$|Age" prefer the exact column name but fall back to
# any column containing the word; find_first_col() returns NA_character_
# when nothing matches, which pull_col_or_default() handles downstream.
main_branch_col_2023 <- find_first_col(survey2023_raw, "^MainBranch$|MainBranch")
country_col_2023 <- find_first_col(survey2023_raw, "^Country$|Country")
age_col_2023 <- find_first_col(survey2023_raw, "^Age$|Age")
gender_col_2023 <- find_first_col(survey2023_raw, "^Gender$|Gender")
so_visit_col_2023 <- find_first_col(survey2023_raw, "SOVisitFreq")
ai_select_col_2023 <- find_first_col(survey2023_raw, "AISelect|SOAI")
# same lookups against the 2024 schema
main_branch_col_2024 <- find_first_col(survey2024_raw, "^MainBranch$|MainBranch")
country_col_2024 <- find_first_col(survey2024_raw, "^Country$|Country")
age_col_2024 <- find_first_col(survey2024_raw, "^Age$|Age")
gender_col_2024 <- find_first_col(survey2024_raw, "^Gender$|Gender")
so_visit_col_2024 <- find_first_col(survey2024_raw, "SOVisitFreq")
ai_select_col_2024 <- find_first_col(survey2024_raw, "AISelect|SOAI")
# report which columns were found (NA here means the pattern matched nothing)
message("2023 so visit col: ", so_visit_col_2023)
message("2023 ai col : ", ai_select_col_2023)
message("2024 so visit col: ", so_visit_col_2024)
message("2024 ai col : ", ai_select_col_2024)
# build a clean survey frame per year.
# the 2023 and 2024 schemas are near-identical, so one helper harmonises
# both (previously the same transmute() was duplicated verbatim).
# returns a tibble with one row per respondent and a fixed column set.
harmonise_survey <- function(df, survey_year, main_branch_col, country_col,
                             age_col, gender_col, so_visit_col, ai_col) {
  tibble(
    year = survey_year,
    main_branch = pull_col_or_default(df, main_branch_col),
    country = pull_col_or_default(df, country_col),
    age = pull_age_numeric(df, age_col),
    gender = pull_col_or_default(df, gender_col),
    so_visit = pull_col_or_default(df, so_visit_col),
    ai_select = pull_col_or_default(df, ai_col)
  )
}
survey2023 <- harmonise_survey(
  survey2023_raw, 2023L,
  main_branch_col_2023, country_col_2023, age_col_2023,
  gender_col_2023, so_visit_col_2023, ai_select_col_2023
)
survey2024 <- harmonise_survey(
  survey2024_raw, 2024L,
  main_branch_col_2024, country_col_2024, age_col_2024,
  gender_col_2024, so_visit_col_2024, ai_select_col_2024
)
survey_all <- bind_rows(survey2023, survey2024)
# engineer modelling features:
# - frequent_so: binary flag, visits so daily or multiple times per day
# - uses_chatgpt: binary flag, chatgpt appears in the ai_select multi-select
# - age_group: coarser age bands for modelling
# simplification vs the original: the follow-up filter(!is.na(frequent_so))
# was dead code (the case_when TRUE arm makes NA impossible), and the
# second mutate() re-coerced columns that were already integer; both the
# redundant filter and the duplicate coercions are folded into one pass.
survey_model <- survey_all |>
  filter(!is.na(so_visit)) |>
  mutate(
    so_visit = as.character(so_visit),
    ai_select = as.character(ai_select),
    # frequent so visitor: daily or multiple times per day etc.
    frequent_so = dplyr::case_when(
      stringr::str_detect(so_visit, regex("multiple times per day", ignore_case = TRUE)) ~ 1L,
      stringr::str_detect(so_visit, regex("daily|almost every day", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    # treat missing ai_select as "does not use chatgpt"
    uses_chatgpt = dplyr::case_when(
      is.na(ai_select) ~ 0L,
      stringr::str_detect(ai_select, regex("chatgpt", ignore_case = TRUE)) ~ 1L,
      TRUE ~ 0L
    ),
    age_group = factor(dplyr::case_when(
      !is.na(age) & age < 25 ~ "<25",
      !is.na(age) & age >= 25 & age < 35 ~ "25-34",
      !is.na(age) & age >= 35 & age < 45 ~ "35-44",
      !is.na(age) & age >= 45 ~ "45+",
      TRUE ~ "unknown"
    )),
    gender = factor(if_else(is.na(gender) | gender == "", "Unknown", gender)),
    year = factor(year)
  )
glimpse(survey_model)
# SECTION 2: data description + preliminary plots (dataset 1)
# basic time series plot of answers over time (for section 2);
# dashed vertical line = chatgpt launch, dotted = so generative-ai ban policy
p_answers_ts <- ggplot(answers_monthly, aes(x = month, y = answers_total)) +
  geom_line() +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  geom_vline(xintercept = so_ai_policy_date, linetype = "dotted") +
  labs(
    title = "monthly new answers on stack overflow",
    x = "month",
    y = "number of answers"
  )
print(p_answers_ts)
# save at report resolution
ggsave(
  filename = file.path(imgs_dir, "01_answers_ts.png"),
  plot = p_answers_ts,
  width = 10, height = 6, units = "in", dpi = 300
)
# boxplot of monthly answer counts, pre vs post chatgpt launch
# (period is assigned per month in answers_monthly)
p_box_pre_post <- ggplot(answers_monthly, aes(x = period, y = answers_total)) +
  geom_boxplot() +
  labs(
    title = "distribution of monthly answers: pre vs post chatgpt launch",
    x = "period",
    y = "monthly answers"
  )
print(p_box_pre_post)
ggsave(file.path(imgs_dir, "02_box_pre_post.png"), plot = p_box_pre_post, width = 8, height = 6, units = "in", dpi = 300)
# basic summary table: descriptive statistics of monthly answer counts
# per period (n months, mean/median/sd/min/max) for the report
answers_summary_period <- answers_monthly |>
  group_by(period) |>
  summarise(
    n_months = n(),
    mean_answers = mean(answers_total),
    median_answers = median(answers_total),
    sd_answers = sd(answers_total),
    min_answers = min(answers_total),
    max_answers = max(answers_total),
    .groups = "drop"
  )
print(answers_summary_period)
# SECTION 3: exploratory analysis
# seasonal pattern: answers by calendar month across years.
# fix vs the original: levels = 1:12 is supplied explicitly —
# factor(x, labels = month.abb) errors whenever the data does not contain
# all 12 calendar months (labels must match the number of levels).
p_seasonal <- answers_monthly |>
  mutate(month_label = factor(month_num, levels = 1:12, labels = month.abb)) |>
  ggplot(aes(x = month_label, y = answers_total, group = year, color = period)) +
  geom_line(alpha = 0.6) +
  labs(
    title = "seasonality of answers by calendar month and year",
    x = "calendar month",
    y = "monthly answers"
  )
print(p_seasonal)
ggsave(file.path(imgs_dir, "03_seasonal.png"), plot = p_seasonal, width = 10, height = 6, units = "in", dpi = 300)
# rolling 3-month moving average to smooth noise.
# fix vs the original: stats::filter() replaces zoo::rollmean(), since
# zoo was never installed or loaded by this script. sides = 1 averages
# the current and previous two months — the same right-aligned window,
# with NA for the first two months.
answers_monthly <- answers_monthly |>
  arrange(month) |>
  mutate(
    answers_ma3 = as.numeric(stats::filter(answers_total, rep(1 / 3, 3), sides = 1))
  )
p_ma3 <- ggplot(answers_monthly, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = answers_ma3)) +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "monthly answers with 3-month moving average",
    x = "month",
    y = "answers"
  )
print(p_ma3)
ggsave(file.path(imgs_dir, "04_ma3.png"), plot = p_ma3, width = 10, height = 6, units = "in", dpi = 300)
# simple percentage change around chatgpt launch
pre_window <- answers_monthly |>
filter(
month >= chatgpt_launch_date - months(6),
month < chatgpt_launch_date
)
post_window <- answers_monthly |>
filter(
month >= chatgpt_launch_date,
month < chatgpt_launch_date + months(6)
)
pre_mean <- mean(pre_window$answers_total)
post_mean <- mean(post_window$answers_total)
pct_change <- (post_mean - pre_mean) / pre_mean * 100
pct_change
# survey exploratory: relation between ai usage and so visit frequency.
# proportions are computed within each (year, chatgpt-usage) segment, so
# the stacked bars show the visit-frequency mix per segment.
survey_counts <- survey_model |>
  mutate(
    uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt"),
    freq_label = if_else(frequent_so == 1L, "visits so daily", "visits so less often")
  ) |>
  count(year, uses_chatgpt_label, freq_label) |>
  group_by(year, uses_chatgpt_label) |>
  mutate(prop = n / sum(n)) |>
  ungroup()
p_survey_bar <- ggplot(survey_counts, aes(x = uses_chatgpt_label, y = prop, fill = freq_label)) +
  geom_col(position = "fill") +
  facet_wrap(~year) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "relationship between chatgpt use and stack overflow visit frequency (survey)",
    x = "ai usage segment",
    y = "share of respondents",
    fill = "so visit frequency"
  )
print(p_survey_bar)
ggsave(file.path(imgs_dir, "08_survey_bar.png"), plot = p_survey_bar, width = 10, height = 6, units = "in", dpi = 300)
# SECTION 4: model development (four different model types)
# MODEL 1: interrupted time series (ITS) linear regression
# outcome: monthly answers_total
# predictors:
# - time: overall linear trend (1, 2, ...)
# - post_chatgpt: level (intercept) shift at the launch month
# - chatgpt_time: months elapsed since launch (0 before launch), i.e.
#   the change in slope after the launch
its_data <- answers_monthly |>
  mutate(
    time = time_index,
    # chatgpt_time counts 1, 2, ... from the first post-launch month and
    # stays 0L for every pre-launch month
    chatgpt_time = if_else(month >= chatgpt_launch_date,
      time_index - min(time_index[month >= chatgpt_launch_date]) + 1L,
      0L
    )
  )
model_lm <- lm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data
)
summary(model_lm)
tidy(model_lm)
glance(model_lm)
# predictions and plot: observed series vs the fitted segmented trend
its_data <- its_data |>
  mutate(
    lm_fitted = predict(model_lm)
  )
p_lm_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.4) +
  geom_line(aes(y = lm_fitted), color = "blue") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "interrupted time series regression: observed vs fitted answers",
    x = "month",
    y = "answers"
  )
print(p_lm_fit)
ggsave(file.path(imgs_dir, "05_lm_fit.png"), plot = p_lm_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 2: poisson regression for count data (log link), with the same
# ITS predictors as model 1; exp(coef) is a multiplicative rate effect.
# NOTE(review): monthly answer counts are large and plausibly
# overdispersed; a quasipoisson or negative binomial fit would be a
# worthwhile robustness check — confirm with a dispersion test.
model_pois <- glm(
  answers_total ~ time + post_chatgpt + chatgpt_time,
  data = its_data,
  family = poisson(link = "log")
)
summary(model_pois)
tidy(model_pois, exponentiate = TRUE) # exp(coef) ~ multiplicative effect
# compare predicted counts on the response (count) scale
its_data <- its_data |>
  mutate(
    pois_fitted = predict(model_pois, type = "response")
  )
p_pois_fit <- ggplot(its_data, aes(x = month)) +
  geom_line(aes(y = answers_total), alpha = 0.3) +
  geom_line(aes(y = pois_fitted), color = "red") +
  geom_vline(xintercept = chatgpt_launch_date, linetype = "dashed") +
  labs(
    title = "poisson regression: observed vs predicted monthly answers",
    x = "month",
    y = "answers"
  )
print(p_pois_fit)
ggsave(file.path(imgs_dir, "06_pois_fit.png"), plot = p_pois_fit, width = 10, height = 6, units = "in", dpi = 300)
# MODEL 3: arima counterfactual forecast (pre-chatgpt vs actual).
# fit on the pre-chatgpt months only, forecast the post-launch period,
# and compare against what actually happened.
# construct monthly ts object (frequency = 12)
start_year <- year(min(answers_monthly$month))
start_month <- month(min(answers_monthly$month))
answers_ts <- ts(
  answers_monthly$answers_total,
  start = c(start_year, start_month),
  frequency = 12
)
# train on pre-chatgpt data (up to oct 2022) and forecast forward;
# november 2022 onwards is the holdout
train_end <- c(2022, 10) # october 2022
train_ts <- window(answers_ts, end = train_end)
test_ts <- window(answers_ts, start = c(2022, 11))
arima_fit <- auto.arima(train_ts)
summary(arima_fit)
# forecast horizon = length of the holdout
h <- length(test_ts)
fc <- forecast(arima_fit, h = h)
# compare forecast vs actual on the holdout period.
# NOTE(review): this assumes answers_monthly has exactly one row per
# month with no gaps, so the filtered months line up 1:1 with test_ts —
# verify against the upstream aggregation.
fc_df <- data.frame(
  month = answers_monthly$month[answers_monthly$month >= as.Date("2022-11-01")],
  actual = as.numeric(test_ts),
  forecast = as.numeric(fc$mean),
  lower_80 = as.numeric(fc$lower[, "80%"]),
  upper_80 = as.numeric(fc$upper[, "80%"])
)
p_arima <- ggplot(fc_df, aes(x = month)) +
  geom_line(aes(y = actual), alpha = 0.6) +
  geom_line(aes(y = forecast), linetype = "dashed") +
  geom_ribbon(aes(ymin = lower_80, ymax = upper_80), alpha = 0.2) +
  labs(
    title = "arima forecast (trained on pre-chatgpt) vs actual answers",
    x = "month",
    y = "answers"
  )
print(p_arima)
ggsave(file.path(imgs_dir, "07_arima_forecast.png"), plot = p_arima, width = 10, height = 6, units = "in", dpi = 300)
# simple accuracy metrics (RMSE/MAE/MAPE etc.) on the holdout
fc_accuracy <- accuracy(fc, test_ts)
print(fc_accuracy)
# MODEL 4: logistic regression — does using chatgpt predict being a
# frequent stack overflow visitor?
set.seed(123)
survey_model_complete <- survey_model |>
  filter(!is.na(uses_chatgpt), !is.na(frequent_so))
# keep only predictors with at least two observed levels; glm() fails on
# constant factors. vapply() replaces sapply() for a type-stable result.
candidate_predictors <- c("uses_chatgpt", "age_group", "gender", "year")
valid_predictors <- candidate_predictors[vapply(
  candidate_predictors,
  function(col) dplyr::n_distinct(survey_model_complete[[col]], na.rm = TRUE) > 1,
  logical(1)
)]
drop_predictors <- setdiff(candidate_predictors, valid_predictors)
if (length(drop_predictors) > 0) {
  message("dropping predictors with <2 levels: ", paste(drop_predictors, collapse = ", "))
}
# intercept-only fallback when every candidate predictor is degenerate
logit_formula <- if (length(valid_predictors) == 0) {
  frequent_so ~ 1
} else {
  as.formula(paste("frequent_so ~", paste(valid_predictors, collapse = " + ")))
}
# 80/20 train/test split
n <- nrow(survey_model_complete)
train_idx <- sample(seq_len(n), size = floor(0.8 * n))
survey_train <- survey_model_complete[train_idx, ]
survey_test <- survey_model_complete[-train_idx, ]
# classification threshold = training prevalence of the positive class,
# falling back to 0.5 for degenerate training sets. a plain if/else
# replaces the original scalar dplyr::case_when(): case_when is a
# vectorised tool and is unidiomatic (and slower) for a single value.
positive_rate <- mean(survey_train$frequent_so, na.rm = TRUE)
classification_threshold <- if (
  is.na(positive_rate) || positive_rate <= 0 || positive_rate >= 1
) {
  0.5
} else {
  positive_rate
}
message(
  "classification threshold (training frequent_so share): ",
  round(classification_threshold, 3)
)
logit_model <- glm(
  formula = logit_formula,
  family = binomial(link = "logit"),
  data = survey_train
)
summary(logit_model)
tidy(logit_model, exponentiate = TRUE, conf.int = TRUE)
# predict on the held-out 20% and evaluate at the chosen threshold
survey_test <- survey_test |>
  mutate(
    pred_prob = predict(logit_model, newdata = survey_test, type = "response"),
    pred_class = if_else(pred_prob >= classification_threshold, 1L, 0L)
  )
# confusion matrix; explicit levels guarantee a full 2x2 table even when
# one class is absent from the test truth or predictions
conf_mat <- table(
  truth = factor(survey_test$frequent_so, levels = c(0, 1)),
  pred = factor(survey_test$pred_class, levels = c(0, 1))
)
conf_mat
tp <- conf_mat["1", "1"]
tn <- conf_mat["0", "0"]
fp <- conf_mat["0", "1"]
fn <- conf_mat["1", "0"]
# fix vs the original: the metric variable is named logit_accuracy, not
# `accuracy`, so it does not mask forecast::accuracy(), which this
# script calls earlier for the arima holdout metrics.
logit_accuracy <- (tp + tn) / sum(conf_mat)
logit_precision <- if ((tp + fp) > 0) tp / (tp + fp) else NA_real_
logit_recall <- if ((tp + fn) > 0) tp / (tp + fn) else NA_real_
list(
  accuracy = logit_accuracy,
  precision = logit_precision,
  recall = logit_recall
)
# visual: distribution of predicted probabilities on the test set,
# split by whether the respondent reported using chatgpt
p_logit_probs <- survey_test |>
  mutate(uses_chatgpt_label = if_else(uses_chatgpt == 1L, "uses chatgpt", "does not use chatgpt")) |>
  ggplot(aes(x = uses_chatgpt_label, y = pred_prob)) +
  geom_boxplot() +
  labs(
    title = "predicted probability of being a frequent so visitor by chatgpt use",
    x = "ai usage segment",
    y = "predicted probability (logistic model)"
  )
print(p_logit_probs)
ggsave(file.path(imgs_dir, "09_logit_probs.png"), plot = p_logit_probs, width = 8, height = 6, units = "in", dpi = 300)
# save key tables (csv) and fitted model objects (rds) to disk so the
# report can reuse them without rerunning the full analysis
write_csv(answers_monthly, file.path(data_dir, "answers_monthly_clean.csv"))
write_csv(answers_summary_period, file.path(data_dir, "answers_summary_period.csv"))
write_csv(survey_counts, file.path(data_dir, "survey_ai_vs_so_visit.csv"))
saveRDS(model_lm, file.path(data_dir, "model_lm_its.rds"))
saveRDS(model_pois, file.path(data_dir, "model_pois.rds"))
saveRDS(arima_fit, file.path(data_dir, "model_arima_prechatgpt.rds"))
saveRDS(logit_model, file.path(data_dir, "model_logit_survey.rds"))