# die suppressPackageStartupMessages({ library(ggplot2) library(dplyr) library(readr) library(broom) # for augment/tidy library(scales) # for label_comma library(tidyr) # for crossing # library(lmtest) # bp test # library(sandwich) # good(er) ses }) setwd("/home/ion606/Desktop/Data Analytics/Lab 2") # configuration data_path <- "NY-House-Dataset.csv" out_dir <- "outputs" if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE) # load raw <- read_csv(file = data_path, show_col_types = FALSE) # drop missing df <- raw |> transmute( PRICE = as.numeric(PRICE), PROPERTYSQFT = as.numeric(PROPERTYSQFT), BEDS = as.numeric(BEDS), BATH = as.numeric(BATH) ) |> filter(is.finite(PRICE), is.finite(PROPERTYSQFT), is.finite(BEDS), is.finite(BATH)) # basic summaries summary(df) fivenum(df$PRICE, na.rm = TRUE) # no outliters with 1%/99% quantiles quant_trim <- function(x, lo = 0.01, hi = 0.99) { qs <- quantile(x, probs = c(lo, hi), na.rm = TRUE, names = FALSE) x >= qs[1] & x <= qs[2] } keep <- quant_trim(df$PRICE) & quant_trim(df$PROPERTYSQFT) & quant_trim(df$BEDS) & quant_trim(df$BATH) dat <- df[keep, , drop = FALSE] |> filter(PROPERTYSQFT > 0, PRICE > 0, BEDS >= 0, BATH >= 0) |> mutate( log_PRICE = log(PRICE), log_SQFT = log(PROPERTYSQFT) ) # helper to get the most significant non-intercept term (TODO: ASK PROF ABOUT THIS) most_sig_term <- function(model) { tt <- broom::tidy(model) |> dplyr::filter(term != "(Intercept)") |> dplyr::arrange(p.value) if (nrow(tt) == 0) return(NA_character_) tt$term[1] } # model 1: PRICE ~ PROPERTYSQFT m1 <- lm(PRICE ~ PROPERTYSQFT, data = dat) cat("\n==== model 1: PRICE ~ PROPERTYSQFT ====\n") print(summary(m1)) top1 <- most_sig_term(m1) p1_scatter <- ggplot(dat, aes(x = PROPERTYSQFT, y = PRICE)) + geom_point(alpha = 0.35) + stat_smooth(method = "lm", se = TRUE) + scale_y_continuous(labels = label_comma()) + labs(title = "model 1: price vs property sqft with lm fit", x = "property sqft", y = "price (usd)") p1_resid <- augment(m1) |> ggplot(aes(x = .fitted, y = .resid)) + geom_point(alpha = 0.35) + geom_hline(yintercept = 0) + labs(title = "model 1: residuals vs fitted", x = "fitted", y = "residuals") ggsave(file.path(out_dir, "m1_scatter.png"), p1_scatter, width = 7, height = 5, dpi = 150) ggsave(file.path(out_dir, "m1_residuals.png"), p1_resid, width = 7, height = 5, dpi = 150) # model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH m2 <- lm(PRICE ~ PROPERTYSQFT + BEDS + BATH, data = dat) cat("\n==== model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH ====\n") print(summary(m2)) top2 <- most_sig_term(m2) xlab2 <- paste0("most significant predictor: ", top2) p2_scatter <- ggplot(dat, aes_string(x = top2, y = "PRICE")) + geom_point(alpha = 0.35) + stat_smooth(method = "lm", se = TRUE) + scale_y_continuous(labels = label_comma()) + labs(title = "model 2: price vs most significant predictor with lm fit", x = xlab2, y = "price (usd)") p2_resid <- augment(m2) |> ggplot(aes(x = .fitted, y = .resid)) + geom_point(alpha = 0.35) + geom_hline(yintercept = 0) + labs(title = "model 2: residuals vs fitted", x = "fitted", y = "residuals") ggsave(file.path(out_dir, "m2_scatter.png"), p2_scatter, width = 7, height = 5, dpi = 150) ggsave(file.path(out_dir, "m2_residuals.png"), p2_resid, width = 7, height = 5, dpi = 150) # model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH m3 <- lm(log_PRICE ~ log_SQFT + BEDS + BATH, data = dat) cat("\n==== model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH ====\n") print(summary(m3)) top3 <- most_sig_term(m3) # price vs top predictor, overlay w/back-transformed fit -- hold other predictors at medians meds <- dat |> summarise( PROPERTYSQFT = median(PROPERTYSQFT, na.rm = TRUE), BEDS = median(BEDS, na.rm = TRUE), BATH = median(BATH, na.rm = TRUE) ) grid <- tibble::tibble( x = seq(min(dat[[top3]], na.rm = TRUE), max(dat[[top3]], na.rm = TRUE), length.out = 200) ) nd <- meds[rep(1, nrow(grid)), ] nd[[top3]] <- grid$x nd$log_SQFT <- log(nd$PROPERTYSQFT) pred_log <- predict(m3, newdata = nd, se.fit = TRUE) nd$PRICE_hat <- exp(pred_log$fit) # back-transform p3_scatter <- ggplot(dat, aes_string(x = top3, y = "PRICE")) + geom_point(alpha = 0.35) + geom_line(data = nd, aes_string(x = top3, y = "PRICE_hat"), linewidth = 1) + scale_y_continuous(labels = label_comma()) + labs(title = "model 3: price vs most significant predictor (back-transformed fit)", x = paste0("most significant predictor: ", top3), y = "price (usd)") p3_resid <- augment(m3) |> ggplot(aes(x = .fitted, y = .resid)) + geom_point(alpha = 0.35) + geom_hline(yintercept = 0) + labs(title = "model 3: residuals vs fitted (log scale)", x = "fitted (log price)", y = "residuals") ggsave(file.path(out_dir, "m3_scatter.png"), p3_scatter, width = 7, height = 5, dpi = 150) ggsave(file.path(out_dir, "m3_residuals.png"), p3_resid, width = 7, height = 5, dpi = 150) # comp table compare <- tibble::tibble( model = c("PRICE ~ PROPERTYSQFT", "PRICE ~ PROPERTYSQFT + BEDS + BATH", "log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH"), r2 = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared), adj_r2 = c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared), aic = c(AIC(m1), AIC(m2), AIC(m3)), bic = c(BIC(m1), BIC(m2), BIC(m3)), top_var = c(top1, top2, top3) ) print(compare) readr::write_csv(compare, file.path(out_dir, "model_comparison.csv")) message("\ndone!")