added lab 2
This commit is contained in:
@@ -0,0 +1 @@
|
||||
other/
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,166 @@
|
||||
# die
|
||||
suppressPackageStartupMessages({
|
||||
library(ggplot2)
|
||||
library(dplyr)
|
||||
library(readr)
|
||||
library(broom) # for augment/tidy
|
||||
library(scales) # for label_comma
|
||||
library(tidyr) # for crossing
|
||||
# library(lmtest) # bp test
|
||||
# library(sandwich) # good(er) ses
|
||||
})
|
||||
|
||||
setwd("/home/ion606/Desktop/Data Analytics/Lab 2")
|
||||
|
||||
# configuration
|
||||
data_path <- "NY-House-Dataset.csv"
|
||||
out_dir <- "outputs"
|
||||
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
|
||||
|
||||
# load
|
||||
raw <- read_csv(file = data_path, show_col_types = FALSE)
|
||||
|
||||
# keep numeric cols needed, drop missing
|
||||
df <- raw |>
|
||||
transmute(
|
||||
PRICE = as.numeric(PRICE),
|
||||
PROPERTYSQFT = as.numeric(PROPERTYSQFT),
|
||||
BEDS = as.numeric(BEDS),
|
||||
BATH = as.numeric(BATH)
|
||||
) |>
|
||||
filter(is.finite(PRICE), is.finite(PROPERTYSQFT), is.finite(BEDS), is.finite(BATH))
|
||||
|
||||
# basic summaries
|
||||
summary(df)
|
||||
fivenum(df$PRICE, na.rm = TRUE)
|
||||
|
||||
# no outliters with 1%/99% quantiles
|
||||
quant_trim <- function(x, lo = 0.01, hi = 0.99) {
|
||||
qs <- quantile(x, probs = c(lo, hi), na.rm = TRUE, names = FALSE)
|
||||
x >= qs[1] & x <= qs[2]
|
||||
}
|
||||
|
||||
keep <- quant_trim(df$PRICE) & quant_trim(df$PROPERTYSQFT) & quant_trim(df$BEDS) & quant_trim(df$BATH)
|
||||
|
||||
dat <- df[keep, , drop = FALSE] |>
|
||||
filter(PROPERTYSQFT > 0, PRICE > 0, BEDS >= 0, BATH >= 0) |>
|
||||
mutate(
|
||||
log_PRICE = log(PRICE),
|
||||
log_SQFT = log(PROPERTYSQFT)
|
||||
)
|
||||
|
||||
# helper to get the most significant non-intercept term (TODO: ASK PROF ABOUT THIS)
|
||||
most_sig_term <- function(model) {
|
||||
tt <- broom::tidy(model) |>
|
||||
dplyr::filter(term != "(Intercept)") |>
|
||||
dplyr::arrange(p.value)
|
||||
if (nrow(tt) == 0) return(NA_character_)
|
||||
tt$term[1]
|
||||
}
|
||||
|
||||
# model 1: PRICE ~ PROPERTYSQFT
|
||||
m1 <- lm(PRICE ~ PROPERTYSQFT, data = dat)
|
||||
cat("\n==== model 1: PRICE ~ PROPERTYSQFT ====\n")
|
||||
print(summary(m1))
|
||||
|
||||
top1 <- most_sig_term(m1)
|
||||
|
||||
p1_scatter <- ggplot(dat, aes(x = PROPERTYSQFT, y = PRICE)) +
|
||||
geom_point(alpha = 0.35) +
|
||||
stat_smooth(method = "lm", se = TRUE) +
|
||||
scale_y_continuous(labels = label_comma()) +
|
||||
labs(title = "model 1: price vs property sqft with lm fit",
|
||||
x = "property sqft", y = "price (usd)")
|
||||
|
||||
p1_resid <- augment(m1) |>
|
||||
ggplot(aes(x = .fitted, y = .resid)) +
|
||||
geom_point(alpha = 0.35) +
|
||||
geom_hline(yintercept = 0) +
|
||||
labs(title = "model 1: residuals vs fitted", x = "fitted", y = "residuals")
|
||||
|
||||
ggsave(file.path(out_dir, "m1_scatter.png"), p1_scatter, width = 7, height = 5, dpi = 150)
|
||||
ggsave(file.path(out_dir, "m1_residuals.png"), p1_resid, width = 7, height = 5, dpi = 150)
|
||||
|
||||
# model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH
|
||||
m2 <- lm(PRICE ~ PROPERTYSQFT + BEDS + BATH, data = dat)
|
||||
cat("\n==== model 2: PRICE ~ PROPERTYSQFT + BEDS + BATH ====\n")
|
||||
print(summary(m2))
|
||||
|
||||
top2 <- most_sig_term(m2)
|
||||
|
||||
xlab2 <- paste0("most significant predictor: ", top2)
|
||||
|
||||
p2_scatter <- ggplot(dat, aes_string(x = top2, y = "PRICE")) +
|
||||
geom_point(alpha = 0.35) +
|
||||
stat_smooth(method = "lm", se = TRUE) +
|
||||
scale_y_continuous(labels = label_comma()) +
|
||||
labs(title = "model 2: price vs most significant predictor with lm fit",
|
||||
x = xlab2, y = "price (usd)")
|
||||
|
||||
p2_resid <- augment(m2) |>
|
||||
ggplot(aes(x = .fitted, y = .resid)) +
|
||||
geom_point(alpha = 0.35) +
|
||||
geom_hline(yintercept = 0) +
|
||||
labs(title = "model 2: residuals vs fitted", x = "fitted", y = "residuals")
|
||||
|
||||
ggsave(file.path(out_dir, "m2_scatter.png"), p2_scatter, width = 7, height = 5, dpi = 150)
|
||||
ggsave(file.path(out_dir, "m2_residuals.png"), p2_resid, width = 7, height = 5, dpi = 150)
|
||||
|
||||
# model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH
|
||||
m3 <- lm(log_PRICE ~ log_SQFT + BEDS + BATH, data = dat)
|
||||
cat("\n==== model 3: log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH ====\n")
|
||||
print(summary(m3))
|
||||
|
||||
top3 <- most_sig_term(m3)
|
||||
|
||||
# price vs top predictor, overlay w/back-transformed fit -- hold other predictors at medians
|
||||
meds <- dat |>
|
||||
summarise(
|
||||
PROPERTYSQFT = median(PROPERTYSQFT, na.rm = TRUE),
|
||||
BEDS = median(BEDS, na.rm = TRUE),
|
||||
BATH = median(BATH, na.rm = TRUE)
|
||||
)
|
||||
|
||||
grid <- tibble::tibble(
|
||||
x = seq(min(dat[[top3]], na.rm = TRUE), max(dat[[top3]], na.rm = TRUE), length.out = 200)
|
||||
)
|
||||
|
||||
nd <- meds[rep(1, nrow(grid)), ]
|
||||
nd[[top3]] <- grid$x
|
||||
nd$log_SQFT <- log(nd$PROPERTYSQFT)
|
||||
|
||||
pred_log <- predict(m3, newdata = nd, se.fit = TRUE)
|
||||
nd$PRICE_hat <- exp(pred_log$fit) # back-transform
|
||||
|
||||
p3_scatter <- ggplot(dat, aes_string(x = top3, y = "PRICE")) +
|
||||
geom_point(alpha = 0.35) +
|
||||
geom_line(data = nd, aes_string(x = top3, y = "PRICE_hat"), linewidth = 1) +
|
||||
scale_y_continuous(labels = label_comma()) +
|
||||
labs(title = "model 3: price vs most significant predictor (back-transformed fit)",
|
||||
x = paste0("most significant predictor: ", top3), y = "price (usd)")
|
||||
|
||||
p3_resid <- augment(m3) |>
|
||||
ggplot(aes(x = .fitted, y = .resid)) +
|
||||
geom_point(alpha = 0.35) +
|
||||
geom_hline(yintercept = 0) +
|
||||
labs(title = "model 3: residuals vs fitted (log scale)", x = "fitted (log price)", y = "residuals")
|
||||
|
||||
ggsave(file.path(out_dir, "m3_scatter.png"), p3_scatter, width = 7, height = 5, dpi = 150)
|
||||
ggsave(file.path(out_dir, "m3_residuals.png"), p3_resid, width = 7, height = 5, dpi = 150)
|
||||
|
||||
# comp table
|
||||
compare <- tibble::tibble(
|
||||
model = c("PRICE ~ PROPERTYSQFT",
|
||||
"PRICE ~ PROPERTYSQFT + BEDS + BATH",
|
||||
"log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH"),
|
||||
r2 = c(summary(m1)$r.squared, summary(m2)$r.squared, summary(m3)$r.squared),
|
||||
adj_r2 = c(summary(m1)$adj.r.squared, summary(m2)$adj.r.squared, summary(m3)$adj.r.squared),
|
||||
aic = c(AIC(m1), AIC(m2), AIC(m3)),
|
||||
bic = c(BIC(m1), BIC(m2), BIC(m3)),
|
||||
top_var = c(top1, top2, top3)
|
||||
)
|
||||
print(compare)
|
||||
|
||||
readr::write_csv(compare, file.path(out_dir, "model_comparison.csv"))
|
||||
|
||||
message("\ndone!")
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 61 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 138 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 56 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 142 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 132 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 40 KiB |
@@ -0,0 +1,4 @@
|
||||
model,r2,adj_r2,aic,bic,top_var
|
||||
PRICE ~ PROPERTYSQFT,0.28368247344839936,0.2835263109180851,146322.7288184832,146342.0230707264,PROPERTYSQFT
|
||||
PRICE ~ PROPERTYSQFT + BEDS + BATH,0.3385405215516886,0.33810772363776387,145961.10108380177,145993.25817087374,PROPERTYSQFT
|
||||
log(PRICE) ~ log(PROPERTYSQFT) + BEDS + BATH,0.4401401151381639,0.43977379460281263,9572.876890519634,9605.033977591607,BATH
|
||||
|
Reference in New Issue
Block a user