505 lines
15 KiB
R
505 lines
15 KiB
R
# install.packages(
|
|
# c("dplyr", "ggplot2", "randomForest", "caret", "nnet", "e1071", "scales"),
|
|
# repos = "https://cloud.r-project.org"
|
|
# )
|
|
|
|
library(dplyr)
|
|
library(ggplot2)
|
|
library(randomForest)
|
|
library(caret)
|
|
library(nnet)
|
|
library(e1071)
|
|
library(scales)
|
|
|
|
# load data / basic subsets
|
|
options(stringsAsFactors = FALSE)
|
|
set.seed(42L)
|
|
|
|
file_path <- "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"
|
|
|
|
# columns we actually need
|
|
cols_needed <- c(
|
|
"BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
|
|
"TAX CLASS AS OF FINAL ROLL", "BLOCK", "LOT",
|
|
"BUILDING CLASS AS OF FINAL ROLL", "ZIP CODE",
|
|
"RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
|
|
"LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT",
|
|
"TAX CLASS AT TIME OF SALE", "BUILDING CLASS AT TIME OF SALE",
|
|
"SALE PRICE", "SALE DATE"
|
|
)
|
|
|
|
nyc <- read.csv(file_path, stringsAsFactors = FALSE, check.names = FALSE)
|
|
nyc <- nyc[, intersect(cols_needed, colnames(nyc))]
|
|
|
|
# force borough numeric
|
|
nyc$BOROUGH <- suppressWarnings(as.numeric(nyc$BOROUGH))
|
|
manhattan_raw <- nyc %>% filter(BOROUGH == 1)
|
|
brooklyn_raw <- nyc %>% filter(BOROUGH == 3)
|
|
|
|
cat("raw manhattan rows:", nrow(manhattan_raw), "\n")
|
|
cat("raw brooklyn rows:", nrow(brooklyn_raw), "\n")
|
|
|
|
|
|
clean_borough <- function(df, min_price = 10000) {
|
|
# numeric from char / factor with commas
|
|
parse_numeric <- function(x) {
|
|
x <- as.character(x)
|
|
x <- trimws(x)
|
|
x[x %in% c("0", "0.0", "- 0", "", ".", "NA", "NaN")] <- NA
|
|
x <- gsub(",", "", x, fixed = TRUE)
|
|
suppressWarnings(as.numeric(x))
|
|
}
|
|
|
|
df <- df
|
|
|
|
# convert TO COMPUTER SCIENCE ITWS OVERRATED
|
|
df$`SALE PRICE` <- parse_numeric(df$`SALE PRICE`)
|
|
df$`LAND SQUARE FEET` <- parse_numeric(df$`LAND SQUARE FEET`)
|
|
df$`GROSS SQUARE FEET` <- parse_numeric(df$`GROSS SQUARE FEET`)
|
|
df$`YEAR BUILT` <- parse_numeric(df$`YEAR BUILT`)
|
|
|
|
unit_cols <- c("RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS")
|
|
for (col in unit_cols) {
|
|
if (col %in% names(df)) {
|
|
df[[col]] <- parse_numeric(df[[col]])
|
|
}
|
|
}
|
|
|
|
# drop non-arms-length / tiny sales
|
|
df <- df %>%
|
|
filter(!is.na(`SALE PRICE`)) %>%
|
|
filter(`SALE PRICE` > min_price)
|
|
|
|
# very old or zero years --> missing
|
|
df$`YEAR BUILT`[df$`YEAR BUILT` < 1800] <- NA
|
|
|
|
# need usable size / year
|
|
df <- df %>%
|
|
filter(
|
|
!is.na(`GROSS SQUARE FEET`),
|
|
!is.na(`LAND SQUARE FEET`),
|
|
!is.na(`YEAR BUILT`),
|
|
`GROSS SQUARE FEET` > 0,
|
|
`LAND SQUARE FEET` > 0
|
|
)
|
|
|
|
# fill missing units with 0 because I am creative 10
|
|
for (col in unit_cols) {
|
|
if (col %in% names(df)) {
|
|
df[[col]][is.na(df[[col]])] <- 0
|
|
}
|
|
}
|
|
|
|
# I AM LAZY (create new cols)
|
|
df <- df %>%
|
|
rename(
|
|
neighborhood = NEIGHBORHOOD
|
|
) %>%
|
|
mutate(
|
|
land_sqft = `LAND SQUARE FEET`,
|
|
gross_sqft = `GROSS SQUARE FEET`,
|
|
year_built = `YEAR BUILT`,
|
|
res_units = `RESIDENTIAL UNITS`,
|
|
comm_units = `COMMERCIAL UNITS`,
|
|
total_units = `TOTAL UNITS`,
|
|
sale_price = `SALE PRICE`
|
|
)
|
|
|
|
df
|
|
}
|
|
|
|
# http://localhost:21486/library/psych/html/manhattan.html
|
|
manhattan <- clean_borough(manhattan_raw)
|
|
brooklyn <- clean_borough(brooklyn_raw)
|
|
|
|
cat("clean manhattan rows:", nrow(manhattan), "\n")
|
|
cat("clean brooklyn rows:", nrow(brooklyn), "\n")
|
|
|
|
# manhattan exploratory data analysis
|
|
|
|
# summary stats for sale price
|
|
summary_manhattan_price <- summary(manhattan$sale_price)
|
|
print(summary_manhattan_price)
|
|
|
|
quantiles_manhattan <- quantile(
|
|
manhattan$sale_price,
|
|
probs = c(0.25, 0.5, 0.75, 0.9, 0.95, 0.99),
|
|
na.rm = TRUE
|
|
)
|
|
|
|
print(quantiles_manhattan)
|
|
|
|
# iqr-based outlier bounds
|
|
q1 <- quantiles_manhattan[1]
|
|
q3 <- quantiles_manhattan[3]
|
|
iqr <- q3 - q1
|
|
lower_bound <- q1 - 1.5 * iqr
|
|
upper_bound <- q3 + 1.5 * iqr
|
|
|
|
cat("manhattan iqr upper bound:", upper_bound, "\n")
|
|
cat("manhattan max sale price:", max(manhattan$sale_price, na.rm = TRUE), "\n")
|
|
|
|
outlier_mask <- (manhattan$sale_price < lower_bound) |
|
|
(manhattan$sale_price > upper_bound)
|
|
cat("number of sale price outliers:", sum(outlier_mask, na.rm = TRUE), "\n")
|
|
|
|
|
|
# correlation with other numeric vars
|
|
num_cols <- c(
|
|
"sale_price", "gross_sqft", "land_sqft",
|
|
"year_built", "res_units", "comm_units", "total_units"
|
|
)
|
|
|
|
corr_manhattan <- cor(manhattan[, num_cols], use = "complete.obs")
|
|
print(corr_manhattan[, "sale_price"])
|
|
|
|
|
|
# histogram of raw sale prices
|
|
p_hist_raw <- ggplot(manhattan, aes(x = sale_price)) +
|
|
geom_histogram(bins = 50, color = "black", fill = NA) +
|
|
scale_x_continuous(labels = comma) +
|
|
labs(
|
|
title = "manhattan sale price distribution (raw)",
|
|
x = "sale price (usd)",
|
|
y = "count of sales"
|
|
)
|
|
|
|
|
|
# histogram of log(1 + sale price)
|
|
p_hist_log <- ggplot(manhattan, aes(x = log1p(sale_price))) +
|
|
geom_histogram(bins = 50, color = "black", fill = NA) +
|
|
labs(
|
|
title = "manhattan sale price distribution (log scale)",
|
|
x = "log(1 + sale price)",
|
|
y = "count of sales"
|
|
)
|
|
|
|
|
|
# boxplot of sale price (for show outliers)
|
|
p_box <- ggplot(manhattan, aes(y = sale_price)) +
|
|
geom_boxplot(outlier.alpha = 0.4) +
|
|
scale_y_continuous(labels = comma) +
|
|
labs(
|
|
title = "manhattan sale price with outliers",
|
|
y = "sale price (usd)",
|
|
x = ""
|
|
)
|
|
|
|
|
|
# scatter: gross sqft vs sale price (log y)
|
|
p_scatter <- ggplot(manhattan, aes(x = gross_sqft, y = sale_price)) +
|
|
geom_point(alpha = 0.3) +
|
|
scale_y_continuous(trans = "log10", labels = comma) +
|
|
labs(
|
|
title = "manhattan sale price vs gross square feet",
|
|
x = "gross square feet",
|
|
y = "sale price (log10 scale)"
|
|
)
|
|
|
|
|
|
# print or save plots as needed
|
|
print(p_hist_raw)
|
|
print(p_hist_log)
|
|
print(p_box)
|
|
print(p_scatter)
|
|
|
|
# regression analysis (manhattan)
|
|
|
|
reg_vars <- c(
|
|
"land_sqft", "gross_sqft", "year_built",
|
|
"res_units", "comm_units", "total_units"
|
|
)
|
|
|
|
reg_df <- manhattan %>%
|
|
select(all_of(reg_vars), sale_price) %>%
|
|
tidyr::drop_na()
|
|
|
|
reg_df$log_price <- log1p(reg_df$sale_price)
|
|
|
|
set.seed(42L)
|
|
|
|
train_idx_reg <- createDataPartition(reg_df$log_price, p = 0.75, list = FALSE)
|
|
train_reg <- reg_df[train_idx_reg, ]
|
|
test_reg <- reg_df[-train_idx_reg, ]
|
|
|
|
# linear regression
|
|
lm_fit <- lm(
|
|
log_price ~ land_sqft + gross_sqft + year_built +
|
|
res_units + comm_units + total_units,
|
|
data = train_reg
|
|
)
|
|
|
|
lm_pred <- predict(lm_fit, newdata = test_reg)
|
|
r2_lm <- cor(test_reg$log_price, lm_pred)^2
|
|
rmse_lm <- sqrt(mean((test_reg$log_price - lm_pred)^2))
|
|
mae_lm <- mean(abs(test_reg$log_price - lm_pred))
|
|
|
|
cat("\nlinear model (log price) metrics on manhattan:\n")
|
|
cat("r2:", r2_lm, " rmse:", rmse_lm, " mae:", mae_lm, "\n")
|
|
|
|
# random forest regression on log price
|
|
set.seed(42L)
|
|
|
|
rf_fit <- randomForest(
|
|
x = train_reg[, reg_vars],
|
|
y = train_reg$log_price,
|
|
ntree = 200,
|
|
mtry = 3,
|
|
maxnodes = 100,
|
|
importance = TRUE
|
|
)
|
|
|
|
rf_pred <- predict(rf_fit, newdata = test_reg[, reg_vars])
|
|
r2_rf <- cor(test_reg$log_price, rf_pred)^2
|
|
rmse_rf <- sqrt(mean((test_reg$log_price - rf_pred)^2))
|
|
mae_rf <- mean(abs(test_reg$log_price - rf_pred))
|
|
|
|
cat("\nrandom forest (log price) metrics on manhattan:\n")
|
|
cat("r2:", r2_rf, " rmse:", rmse_rf, " mae:", mae_rf, "\n")
|
|
|
|
# predicted vs actual plot (manhattan)
|
|
rf_diag_df <- data.frame(
|
|
actual = test_reg$log_price,
|
|
predicted = rf_pred
|
|
)
|
|
|
|
p_rf_pred_vs_actual <- ggplot(rf_diag_df, aes(x = actual, y = predicted)) +
|
|
geom_point(alpha = 0.3) +
|
|
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
|
|
labs(
|
|
title = "random forest: predicted vs actual log(1 + sale price) (manhattan)",
|
|
x = "actual log(1 + sale price)",
|
|
y = "predicted log(1 + sale price)"
|
|
)
|
|
|
|
|
|
# residuals vs predicted plot (manhattan)
|
|
rf_diag_df$residual <- rf_diag_df$actual - rf_diag_df$predicted
|
|
p_rf_resid <- ggplot(rf_diag_df, aes(x = predicted, y = residual)) +
|
|
geom_point(alpha = 0.3) +
|
|
geom_hline(yintercept = 0, linetype = "dashed") +
|
|
labs(
|
|
title = "random forest residuals (manhattan)",
|
|
x = "predicted log(1 + sale price)",
|
|
y = "residual"
|
|
)
|
|
|
|
print(p_rf_pred_vs_actual)
|
|
print(p_rf_resid)
|
|
|
|
|
|
# apply manhattan regression model to brooklyn
|
|
|
|
brook_reg_df <- brooklyn %>%
|
|
select(all_of(reg_vars), sale_price) %>%
|
|
tidyr::drop_na()
|
|
brook_reg_df$log_price <- log1p(brook_reg_df$sale_price)
|
|
|
|
rf_pred_brook <- predict(rf_fit, newdata = brook_reg_df[, reg_vars])
|
|
r2_rf_brook <- cor(brook_reg_df$log_price, rf_pred_brook)^2
|
|
|
|
rmse_rf_brook <- sqrt(mean((brook_reg_df$log_price - rf_pred_brook)^2))
|
|
mae_rf_brook <- mean(abs(brook_reg_df$log_price - rf_pred_brook))
|
|
|
|
cat("\nrandom forest (log price) metrics on brooklyn (trained on manhattan):\n")
|
|
cat("r2:", r2_rf_brook, " rmse:", rmse_rf_brook, " mae:", mae_rf_brook, "\n")
|
|
|
|
brook_diag_df <- data.frame(
|
|
actual = brook_reg_df$log_price,
|
|
predicted = rf_pred_brook
|
|
)
|
|
|
|
p_brook_pred_vs_actual <- ggplot(brook_diag_df, aes(x = actual, y = predicted)) +
|
|
geom_point(alpha = 0.3) +
|
|
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
|
|
labs(
|
|
title = "random forest: manhattan model on brooklyn (log price)",
|
|
x = "actual log(1 + sale price) (brooklyn)",
|
|
y = "predicted log(1 + sale price)"
|
|
)
|
|
|
|
|
|
brook_diag_df$residual <- brook_diag_df$actual - brook_diag_df$predicted
|
|
|
|
p_brook_resid <- ggplot(brook_diag_df, aes(x = predicted, y = residual)) +
|
|
geom_point(alpha = 0.3) +
|
|
geom_hline(yintercept = 0, linetype = "dashed") +
|
|
labs(
|
|
title = "residuals on brooklyn using manhattan random forest",
|
|
x = "predicted log(1 + sale price)",
|
|
y = "residual"
|
|
)
|
|
|
|
print(p_brook_pred_vs_actual)
|
|
print(p_brook_resid)
|
|
|
|
|
|
# classification: manhattan predict neighborhood
|
|
|
|
clf_vars <- c(
|
|
"sale_price", "land_sqft", "gross_sqft",
|
|
"year_built", "res_units", "comm_units", "total_units"
|
|
)
|
|
|
|
prepare_classification_df <- function(df, min_per_class = 100L) {
|
|
df <- df %>%
|
|
filter(!is.na(neighborhood))
|
|
|
|
counts <- table(df$neighborhood)
|
|
keep <- names(counts[counts >= min_per_class])
|
|
|
|
df <- df %>%
|
|
filter(neighborhood %in% keep) %>%
|
|
mutate(neighborhood = factor(neighborhood)) %>%
|
|
select(neighborhood, all_of(clf_vars)) %>%
|
|
tidyr::drop_na()
|
|
|
|
df
|
|
}
|
|
|
|
manhattan_clf <- prepare_classification_df(manhattan, min_per_class = 100L)
|
|
|
|
cat("\nmanhattan classification subset rows:", nrow(manhattan_clf), "\n")
|
|
cat("manhattan neighborhoods:", nlevels(manhattan_clf$neighborhood), "\n")
|
|
|
|
set.seed(42L)
|
|
|
|
train_idx_clf <- createDataPartition(manhattan_clf$neighborhood, p = 0.75, list = FALSE)
|
|
train_clf <- manhattan_clf[train_idx_clf, ]
|
|
test_clf <- manhattan_clf[-train_idx_clf, ]
|
|
|
|
# helper function for macro f1 from a confusionMatrix object
|
|
macro_f1_from_cm <- function(cm_obj) {
|
|
byc <- cm_obj$byClass
|
|
|
|
if (!is.matrix(byc)) {
|
|
precision <- byc["Pos Pred Value"]
|
|
recall <- byc["Sensitivity"]
|
|
return(2 * precision * recall / (precision + recall))
|
|
} else {
|
|
precision <- byc[, "Pos Pred Value"]
|
|
recall <- byc[, "Sensitivity"]
|
|
f1 <- 2 * precision * recall / (precision + recall)
|
|
mean(f1, na.rm = TRUE)
|
|
}
|
|
}
|
|
|
|
# k-nn classifier (<- ->)
|
|
|
|
ctrl_none <- trainControl(method = "none")
|
|
set.seed(42L)
|
|
|
|
knn_fit <- train(
|
|
neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
|
|
res_units + comm_units + total_units,
|
|
data = train_clf,
|
|
method = "knn",
|
|
preProcess = c("center", "scale"),
|
|
tuneGrid = data.frame(k = 7),
|
|
trControl = ctrl_none
|
|
)
|
|
|
|
knn_pred <- predict(knn_fit, newdata = test_clf)
|
|
cm_knn <- confusionMatrix(knn_pred, test_clf$neighborhood)
|
|
macro_f1_knn <- macro_f1_from_cm(cm_knn)
|
|
|
|
cat(
|
|
"\nknn (manhattan) accuracy:", cm_knn$overall["Accuracy"],
|
|
" macro f1:", macro_f1_knn, "\n"
|
|
)
|
|
|
|
# random forest classifier
|
|
set.seed(42L)
|
|
rf_clf_fit <- randomForest(
|
|
neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
|
|
res_units + comm_units + total_units,
|
|
data = train_clf,
|
|
ntree = 300,
|
|
mtry = 3
|
|
)
|
|
|
|
rf_clf_pred <- predict(rf_clf_fit, newdata = test_clf)
|
|
cm_rf_clf <- confusionMatrix(rf_clf_pred, test_clf$neighborhood)
|
|
macro_f1_rf_clf <- macro_f1_from_cm(cm_rf_clf)
|
|
|
|
cat(
|
|
"\nrandom forest classifier (manhattan) accuracy:",
|
|
cm_rf_clf$overall["Accuracy"],
|
|
" macro f1:", macro_f1_rf_clf, "\n"
|
|
)
|
|
|
|
# ex contingency table
|
|
rf_cm_table <- cm_rf_clf$table
|
|
print(dim(rf_cm_table))
|
|
|
|
# num neighborhoods x num neighborhoods
|
|
print(rf_cm_table)
|
|
|
|
# multinomial logistic regression
|
|
|
|
set.seed(42L)
|
|
logit_fit <- multinom(
|
|
neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
|
|
res_units + comm_units + total_units,
|
|
data = train_clf,
|
|
MaxNWts = 10000,
|
|
maxit = 2000,
|
|
trace = FALSE
|
|
)
|
|
|
|
logit_pred <- predict(logit_fit, newdata = test_clf)
|
|
cm_logit <- confusionMatrix(logit_pred, test_clf$neighborhood)
|
|
macro_f1_logit <- macro_f1_from_cm(cm_logit)
|
|
|
|
cat(
|
|
"\nmultinomial logistic regression (manhattan) accuracy:",
|
|
cm_logit$overall["Accuracy"],
|
|
" macro f1:", macro_f1_logit, "\n"
|
|
)
|
|
|
|
# use manhattan classifiers on brooklyn
|
|
|
|
brooklyn_clf <- prepare_classification_df(brooklyn, min_per_class = 100L)
|
|
cat("\nbrooklyn classification subset rows:", nrow(brooklyn_clf), "\n")
|
|
cat("brooklyn neighborhoods:", nlevels(brooklyn_clf$neighborhood), "\n")
|
|
|
|
# predictions from manhattan-trained models on brooklyn data
|
|
knn_pred_brook <- predict(knn_fit, newdata = brooklyn_clf)
|
|
rf_pred_brook <- predict(rf_clf_fit, newdata = brooklyn_clf)
|
|
logit_pred_brook <- predict(logit_fit, newdata = brooklyn_clf)
|
|
|
|
# contingency tables (true brooklyn neigh vs predicted manhattan neigh)
|
|
# these will be essentially all off-diagonal because label sets differ
|
|
# idk how to make this look better though :(
|
|
tab_knn_brook <- table(true = brooklyn_clf$neighborhood, pred = knn_pred_brook)
|
|
tab_rf_brook <- table(true = brooklyn_clf$neighborhood, pred = rf_pred_brook)
|
|
tab_logit_brook <- table(true = brooklyn_clf$neighborhood, pred = logit_pred_brook)
|
|
|
|
cat("\ncontingency table dimensions (knn):", dim(tab_knn_brook), "\n")
|
|
cat("contingency table dimensions (random forest):", dim(tab_rf_brook), "\n")
|
|
cat("contingency table dimensions (logit):", dim(tab_logit_brook), "\n")
|
|
|
|
|
|
plots <- list(
|
|
manhattan_hist_raw = p_hist_raw,
|
|
manhattan_hist_log = p_hist_log,
|
|
manhattan_box = p_box,
|
|
manhattan_scatter = p_scatter,
|
|
rf_pred_vs_actual_manhattan = p_rf_pred_vs_actual,
|
|
rf_resid_manhattan = p_rf_resid,
|
|
rf_pred_vs_actual_brooklyn = p_brook_pred_vs_actual,
|
|
rf_resid_brooklyn = p_brook_resid
|
|
)
|
|
|
|
dir.create("plots", showWarnings = FALSE)
|
|
|
|
for (nm in names(plots)) {
|
|
ggsave(
|
|
filename = file.path("plots", paste0(nm, ".png")),
|
|
plot = plots[[nm]],
|
|
width = 7,
|
|
height = 5,
|
|
dpi = 300
|
|
)
|
|
}
|