# NYC annualized property sales: clean Manhattan/Brooklyn subsets, explore
# Manhattan prices, fit regression and classification models on Manhattan,
# and evaluate how those models transfer to Brooklyn.
#
# One-time setup, if the packages are not installed:
# install.packages(
#   c("dplyr", "ggplot2", "randomForest", "caret", "nnet", "e1071", "scales"),
#   repos = "https://cloud.r-project.org"
# )
library(dplyr)
library(ggplot2)
library(randomForest)
library(caret)
library(nnet)
library(e1071)
library(scales)

# Load data / basic subsets ----
options(stringsAsFactors = FALSE)
set.seed(42L)

file_path <- "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"

# Columns the analysis actually needs; anything else in the CSV is dropped.
cols_needed <- c(
  "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
  "TAX CLASS AS OF FINAL ROLL", "BLOCK", "LOT",
  "BUILDING CLASS AS OF FINAL ROLL", "ZIP CODE", "RESIDENTIAL UNITS",
  "COMMERCIAL UNITS", "TOTAL UNITS", "LAND SQUARE FEET",
  "GROSS SQUARE FEET", "YEAR BUILT", "TAX CLASS AT TIME OF SALE",
  "BUILDING CLASS AT TIME OF SALE", "SALE PRICE", "SALE DATE"
)

nyc <- read.csv(file_path, stringsAsFactors = FALSE, check.names = FALSE)
nyc <- nyc[, intersect(cols_needed, colnames(nyc))]

# BOROUGH may arrive as text; coerce to numeric codes
# (1 = Manhattan, 3 = Brooklyn).
nyc$BOROUGH <- suppressWarnings(as.numeric(nyc$BOROUGH))

manhattan_raw <- nyc %>% filter(BOROUGH == 1)
brooklyn_raw <- nyc %>% filter(BOROUGH == 3)
cat("raw manhattan rows:", nrow(manhattan_raw), "\n")
cat("raw brooklyn rows:", nrow(brooklyn_raw), "\n")

# Cleaning ----

# Clean one borough's sales records: parse text-encoded numeric columns,
# drop likely non-arms-length sales (price <= min_price), require usable
# size/year fields, and add snake_case working columns for modelling.
clean_borough <- function(df, min_price = 10000) {
  # Numbers come in as character with thousands separators; placeholder
  # strings (including the odd "- 0" seen in the raw file) become NA.
  parse_numeric <- function(x) {
    x <- as.character(x)
    x <- trimws(x)
    x[x %in% c("0", "0.0", "- 0", "", ".", "NA", "NaN")] <- NA
    x <- gsub(",", "", x, fixed = TRUE)
    suppressWarnings(as.numeric(x))
  }

  df$`SALE PRICE` <- parse_numeric(df$`SALE PRICE`)
  df$`LAND SQUARE FEET` <- parse_numeric(df$`LAND SQUARE FEET`)
  df$`GROSS SQUARE FEET` <- parse_numeric(df$`GROSS SQUARE FEET`)
  df$`YEAR BUILT` <- parse_numeric(df$`YEAR BUILT`)

  unit_cols <- c("RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS")
  for (col in unit_cols) {
    if (col %in% names(df)) {
      df[[col]] <- parse_numeric(df[[col]])
    }
  }

  # Drop non-arms-length / tiny sales.
  df <- df %>%
    filter(!is.na(`SALE PRICE`)) %>%
    filter(`SALE PRICE` > min_price)

  # Implausibly old (or placeholder-zero) construction years become missing.
  df$`YEAR BUILT`[df$`YEAR BUILT` < 1800] <- NA

  # Keep only rows with usable size and year information.
  df <- df %>%
    filter(
      !is.na(`GROSS SQUARE FEET`),
      !is.na(`LAND SQUARE FEET`),
      !is.na(`YEAR BUILT`),
      `GROSS SQUARE FEET` > 0,
      `LAND SQUARE FEET` > 0
    )

  # Missing unit counts are treated as zero units.
  for (col in unit_cols) {
    if (col %in% names(df)) {
      df[[col]][is.na(df[[col]])] <- 0
    }
  }

  # snake_case aliases so the modelling code below avoids backticks.
  df %>%
    rename(neighborhood = NEIGHBORHOOD) %>%
    mutate(
      land_sqft = `LAND SQUARE FEET`,
      gross_sqft = `GROSS SQUARE FEET`,
      year_built = `YEAR BUILT`,
      res_units = `RESIDENTIAL UNITS`,
      comm_units = `COMMERCIAL UNITS`,
      total_units = `TOTAL UNITS`,
      sale_price = `SALE PRICE`
    )
}

manhattan <- clean_borough(manhattan_raw)
brooklyn <- clean_borough(brooklyn_raw)
cat("clean manhattan rows:", nrow(manhattan), "\n")
cat("clean brooklyn rows:", nrow(brooklyn), "\n")

# Manhattan exploratory data analysis ----

# Summary statistics for sale price.
summary_manhattan_price <- summary(manhattan$sale_price)
print(summary_manhattan_price)

quantiles_manhattan <- quantile(
  manhattan$sale_price,
  probs = c(0.25, 0.5, 0.75, 0.9, 0.95, 0.99),
  na.rm = TRUE
)
print(quantiles_manhattan)

# IQR-based outlier bounds; elements 1 and 3 of the quantile vector are the
# 25th and 75th percentiles.
q1 <- quantiles_manhattan[1]
q3 <- quantiles_manhattan[3]
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
cat("manhattan iqr upper bound:", upper_bound, "\n")
cat("manhattan max sale price:", max(manhattan$sale_price, na.rm = TRUE), "\n")

outlier_mask <- (manhattan$sale_price < lower_bound) |
  (manhattan$sale_price > upper_bound)
cat("number of sale price outliers:", sum(outlier_mask, na.rm = TRUE), "\n")

# Correlation of sale price with the other numeric variables.
num_cols <- c(
  "sale_price", "gross_sqft", "land_sqft", "year_built",
  "res_units", "comm_units", "total_units"
)
corr_manhattan <- cor(manhattan[, num_cols], use = "complete.obs")
print(corr_manhattan[, "sale_price"])

# Histogram of raw sale prices.
p_hist_raw <- ggplot(manhattan, aes(x = sale_price)) +
  geom_histogram(bins = 50, color = "black", fill = NA) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "manhattan sale price distribution (raw)",
    x = "sale price (usd)",
    y = "count of sales"
  )

# Histogram of log(1 + sale price).
p_hist_log <- ggplot(manhattan, aes(x = log1p(sale_price))) +
  geom_histogram(bins = 50, color = "black", fill = NA) +
  labs(
    title = "manhattan sale price distribution (log scale)",
    x = "log(1 + sale price)",
    y = "count of sales"
  )

# Boxplot of sale price to show the outliers.
p_box <- ggplot(manhattan, aes(y = sale_price)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "manhattan sale price with outliers",
    y = "sale price (usd)",
    x = ""
  )

# Scatter: gross square feet vs sale price (log y-axis).
p_scatter <- ggplot(manhattan, aes(x = gross_sqft, y = sale_price)) +
  geom_point(alpha = 0.3) +
  scale_y_continuous(trans = "log10", labels = comma) +
  labs(
    title = "manhattan sale price vs gross square feet",
    x = "gross square feet",
    y = "sale price (log10 scale)"
  )

print(p_hist_raw)
print(p_hist_log)
print(p_box)
print(p_scatter)

# Regression analysis (Manhattan) ----

reg_vars <- c(
  "land_sqft", "gross_sqft", "year_built",
  "res_units", "comm_units", "total_units"
)

reg_df <- manhattan %>%
  select(all_of(reg_vars), sale_price) %>%
  tidyr::drop_na()
reg_df$log_price <- log1p(reg_df$sale_price)

set.seed(42L)
train_idx_reg <- createDataPartition(reg_df$log_price, p = 0.75, list = FALSE)
train_reg <- reg_df[train_idx_reg, ]
test_reg <- reg_df[-train_idx_reg, ]

# Linear regression on log price.
lm_fit <- lm(
  log_price ~ land_sqft + gross_sqft + year_built + res_units +
    comm_units + total_units,
  data = train_reg
)
lm_pred <- predict(lm_fit, newdata = test_reg)
r2_lm <- cor(test_reg$log_price, lm_pred)^2
rmse_lm <- sqrt(mean((test_reg$log_price - lm_pred)^2))
mae_lm <- mean(abs(test_reg$log_price - lm_pred))
cat("\nlinear model (log price) metrics on manhattan:\n")
cat("r2:", r2_lm, " rmse:", rmse_lm, " mae:", mae_lm, "\n")

# Random forest regression on log price.
set.seed(42L)
rf_fit <- randomForest(
  x = train_reg[, reg_vars],
  y = train_reg$log_price,
  ntree = 200,
  mtry = 3,
  maxnodes = 100,
  importance = TRUE
)
rf_pred <- predict(rf_fit, newdata = test_reg[, reg_vars])
r2_rf <- cor(test_reg$log_price, rf_pred)^2
rmse_rf <- sqrt(mean((test_reg$log_price - rf_pred)^2))
mae_rf <- mean(abs(test_reg$log_price - rf_pred))
cat("\nrandom forest (log price) metrics on manhattan:\n")
cat("r2:", r2_rf, " rmse:", rmse_rf, " mae:", mae_rf, "\n")

# Predicted vs actual plot (Manhattan).
rf_diag_df <- data.frame(
  actual = test_reg$log_price,
  predicted = rf_pred
)
p_rf_pred_vs_actual <- ggplot(rf_diag_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    title = "random forest: predicted vs actual log(1 + sale price) (manhattan)",
    x = "actual log(1 + sale price)",
    y = "predicted log(1 + sale price)"
  )

# Residuals vs predicted plot (Manhattan).
rf_diag_df$residual <- rf_diag_df$actual - rf_diag_df$predicted
p_rf_resid <- ggplot(rf_diag_df, aes(x = predicted, y = residual)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "random forest residuals (manhattan)",
    x = "predicted log(1 + sale price)",
    y = "residual"
  )

print(p_rf_pred_vs_actual)
print(p_rf_resid)

# Apply the Manhattan regression model to Brooklyn ----

brook_reg_df <- brooklyn %>%
  select(all_of(reg_vars), sale_price) %>%
  tidyr::drop_na()
brook_reg_df$log_price <- log1p(brook_reg_df$sale_price)

rf_pred_brook <- predict(rf_fit, newdata = brook_reg_df[, reg_vars])
r2_rf_brook <- cor(brook_reg_df$log_price, rf_pred_brook)^2
rmse_rf_brook <- sqrt(mean((brook_reg_df$log_price - rf_pred_brook)^2))
mae_rf_brook <- mean(abs(brook_reg_df$log_price - rf_pred_brook))
cat("\nrandom forest (log price) metrics on brooklyn (trained on manhattan):\n")
cat("r2:", r2_rf_brook, " rmse:", rmse_rf_brook, " mae:", mae_rf_brook, "\n")

brook_diag_df <- data.frame(
  actual = brook_reg_df$log_price,
  predicted = rf_pred_brook
)
p_brook_pred_vs_actual <- ggplot(brook_diag_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    title = "random forest: manhattan model on brooklyn (log price)",
    x = "actual log(1 + sale price) (brooklyn)",
    y = "predicted log(1 + sale price)"
  )

brook_diag_df$residual <- brook_diag_df$actual - brook_diag_df$predicted
p_brook_resid <- ggplot(brook_diag_df, aes(x = predicted, y = residual)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "residuals on brooklyn using manhattan random forest",
    x = "predicted log(1 + sale price)",
    y = "residual"
  )

print(p_brook_pred_vs_actual)
print(p_brook_resid)

# Classification: predict Manhattan neighborhood from sale features ----

clf_vars <- c(
  "sale_price", "land_sqft", "gross_sqft", "year_built",
  "res_units", "comm_units", "total_units"
)

# Restrict to neighborhoods with at least `min_per_class` sales so every
# class has reasonable support, then keep only complete modelling rows.
prepare_classification_df <- function(df, min_per_class = 100L) {
  df <- df %>% filter(!is.na(neighborhood))
  counts <- table(df$neighborhood)
  keep <- names(counts[counts >= min_per_class])
  df %>%
    filter(neighborhood %in% keep) %>%
    mutate(neighborhood = factor(neighborhood)) %>%
    select(neighborhood, all_of(clf_vars)) %>%
    tidyr::drop_na()
}

manhattan_clf <- prepare_classification_df(manhattan, min_per_class = 100L)
cat("\nmanhattan classification subset rows:", nrow(manhattan_clf), "\n")
cat("manhattan neighborhoods:", nlevels(manhattan_clf$neighborhood), "\n")

set.seed(42L)
train_idx_clf <- createDataPartition(manhattan_clf$neighborhood, p = 0.75, list = FALSE)
train_clf <- manhattan_clf[train_idx_clf, ]
test_clf <- manhattan_clf[-train_idx_clf, ]

# Macro-averaged F1 from a caret confusionMatrix object. For two classes
# $byClass is a named vector; for more it is a matrix with one row per class.
macro_f1_from_cm <- function(cm_obj) {
  byc <- cm_obj$byClass
  if (!is.matrix(byc)) {
    # Binary case: single precision/recall pair.
    precision <- byc["Pos Pred Value"]
    recall <- byc["Sensitivity"]
    return(2 * precision * recall / (precision + recall))
  }
  precision <- byc[, "Pos Pred Value"]
  recall <- byc[, "Sensitivity"]
  f1 <- 2 * precision * recall / (precision + recall)
  mean(f1, na.rm = TRUE)
}

# k-nearest neighbors classifier (fixed k, no resampling).
ctrl_none <- trainControl(method = "none")
set.seed(42L)
knn_fit <- train(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneGrid = data.frame(k = 7),
  trControl = ctrl_none
)
knn_pred <- predict(knn_fit, newdata = test_clf)
cm_knn <- confusionMatrix(knn_pred, test_clf$neighborhood)
macro_f1_knn <- macro_f1_from_cm(cm_knn)
cat(
  "\nknn (manhattan) accuracy:", cm_knn$overall["Accuracy"],
  " macro f1:", macro_f1_knn, "\n"
)

# Random forest classifier.
set.seed(42L)
rf_clf_fit <- randomForest(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  ntree = 300,
  mtry = 3
)
rf_clf_pred <- predict(rf_clf_fit, newdata = test_clf)
cm_rf_clf <- confusionMatrix(rf_clf_pred, test_clf$neighborhood)
macro_f1_rf_clf <- macro_f1_from_cm(cm_rf_clf)
cat(
  "\nrandom forest classifier (manhattan) accuracy:",
  cm_rf_clf$overall["Accuracy"],
  " macro f1:", macro_f1_rf_clf, "\n"
)

# Example contingency table (num neighborhoods x num neighborhoods).
rf_cm_table <- cm_rf_clf$table
print(dim(rf_cm_table))
print(rf_cm_table)

# Multinomial logistic regression.
set.seed(42L)
logit_fit <- multinom(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  MaxNWts = 10000,
  maxit = 2000,
  trace = FALSE
)
logit_pred <- predict(logit_fit, newdata = test_clf)
cm_logit <- confusionMatrix(logit_pred, test_clf$neighborhood)
macro_f1_logit <- macro_f1_from_cm(cm_logit)
cat(
  "\nmultinomial logistic regression (manhattan) accuracy:",
  cm_logit$overall["Accuracy"],
  " macro f1:", macro_f1_logit, "\n"
)

# Apply the Manhattan classifiers to Brooklyn ----

brooklyn_clf <- prepare_classification_df(brooklyn, min_per_class = 100L)
cat("\nbrooklyn classification subset rows:", nrow(brooklyn_clf), "\n")
cat("brooklyn neighborhoods:", nlevels(brooklyn_clf$neighborhood), "\n")

# Predictions from Manhattan-trained models on Brooklyn data.
knn_pred_brook <- predict(knn_fit, newdata = brooklyn_clf)
rf_clf_pred_brook <- predict(rf_clf_fit, newdata = brooklyn_clf)
logit_pred_brook <- predict(logit_fit, newdata = brooklyn_clf)

# Contingency tables (true Brooklyn neighborhood vs predicted Manhattan
# neighborhood). These are essentially all off-diagonal because the label
# sets differ between the two boroughs.
tab_knn_brook <- table(true = brooklyn_clf$neighborhood, pred = knn_pred_brook)
tab_rf_brook <- table(true = brooklyn_clf$neighborhood, pred = rf_clf_pred_brook)
tab_logit_brook <- table(true = brooklyn_clf$neighborhood, pred = logit_pred_brook)

cat("\ncontingency table dimensions (knn):", dim(tab_knn_brook), "\n")
cat("contingency table dimensions (random forest):", dim(tab_rf_brook), "\n")
cat("contingency table dimensions (logit):", dim(tab_logit_brook), "\n")

# Save all figures ----

plots <- list(
  manhattan_hist_raw = p_hist_raw,
  manhattan_hist_log = p_hist_log,
  manhattan_box = p_box,
  manhattan_scatter = p_scatter,
  rf_pred_vs_actual_manhattan = p_rf_pred_vs_actual,
  rf_resid_manhattan = p_rf_resid,
  rf_pred_vs_actual_brooklyn = p_brook_pred_vs_actual,
  rf_resid_brooklyn = p_brook_resid
)

dir.create("plots", showWarnings = FALSE)
for (nm in names(plots)) {
  ggsave(
    filename = file.path("plots", paste0(nm, ".png")),
    plot = plots[[nm]],
    width = 7,
    height = 5,
    dpi = 300
  )
}