This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignment V/toSubmit/assignment.r
T
2025-11-14 15:53:48 -05:00

505 lines
15 KiB
R

# One-time setup: uncomment to install the required packages.
# install.packages(
#   c("dplyr", "ggplot2", "randomForest", "caret", "nnet", "e1071", "scales"),
#   repos = "https://cloud.r-project.org"
# )
library(dplyr)         # data-frame verbs (filter, mutate, ...)
library(ggplot2)       # all plotting below
library(randomForest)  # RF regression and classification
library(caret)         # data partitioning, knn training, confusionMatrix
library(nnet)          # multinom() multinomial logistic regression
library(e1071)         # model back-end dependency used via caret
library(scales)        # comma() axis label formatting

# Load data / basic borough subsets ----
options(stringsAsFactors = FALSE)  # defensive; default since R 4.0
set.seed(42L)                      # reproducibility for all sampling below
file_path <- "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"

# Columns we actually need from the citywide sales extract.
cols_needed <- c(
  "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
  "TAX CLASS AS OF FINAL ROLL", "BLOCK", "LOT",
  "BUILDING CLASS AS OF FINAL ROLL", "ZIP CODE",
  "RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
  "LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT",
  "TAX CLASS AT TIME OF SALE", "BUILDING CLASS AT TIME OF SALE",
  "SALE PRICE", "SALE DATE"
)
nyc <- read.csv(file_path, stringsAsFactors = FALSE, check.names = FALSE)
# Keep only the needed columns that are actually present in the file.
nyc <- nyc[, intersect(cols_needed, colnames(nyc))]
# Force borough code to numeric; non-numeric entries become NA (warnings muted).
nyc$BOROUGH <- suppressWarnings(as.numeric(nyc$BOROUGH))
# Borough codes used below: 1 = Manhattan, 3 = Brooklyn.
manhattan_raw <- nyc %>% filter(BOROUGH == 1)
brooklyn_raw <- nyc %>% filter(BOROUGH == 3)
cat("raw manhattan rows:", nrow(manhattan_raw), "\n")
cat("raw brooklyn rows:", nrow(brooklyn_raw), "\n")
# Clean one borough's raw sales extract.
#
# df        raw rows for a single borough (subset of the citywide file).
# min_price sales at or below this price are treated as non-arms-length
#           transfers (e.g. $0 deed transfers) and dropped.
#
# Returns the rows with usable price / size / year values, with snake_case
# convenience columns (sale_price, gross_sqft, ...) added for modelling.
clean_borough <- function(df, min_price = 10000) {
  # Parse a character/factor column to numeric: trim whitespace, map
  # placeholder strings to NA, strip thousands separators.
  parse_numeric <- function(x) {
    x <- as.character(x)
    x <- trimws(x)
    x[x %in% c("0", "0.0", "- 0", "", ".", "NA", "NaN")] <- NA
    x <- gsub(",", "", x, fixed = TRUE)
    suppressWarnings(as.numeric(x))
  }

  # Coerce the columns that arrive as formatted text.
  df$`SALE PRICE` <- parse_numeric(df$`SALE PRICE`)
  df$`LAND SQUARE FEET` <- parse_numeric(df$`LAND SQUARE FEET`)
  df$`GROSS SQUARE FEET` <- parse_numeric(df$`GROSS SQUARE FEET`)
  df$`YEAR BUILT` <- parse_numeric(df$`YEAR BUILT`)
  unit_cols <- c("RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS")
  for (col in unit_cols) {
    if (col %in% names(df)) {
      df[[col]] <- parse_numeric(df[[col]])
    }
  }

  # Drop non-arms-length / tiny sales.
  df <- df %>%
    filter(!is.na(`SALE PRICE`)) %>%
    filter(`SALE PRICE` > min_price)

  # Implausibly old (or placeholder-zero) build years become missing.
  df$`YEAR BUILT`[df$`YEAR BUILT` < 1800] <- NA

  # Require usable size and year values.
  df <- df %>%
    filter(
      !is.na(`GROSS SQUARE FEET`),
      !is.na(`LAND SQUARE FEET`),
      !is.na(`YEAR BUILT`),
      `GROSS SQUARE FEET` > 0,
      `LAND SQUARE FEET` > 0
    )

  # Missing unit counts are treated as zero units.
  for (col in unit_cols) {
    if (col %in% names(df)) {
      df[[col]][is.na(df[[col]])] <- 0
    }
  }

  # Add snake_case aliases used by the rest of the script.
  df <- df %>%
    rename(
      neighborhood = NEIGHBORHOOD
    ) %>%
    mutate(
      land_sqft = `LAND SQUARE FEET`,
      gross_sqft = `GROSS SQUARE FEET`,
      year_built = `YEAR BUILT`,
      res_units = `RESIDENTIAL UNITS`,
      comm_units = `COMMERCIAL UNITS`,
      total_units = `TOTAL UNITS`,
      sale_price = `SALE PRICE`
    )
  df
}
# Manhattan exploratory data analysis ----
manhattan <- clean_borough(manhattan_raw)
brooklyn <- clean_borough(brooklyn_raw)
cat("clean manhattan rows:", nrow(manhattan), "\n")
cat("clean brooklyn rows:", nrow(brooklyn), "\n")

# Summary stats for sale price.
summary_manhattan_price <- summary(manhattan$sale_price)
print(summary_manhattan_price)
quantiles_manhattan <- quantile(
  manhattan$sale_price,
  probs = c(0.25, 0.5, 0.75, 0.9, 0.95, 0.99),
  na.rm = TRUE
)
print(quantiles_manhattan)

# IQR-based outlier bounds: elements 1 and 3 of quantiles_manhattan are the
# 25th and 75th percentiles (per the probs vector above).
q1 <- quantiles_manhattan[1]
q3 <- quantiles_manhattan[3]
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
cat("manhattan iqr upper bound:", upper_bound, "\n")
cat("manhattan max sale price:", max(manhattan$sale_price, na.rm = TRUE), "\n")
outlier_mask <- (manhattan$sale_price < lower_bound) |
  (manhattan$sale_price > upper_bound)
cat("number of sale price outliers:", sum(outlier_mask, na.rm = TRUE), "\n")

# Correlation of sale price with the other numeric variables.
num_cols <- c(
  "sale_price", "gross_sqft", "land_sqft",
  "year_built", "res_units", "comm_units", "total_units"
)
corr_manhattan <- cor(manhattan[, num_cols], use = "complete.obs")
print(corr_manhattan[, "sale_price"])

# Histogram of raw sale prices (heavily right-skewed).
p_hist_raw <- ggplot(manhattan, aes(x = sale_price)) +
  geom_histogram(bins = 50, color = "black", fill = NA) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "manhattan sale price distribution (raw)",
    x = "sale price (usd)",
    y = "count of sales"
  )
# Histogram of log(1 + sale price).
p_hist_log <- ggplot(manhattan, aes(x = log1p(sale_price))) +
  geom_histogram(bins = 50, color = "black", fill = NA) +
  labs(
    title = "manhattan sale price distribution (log scale)",
    x = "log(1 + sale price)",
    y = "count of sales"
  )
# Boxplot of sale price (shows the outliers).
p_box <- ggplot(manhattan, aes(y = sale_price)) +
  geom_boxplot(outlier.alpha = 0.4) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "manhattan sale price with outliers",
    y = "sale price (usd)",
    x = ""
  )
# Scatter: gross sqft vs sale price (log10 y axis).
p_scatter <- ggplot(manhattan, aes(x = gross_sqft, y = sale_price)) +
  geom_point(alpha = 0.3) +
  scale_y_continuous(trans = "log10", labels = comma) +
  labs(
    title = "manhattan sale price vs gross square feet",
    x = "gross square feet",
    y = "sale price (log10 scale)"
  )
# Render the EDA plots (also saved to disk at the bottom of the script).
print(p_hist_raw)
print(p_hist_log)
print(p_box)
print(p_scatter)
# Regression analysis (Manhattan) ----
reg_vars <- c(
  "land_sqft", "gross_sqft", "year_built",
  "res_units", "comm_units", "total_units"
)
reg_df <- manhattan %>%
  select(all_of(reg_vars), sale_price) %>%
  tidyr::drop_na()
# Model log(1 + price) to tame the heavy right tail of prices.
reg_df$log_price <- log1p(reg_df$sale_price)
set.seed(42L)
train_idx_reg <- createDataPartition(reg_df$log_price, p = 0.75, list = FALSE)
train_reg <- reg_df[train_idx_reg, ]
test_reg <- reg_df[-train_idx_reg, ]

# Baseline: ordinary linear regression on log price.
lm_fit <- lm(
  log_price ~ land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_reg
)
lm_pred <- predict(lm_fit, newdata = test_reg)
# Hold-out metrics, all on the log scale.
r2_lm <- cor(test_reg$log_price, lm_pred)^2
rmse_lm <- sqrt(mean((test_reg$log_price - lm_pred)^2))
mae_lm <- mean(abs(test_reg$log_price - lm_pred))
cat("\nlinear model (log price) metrics on manhattan:\n")
cat("r2:", r2_lm, " rmse:", rmse_lm, " mae:", mae_lm, "\n")

# Random forest regression on log price.
set.seed(42L)
rf_fit <- randomForest(
  x = train_reg[, reg_vars],
  y = train_reg$log_price,
  ntree = 200,
  mtry = 3,          # features tried per split
  maxnodes = 100,    # cap tree size to limit overfitting
  importance = TRUE
)
rf_pred <- predict(rf_fit, newdata = test_reg[, reg_vars])
r2_rf <- cor(test_reg$log_price, rf_pred)^2
rmse_rf <- sqrt(mean((test_reg$log_price - rf_pred)^2))
mae_rf <- mean(abs(test_reg$log_price - rf_pred))
cat("\nrandom forest (log price) metrics on manhattan:\n")
cat("r2:", r2_rf, " rmse:", rmse_rf, " mae:", mae_rf, "\n")

# Predicted vs actual diagnostic plot (Manhattan hold-out).
rf_diag_df <- data.frame(
  actual = test_reg$log_price,
  predicted = rf_pred
)
p_rf_pred_vs_actual <- ggplot(rf_diag_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    title = "random forest: predicted vs actual log(1 + sale price) (manhattan)",
    x = "actual log(1 + sale price)",
    y = "predicted log(1 + sale price)"
  )
# Residuals vs predicted diagnostic plot (Manhattan hold-out).
rf_diag_df$residual <- rf_diag_df$actual - rf_diag_df$predicted
p_rf_resid <- ggplot(rf_diag_df, aes(x = predicted, y = residual)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "random forest residuals (manhattan)",
    x = "predicted log(1 + sale price)",
    y = "residual"
  )
print(p_rf_pred_vs_actual)
print(p_rf_resid)
# Apply the Manhattan-trained regression model to Brooklyn ----
# (transfer test: same features, different borough — expect degraded metrics).
brook_reg_df <- brooklyn %>%
  select(all_of(reg_vars), sale_price) %>%
  tidyr::drop_na()
brook_reg_df$log_price <- log1p(brook_reg_df$sale_price)
rf_pred_brook <- predict(rf_fit, newdata = brook_reg_df[, reg_vars])
r2_rf_brook <- cor(brook_reg_df$log_price, rf_pred_brook)^2
rmse_rf_brook <- sqrt(mean((brook_reg_df$log_price - rf_pred_brook)^2))
mae_rf_brook <- mean(abs(brook_reg_df$log_price - rf_pred_brook))
cat("\nrandom forest (log price) metrics on brooklyn (trained on manhattan):\n")
cat("r2:", r2_rf_brook, " rmse:", rmse_rf_brook, " mae:", mae_rf_brook, "\n")

# Predicted vs actual for the cross-borough predictions.
brook_diag_df <- data.frame(
  actual = brook_reg_df$log_price,
  predicted = rf_pred_brook
)
p_brook_pred_vs_actual <- ggplot(brook_diag_df, aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    title = "random forest: manhattan model on brooklyn (log price)",
    x = "actual log(1 + sale price) (brooklyn)",
    y = "predicted log(1 + sale price)"
  )
# Residuals for the cross-borough predictions.
brook_diag_df$residual <- brook_diag_df$actual - brook_diag_df$predicted
p_brook_resid <- ggplot(brook_diag_df, aes(x = predicted, y = residual)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "residuals on brooklyn using manhattan random forest",
    x = "predicted log(1 + sale price)",
    y = "residual"
  )
print(p_brook_pred_vs_actual)
print(p_brook_resid)
# Classification: predict Manhattan neighborhood from sale features ----
clf_vars <- c(
  "sale_price", "land_sqft", "gross_sqft",
  "year_built", "res_units", "comm_units", "total_units"
)

# Build a modelling frame for neighborhood classification.
#
# df            cleaned borough data (output of clean_borough()).
# min_per_class neighborhoods with fewer sales than this are dropped so
#               every remaining class has enough rows to learn from.
# vars          feature columns to keep (defaults to clf_vars; was an
#               implicit global before — now an explicit parameter).
#
# Returns a data frame with a `neighborhood` factor column plus the
# feature columns, complete cases only.
prepare_classification_df <- function(df, min_per_class = 100L,
                                      vars = clf_vars) {
  df <- df %>%
    filter(!is.na(neighborhood))
  # Keep only neighborhoods with at least min_per_class usable sales.
  counts <- table(df$neighborhood)
  keep <- names(counts[counts >= min_per_class])
  df %>%
    filter(neighborhood %in% keep) %>%
    mutate(neighborhood = factor(neighborhood)) %>%
    select(neighborhood, all_of(vars)) %>%
    tidyr::drop_na()
}
manhattan_clf <- prepare_classification_df(manhattan, min_per_class = 100L)
cat("\nmanhattan classification subset rows:", nrow(manhattan_clf), "\n")
cat("manhattan neighborhoods:", nlevels(manhattan_clf$neighborhood), "\n")
# Stratified 75/25 split so every neighborhood appears in both partitions.
set.seed(42L)
train_idx_clf <- createDataPartition(manhattan_clf$neighborhood, p = 0.75, list = FALSE)
train_clf <- manhattan_clf[train_idx_clf, ]
test_clf <- manhattan_clf[-train_idx_clf, ]
# Macro-averaged F1 score from a caret::confusionMatrix object.
#
# cm_obj  confusionMatrix result; its $byClass is a named metric vector
#         for two-class problems and a class-by-metric matrix otherwise.
#
# Returns the F1 score (two-class case) or the mean of per-class F1
# values, ignoring classes whose F1 is undefined (NA).
macro_f1_from_cm <- function(cm_obj) {
  stats <- cm_obj$byClass
  f1_of <- function(prec, rec) 2 * prec * rec / (prec + rec)
  if (is.matrix(stats)) {
    # Multi-class: one row of metrics per class; macro-average the F1s.
    per_class_f1 <- f1_of(stats[, "Pos Pred Value"], stats[, "Sensitivity"])
    mean(per_class_f1, na.rm = TRUE)
  } else {
    # Two-class: byClass is a plain named vector of metrics.
    f1_of(stats["Pos Pred Value"], stats["Sensitivity"])
  }
}
# k-nearest-neighbours classifier ----
# method = "none" means a single fit at the fixed k below (no resampling/tuning).
ctrl_none <- trainControl(method = "none")
set.seed(42L)
knn_fit <- train(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  method = "knn",
  preProcess = c("center", "scale"),  # knn is distance-based: standardize features
  tuneGrid = data.frame(k = 7),
  trControl = ctrl_none
)
knn_pred <- predict(knn_fit, newdata = test_clf)
cm_knn <- confusionMatrix(knn_pred, test_clf$neighborhood)
macro_f1_knn <- macro_f1_from_cm(cm_knn)
cat(
  "\nknn (manhattan) accuracy:", cm_knn$overall["Accuracy"],
  " macro f1:", macro_f1_knn, "\n"
)
# Random forest classifier ----
set.seed(42L)
rf_clf_fit <- randomForest(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  ntree = 300,
  mtry = 3
)
rf_clf_pred <- predict(rf_clf_fit, newdata = test_clf)
cm_rf_clf <- confusionMatrix(rf_clf_pred, test_clf$neighborhood)
macro_f1_rf_clf <- macro_f1_from_cm(cm_rf_clf)
cat(
  "\nrandom forest classifier (manhattan) accuracy:",
  cm_rf_clf$overall["Accuracy"],
  " macro f1:", macro_f1_rf_clf, "\n"
)
# Example contingency table: the confusion matrix itself
# (num neighborhoods x num neighborhoods).
rf_cm_table <- cm_rf_clf$table
print(dim(rf_cm_table))
print(rf_cm_table)
# Multinomial logistic regression ----
set.seed(42L)
logit_fit <- multinom(
  neighborhood ~ sale_price + land_sqft + gross_sqft + year_built +
    res_units + comm_units + total_units,
  data = train_clf,
  MaxNWts = 10000,   # raise weight limit (many classes => many coefficients)
  maxit = 2000,      # allow extra iterations for convergence
  trace = FALSE      # suppress per-iteration output
)
logit_pred <- predict(logit_fit, newdata = test_clf)
cm_logit <- confusionMatrix(logit_pred, test_clf$neighborhood)
macro_f1_logit <- macro_f1_from_cm(cm_logit)
cat(
  "\nmultinomial logistic regression (manhattan) accuracy:",
  cm_logit$overall["Accuracy"],
  " macro f1:", macro_f1_logit, "\n"
)
# Use the Manhattan-trained classifiers on Brooklyn ----
brooklyn_clf <- prepare_classification_df(brooklyn, min_per_class = 100L)
cat("\nbrooklyn classification subset rows:", nrow(brooklyn_clf), "\n")
cat("brooklyn neighborhoods:", nlevels(brooklyn_clf$neighborhood), "\n")
# Predictions from Manhattan-trained models on Brooklyn data.
knn_pred_brook <- predict(knn_fit, newdata = brooklyn_clf)
rf_pred_brook <- predict(rf_clf_fit, newdata = brooklyn_clf)
logit_pred_brook <- predict(logit_fit, newdata = brooklyn_clf)
# Contingency tables (true Brooklyn neighborhood vs predicted Manhattan
# neighborhood). These are necessarily all off-diagonal because the two
# boroughs have disjoint label sets; the tables only show which Manhattan
# neighborhoods each Brooklyn neighborhood "looks like".
tab_knn_brook <- table(true = brooklyn_clf$neighborhood, pred = knn_pred_brook)
tab_rf_brook <- table(true = brooklyn_clf$neighborhood, pred = rf_pred_brook)
tab_logit_brook <- table(true = brooklyn_clf$neighborhood, pred = logit_pred_brook)
cat("\ncontingency table dimensions (knn):", dim(tab_knn_brook), "\n")
cat("contingency table dimensions (random forest):", dim(tab_rf_brook), "\n")
cat("contingency table dimensions (logit):", dim(tab_logit_brook), "\n")
# Collect every figure under a stable name; names become the output
# file names in the export loop below.
plots <- list(
  manhattan_hist_raw = p_hist_raw,
  manhattan_hist_log = p_hist_log,
  manhattan_box = p_box,
  manhattan_scatter = p_scatter,
  rf_pred_vs_actual_manhattan = p_rf_pred_vs_actual,
  rf_resid_manhattan = p_rf_resid,
  rf_pred_vs_actual_brooklyn = p_brook_pred_vs_actual,
  rf_resid_brooklyn = p_brook_resid
)
# Export every figure to plots/<name>.png at a consistent size.
dir.create("plots", showWarnings = FALSE)
invisible(Map(
  function(plot_name, plot_obj) {
    ggsave(
      filename = file.path("plots", paste0(plot_name, ".png")),
      plot = plot_obj,
      width = 7,
      height = 5,
      dpi = 300
    )
  },
  names(plots),
  plots
))