This repository has been archived on 2026-05-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Data-Analytics/Assignment V/toSubmit/assignment.py
T
2025-11-14 15:53:48 -05:00

554 lines
16 KiB
Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
r2_score,
mean_squared_error,
mean_absolute_error,
accuracy_score,
precision_recall_fscore_support,
confusion_matrix,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# reproducibility: one seed for numpy randomness and all sklearn splits below
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# NYC Dept. of Finance annualized sales extract (all boroughs, one CSV)
file_path = "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"
# only the columns the analysis actually touches
cols_needed = [
    "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
    "TAX CLASS AS OF FINAL ROLL", "BLOCK", "LOT",
    "BUILDING CLASS AS OF FINAL ROLL", "ZIP CODE",
    "RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
    "LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT",
    "TAX CLASS AT TIME OF SALE", "BUILDING CLASS AT TIME OF SALE",
    "SALE PRICE", "SALE DATE",
]
# loading...
# everything is read as raw strings; numeric parsing is deferred to clean_borough
nyc = pd.read_csv(file_path, dtype=str, low_memory=False)
# keep only the needed columns that exist in this particular extract
cols_present = [c for c in cols_needed if c in nyc.columns]
nyc = nyc[cols_present]
# force borough numeric
nyc["BOROUGH"] = pd.to_numeric(nyc["BOROUGH"], errors="coerce")
# borough codes: 1 is used here as manhattan, 3 as brooklyn
manhattan_raw = nyc[nyc["BOROUGH"] == 1].copy()
brooklyn_raw = nyc[nyc["BOROUGH"] == 3].copy()
print(f"raw manhattan rows: {len(manhattan_raw)}")
print(f"raw brooklyn rows: {len(brooklyn_raw)}")
def clean_borough(df: pd.DataFrame, min_price: float = 10000.0) -> pd.DataFrame:
    """Standardize one borough's raw sales records.

    Parses the string-typed price/size/unit columns to numbers, drops
    non-arms-length sales (price at or below ``min_price``), requires
    usable square-footage and build-year fields, and appends snake_case
    alias columns used by the modelling code.

    Parameters
    ----------
    df : pd.DataFrame
        Raw borough slice with string-typed columns.
    min_price : float
        Sales priced at or below this value are treated as
        non-market transfers and removed.

    Returns
    -------
    pd.DataFrame
        A cleaned copy of ``df`` with alias columns appended.
    """
    out = df.copy()

    # placeholder tokens that mean "missing" in the source extract
    junk_tokens = {
        "0": np.nan,
        "0.0": np.nan,
        "- 0": np.nan,
        "": np.nan,
        ".": np.nan,
        "NA": np.nan,
        "NaN": np.nan,
    }

    def _to_number(col: pd.Series) -> pd.Series:
        """Coerce a character column (commas, junk placeholders) to float."""
        cleaned = col.astype(str).str.strip().replace(junk_tokens)
        cleaned = cleaned.str.replace(",", "", regex=False)
        return pd.to_numeric(cleaned, errors="coerce")

    unit_cols = ["RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS"]
    for col in ["SALE PRICE", "LAND SQUARE FEET", "GROSS SQUARE FEET",
                "YEAR BUILT", *unit_cols]:
        if col in out.columns:
            out[col] = _to_number(out[col])

    # drop non-arms-length / tiny sales
    if "SALE PRICE" in out.columns:
        out = out[out["SALE PRICE"].notna() & (out["SALE PRICE"] > min_price)]

    # implausibly old (or placeholder-zero) build years become missing
    if "YEAR BUILT" in out.columns:
        out.loc[out["YEAR BUILT"] < 1800, "YEAR BUILT"] = np.nan

    # rows must carry a usable size and build year
    required = ["GROSS SQUARE FEET", "LAND SQUARE FEET", "YEAR BUILT"]
    if all(c in out.columns for c in required):
        usable = (
            out["GROSS SQUARE FEET"].notna()
            & out["LAND SQUARE FEET"].notna()
            & out["YEAR BUILT"].notna()
            & (out["GROSS SQUARE FEET"] > 0)
            & (out["LAND SQUARE FEET"] > 0)
        )
        out = out[usable]

    # missing unit counts are treated as zero units
    for col in unit_cols:
        if col in out.columns:
            out[col] = out[col].fillna(0)

    if "NEIGHBORHOOD" in out.columns:
        out = out.rename(columns={"NEIGHBORHOOD": "neighborhood"})

    # snake_case aliases; unit aliases default to 0 when the column is absent
    alias_specs = [
        ("LAND SQUARE FEET", "land_sqft", None),
        ("GROSS SQUARE FEET", "gross_sqft", None),
        ("YEAR BUILT", "year_built", None),
        ("RESIDENTIAL UNITS", "res_units", 0),
        ("COMMERCIAL UNITS", "comm_units", 0),
        ("TOTAL UNITS", "total_units", 0),
        ("SALE PRICE", "sale_price", None),
    ]
    for src, dst, default in alias_specs:
        if src in out.columns:
            out[dst] = out[src]
        elif default is not None:
            out[dst] = default
    return out
manhattan = clean_borough(manhattan_raw)
brooklyn = clean_borough(brooklyn_raw)
print(f"clean manhattan rows: {len(manhattan)}")
print(f"clean brooklyn rows: {len(brooklyn)}")
# manhattan exploratory data analysis
# summary stats for sale price
summary_manhattan_price = manhattan["sale_price"].describe()
print("summary of manhattan sale_price:")
print(summary_manhattan_price)
# upper-tail quantiles show how heavy the price distribution's right tail is
quantiles_manhattan = manhattan["sale_price"].quantile(
    [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
)
print("selected quantiles for manhattan sale_price:")
print(quantiles_manhattan)
# tukey fences: values beyond 1.5 * IQR from the quartiles count as outliers
q1 = quantiles_manhattan.loc[0.25]
q3 = quantiles_manhattan.loc[0.75]
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
print(f"manhattan iqr upper bound: {upper_bound}")
print(
    f"manhattan max sale price: {manhattan['sale_price'].max(skipna=True)}"
)
outlier_mask = (manhattan["sale_price"] < lower_bound) | (
    manhattan["sale_price"] > upper_bound
)
print(f"number of sale price outliers: {outlier_mask.sum()}")
# correlation with other numeric vars
num_cols = [
    "sale_price", "gross_sqft", "land_sqft",
    "year_built", "res_units", "comm_units", "total_units",
]
corr_manhattan = manhattan[num_cols].corr()
print("correlation with sale_price:")
print(corr_manhattan["sale_price"])
# formatter for comma-separated tick labels
def comma_format(x, pos):
    """Render a matplotlib tick value as a comma-grouped integer.

    Parameters
    ----------
    x : float
        Tick value supplied by matplotlib.
    pos : int
        Tick position (unused, but required by the FuncFormatter protocol).

    Returns
    -------
    str
        e.g. 1234567.0 -> "1,234,567"; values that cannot be cast to int
        (nan, inf, non-numeric) fall back to their plain string form.
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # int() raises ValueError for nan, OverflowError for +/-inf and
        # TypeError for non-numeric input; a bare Exception would also
        # mask genuine bugs, so catch only these.
        return str(x)
# histogram of raw sale prices
# dollar-scale view: the heavy right tail dominates the axis
fig, ax = plt.subplots()
ax.hist(manhattan["sale_price"], bins=50)
ax.set_title("manhattan sale price distribution (raw)")
ax.set_xlabel("sale price (usd)")
ax.set_ylabel("count of sales")
ax.xaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()
# histogram of log(1 + sale price)
# log1p compresses the tail so the bulk of the distribution is visible
fig, ax = plt.subplots()
ax.hist(np.log1p(manhattan["sale_price"]), bins=50)
ax.set_title("manhattan sale price distribution (log scale)")
ax.set_xlabel("log(1 + sale price)")
ax.set_ylabel("count of sales")
plt.tight_layout()
plt.show()
# boxplot of sale price (for outliers)
fig, ax = plt.subplots()
ax.boxplot(manhattan["sale_price"].values, vert=True)
ax.set_title("manhattan sale price with outliers")
ax.set_ylabel("sale price (usd)")
ax.set_xticks([])  # single box: x ticks carry no information
ax.yaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()
# scatter: gross sqft vs sale price (log y)
fig, ax = plt.subplots()
ax.scatter(manhattan["gross_sqft"], manhattan["sale_price"], alpha=0.3)
ax.set_yscale("log")
ax.set_title("manhattan sale price vs gross square feet")
ax.set_xlabel("gross square feet")
ax.set_ylabel("sale price (log10-ish scale)")
plt.tight_layout()
plt.show()
# manhattan regression analysis
# predictors: size, age and unit counts; target: log(1 + sale price)
reg_vars = [
    "land_sqft", "gross_sqft", "year_built",
    "res_units", "comm_units", "total_units",
]
reg_df = manhattan[reg_vars + ["sale_price"]].dropna()
# model the log of price: stabilises the heavy-tailed dollar target
reg_df["log_price"] = np.log1p(reg_df["sale_price"])
X = reg_df[reg_vars]
y = reg_df["log_price"]
# 75/25 hold-out split, seeded for reproducibility
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)
# linear regression on log_price
lm = LinearRegression()
lm.fit(X_train_reg, y_train_reg)
lm_pred = lm.predict(X_test_reg)
r2_lm = r2_score(y_test_reg, lm_pred)
rmse_lm = np.sqrt(mean_squared_error(y_test_reg, lm_pred))
mae_lm = mean_absolute_error(y_test_reg, lm_pred)
print("\nlinear model (log price) metrics on manhattan:")
print(f"r2: {r2_lm:.4f} rmse: {rmse_lm:.4f} mae: {mae_lm:.4f}")
# random forest regression on log_price
# max_leaf_nodes caps tree size (regularisation); max_features=3 decorrelates trees
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_features=3,
    max_leaf_nodes=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_reg.fit(X_train_reg, y_train_reg)
rf_pred = rf_reg.predict(X_test_reg)
r2_rf = r2_score(y_test_reg, rf_pred)
rmse_rf = np.sqrt(mean_squared_error(y_test_reg, rf_pred))
mae_rf = mean_absolute_error(y_test_reg, rf_pred)
print("\nrandom forest (log price) metrics on manhattan:")
print(f"r2: {r2_rf:.4f} rmse: {rmse_rf:.4f} mae: {mae_rf:.4f}")
# manhattan predicted vs actual plot
# pair each test-set actual with the forest's prediction for diagnostics
rf_diag_df = pd.DataFrame(
    {
        "actual": y_test_reg.values,
        "predicted": rf_pred,
    }
)
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["actual"], rf_diag_df["predicted"], alpha=0.3)
# 45-degree reference line: points on it are perfectly predicted
min_val = min(rf_diag_df["actual"].min(), rf_diag_df["predicted"].min())
max_val = max(rf_diag_df["actual"].max(), rf_diag_df["predicted"].max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--")
ax.set_title(
    "random forest: predicted vs actual log(1 + sale price) (manhattan)"
)
ax.set_xlabel("actual log(1 + sale price)")
ax.set_ylabel("predicted log(1 + sale price)")
plt.tight_layout()
plt.show()
# manhattan residuals vs predicted plot
# residual = actual - predicted; structure here would indicate model bias
rf_diag_df["residual"] = (
    rf_diag_df["actual"] - rf_diag_df["predicted"]
)
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["predicted"], rf_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set_title("random forest residuals (manhattan)")
ax.set_xlabel("predicted log(1 + sale price)")
ax.set_ylabel("residual")
plt.tight_layout()
plt.show()
# use manhattan regression model on brooklyn
# transfer test: score the manhattan-trained forest on brooklyn without refitting
brook_reg_df = brooklyn[reg_vars + ["sale_price"]].dropna()
brook_reg_df["log_price"] = np.log1p(brook_reg_df["sale_price"])
X_brook = brook_reg_df[reg_vars]
y_brook = brook_reg_df["log_price"]
rf_pred_brook = rf_reg.predict(X_brook)
r2_rf_brook = r2_score(y_brook, rf_pred_brook)
rmse_rf_brook = np.sqrt(mean_squared_error(y_brook, rf_pred_brook))
mae_rf_brook = mean_absolute_error(y_brook, rf_pred_brook)
print(
    "\nrandom forest (log price) metrics on brooklyn "
    "(trained on manhattan):"
)
print(
    f"r2: {r2_rf_brook:.4f} rmse: {rmse_rf_brook:.4f} "
    f"mae: {mae_rf_brook:.4f}"
)
brook_diag_df = pd.DataFrame(
    {
        "actual": y_brook.values,
        "predicted": rf_pred_brook,
    }
)
brook_diag_df["residual"] = (
    brook_diag_df["actual"] - brook_diag_df["predicted"]
)
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["actual"], brook_diag_df["predicted"], alpha=0.3)
# 45-degree reference line for perfect prediction
min_val = min(brook_diag_df["actual"].min(), brook_diag_df["predicted"].min())
max_val = max(brook_diag_df["actual"].max(), brook_diag_df["predicted"].max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--")
ax.set_title(
    "random forest: manhattan model on brooklyn (log price)"
)
ax.set_xlabel("actual log(1 + sale price) (brooklyn)")
ax.set_ylabel("predicted log(1 + sale price)")
plt.tight_layout()
plt.show()
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["predicted"], brook_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set_title(
    "residuals on brooklyn using manhattan random forest"
)
ax.set_xlabel("predicted log(1 + sale price)")
ax.set_ylabel("residual")
plt.tight_layout()
plt.show()
# classification: manhattan predict neighborhood
# feature set for the neighborhood classifiers (price is a feature here)
clf_vars = [
    "sale_price", "land_sqft", "gross_sqft",
    "year_built", "res_units", "comm_units", "total_units",
]
def prepare_classification_df(
    df: pd.DataFrame, min_per_class: int = 100, feature_cols=None
) -> pd.DataFrame:
    """Build a (neighborhood, features) table for classification.

    Drops rows without a neighborhood label, discards neighborhoods with
    fewer than ``min_per_class`` sales (too rare to learn), then keeps
    only complete rows over the selected feature columns.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned borough dataframe (must contain a ``neighborhood`` column).
    min_per_class : int
        Minimum number of sales a neighborhood needs to be kept.
    feature_cols : sequence of str, optional
        Feature columns to keep; defaults to the module-level ``clf_vars``
        (parameterized so the function is reusable/testable on other
        feature sets).

    Returns
    -------
    pd.DataFrame
        Columns ``["neighborhood"] + feature_cols`` with no missing
        values; ``neighborhood`` cast to a pandas categorical.

    Raises
    ------
    ValueError
        If ``df`` has no ``neighborhood`` column.
    """
    if feature_cols is None:
        feature_cols = clf_vars
    tmp = df.copy()
    if "neighborhood" not in tmp.columns:
        raise ValueError("neighborhood column missing")
    tmp = tmp[tmp["neighborhood"].notna()]
    # keep only neighborhoods with enough examples to train on
    counts = tmp["neighborhood"].value_counts()
    keep = counts[counts >= min_per_class].index
    tmp = tmp[tmp["neighborhood"].isin(keep)]
    cols = ["neighborhood"] + list(feature_cols)
    tmp = tmp[cols].dropna()
    tmp["neighborhood"] = tmp["neighborhood"].astype("category")
    return tmp
manhattan_clf = prepare_classification_df(
    manhattan, min_per_class=100
)
print(
    f"\nmanhattan classification subset rows: {len(manhattan_clf)}"
)
print(
    "manhattan neighborhoods: "
    f"{manhattan_clf['neighborhood'].nunique()}"
)
X_clf = manhattan_clf[clf_vars]
y_clf = manhattan_clf["neighborhood"]
# stratified split keeps each neighborhood's share equal in train and test
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    test_size=0.25,
    stratify=y_clf,
    random_state=RANDOM_STATE,
)
def macro_f1_score(y_true, y_pred) -> float:
    """Unweighted mean of per-class f1 (mirrors caret's byClass averaging).

    zero_division=0 scores classes that are never predicted as 0
    instead of raising a warning/error.
    """
    scores = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    # scores is (precision, recall, f1, support); f1 is the third entry
    return float(scores[2])
# k-nn classifier with scaling
# k-nn is distance-based, so features are standardised inside the pipeline
knn_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=7)),
    ]
)
knn_pipeline.fit(X_train_clf, y_train_clf)
knn_pred = knn_pipeline.predict(X_test_clf)
acc_knn = accuracy_score(y_test_clf, knn_pred)
macro_f1_knn = macro_f1_score(y_test_clf, knn_pred)
print(
    f"\nknn (manhattan) accuracy: {acc_knn:.4f} "
    f"macro f1: {macro_f1_knn:.4f}"
)
# random forest classifier
# trees are scale-invariant, so no scaling pipeline is needed here
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_features=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_clf.fit(X_train_clf, y_train_clf)
rf_clf_pred = rf_clf.predict(X_test_clf)
acc_rf_clf = accuracy_score(y_test_clf, rf_clf_pred)
macro_f1_rf_clf = macro_f1_score(y_test_clf, rf_clf_pred)
print(
    "\nrandom forest classifier (manhattan) accuracy: "
    f"{acc_rf_clf:.4f} macro f1: {macro_f1_rf_clf:.4f}"
)
# contingency table (rf example)
# fixed sorted label order makes the matrix rows/cols reproducible
labels = sorted(y_clf.unique())
rf_cm_table = confusion_matrix(
    y_test_clf, rf_clf_pred, labels=labels
)
print("rf confusion matrix shape:", rf_cm_table.shape)
print("rf confusion matrix (rows=true, cols=pred):")
print(rf_cm_table)
# multinomial logistic regression
# NOTE(review): the `multi_class="multinomial"` argument was deprecated in
# scikit-learn 1.5 and is slated for removal; with solver="lbfgs" the
# multinomial (softmax) formulation is already the default, so omitting it
# preserves behaviour while avoiding the FutureWarning / future TypeError.
logit_clf = LogisticRegression(
    max_iter=2000,  # features are unscaled dollars/sqft; lbfgs needs extra iterations
    solver="lbfgs",
    n_jobs=-1,
)
logit_clf.fit(X_train_clf, y_train_clf)
logit_pred = logit_clf.predict(X_test_clf)
acc_logit = accuracy_score(y_test_clf, logit_pred)
macro_f1_logit = macro_f1_score(y_test_clf, logit_pred)
print(
    "\nmultinomial logistic regression (manhattan) accuracy: "
    f"{acc_logit:.4f} macro f1: {macro_f1_logit:.4f}"
)
# use manhattan classifiers on brooklyn
# note: brooklyn has different neighborhood labels, so accuracy is not
# meaningful here -- the cross-tabs below show where each model maps them
brooklyn_clf = prepare_classification_df(
    brooklyn, min_per_class=100
)
print(
    f"\nbrooklyn classification subset rows: {len(brooklyn_clf)}"
)
print(
    "brooklyn neighborhoods: "
    f"{brooklyn_clf['neighborhood'].nunique()}"
)
X_brook_clf = brooklyn_clf[clf_vars]
y_brook_clf = brooklyn_clf["neighborhood"]
# predictions from manhattan-trained models on brooklyn data
knn_pred_brook = knn_pipeline.predict(X_brook_clf)
rf_pred_brook_clf = rf_clf.predict(X_brook_clf)
logit_pred_brook = logit_clf.predict(X_brook_clf)
# contingency tables (true brooklyn neigh vs predicted manhattan neigh)
tab_knn_brook = pd.crosstab(
    y_brook_clf, knn_pred_brook,
    rownames=["true"], colnames=["pred"]
)
tab_rf_brook = pd.crosstab(
    y_brook_clf, rf_pred_brook_clf,
    rownames=["true"], colnames=["pred"]
)
tab_logit_brook = pd.crosstab(
    y_brook_clf, logit_pred_brook,
    rownames=["true"], colnames=["pred"]
)
print("\ncontingency table dimensions (knn):", tab_knn_brook.shape)
print(tab_knn_brook)
print(
    "contingency table dimensions (random forest):",
    tab_rf_brook.shape,
)
print(tab_rf_brook)
print("contingency table dimensions (logit):", tab_logit_brook.shape)
print(tab_logit_brook)