# NYC property sales analysis: clean Manhattan/Brooklyn sales records, fit
# regression and classification models on Manhattan, then evaluate how well
# the Manhattan-trained models transfer to Brooklyn.
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
from matplotlib.ticker import FuncFormatter
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import (
|
|
r2_score,
|
|
mean_squared_error,
|
|
mean_absolute_error,
|
|
accuracy_score,
|
|
precision_recall_fscore_support,
|
|
confusion_matrix,
|
|
)
|
|
from sklearn.linear_model import LinearRegression, LogisticRegression
|
|
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
# Reproducibility knob shared by every model below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# NYC Department of Finance annualized calendar sales extract.
file_path = "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"

cols_needed = [
    "BOROUGH", "NEIGHBORHOOD", "BUILDING CLASS CATEGORY",
    "TAX CLASS AS OF FINAL ROLL", "BLOCK", "LOT",
    "BUILDING CLASS AS OF FINAL ROLL", "ZIP CODE",
    "RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS",
    "LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT",
    "TAX CLASS AT TIME OF SALE", "BUILDING CLASS AT TIME OF SALE",
    "SALE PRICE", "SALE DATE",
]

# Read everything as text first; numeric coercion happens in clean_borough().
nyc = pd.read_csv(file_path, dtype=str, low_memory=False)
cols_present = [c for c in cols_needed if c in nyc.columns]
nyc = nyc[cols_present]

# BOROUGH arrives as a string column; coerce so the numeric codes compare.
nyc["BOROUGH"] = pd.to_numeric(nyc["BOROUGH"], errors="coerce")

# Borough codes in this dataset: 1 = Manhattan, 3 = Brooklyn.
manhattan_raw = nyc[nyc["BOROUGH"] == 1].copy()
brooklyn_raw = nyc[nyc["BOROUGH"] == 3].copy()

print(f"raw manhattan rows: {len(manhattan_raw)}")
print(f"raw brooklyn rows: {len(brooklyn_raw)}")
|
|
|
def clean_borough(df: pd.DataFrame, min_price: float = 10000.0) -> pd.DataFrame:
    """Clean a borough-level sales frame (mirrors the original R routine).

    Parses the text columns to numbers, drops non-arms-length sales at or
    below ``min_price``, requires usable size/year fields, zero-fills the
    unit counts, and adds snake_case alias columns for the modeling code.
    """
    out = df.copy()

    # Placeholder strings that should read as "missing" before numeric parse.
    junk_values = {
        "0": np.nan,
        "0.0": np.nan,
        "- 0": np.nan,
        "": np.nan,
        ".": np.nan,
        "NA": np.nan,
        "NaN": np.nan,
    }

    def _to_number(series: pd.Series) -> pd.Series:
        """Coerce a text column (commas, junk tokens) to float, NaN on failure."""
        cleaned = series.astype(str).str.strip().replace(junk_values)
        cleaned = cleaned.str.replace(",", "", regex=False)
        return pd.to_numeric(cleaned, errors="coerce")

    unit_cols = ["RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS"]
    numeric_cols = [
        "SALE PRICE", "LAND SQUARE FEET", "GROSS SQUARE FEET", "YEAR BUILT",
    ] + unit_cols
    for col in numeric_cols:
        if col in out.columns:
            out[col] = _to_number(out[col])

    # Drop non-arms-length / tiny sales.
    if "SALE PRICE" in out.columns:
        out = out[out["SALE PRICE"].notna() & (out["SALE PRICE"] > min_price)]

    # Implausibly old construction years become missing.
    if "YEAR BUILT" in out.columns:
        out.loc[out["YEAR BUILT"] < 1800, "YEAR BUILT"] = np.nan

    # Require a usable size and year on every remaining row.
    size_cols = ["GROSS SQUARE FEET", "LAND SQUARE FEET", "YEAR BUILT"]
    if all(c in out.columns for c in size_cols):
        usable = (
            out["GROSS SQUARE FEET"].notna()
            & out["LAND SQUARE FEET"].notna()
            & out["YEAR BUILT"].notna()
            & (out["GROSS SQUARE FEET"] > 0)
            & (out["LAND SQUARE FEET"] > 0)
        )
        out = out[usable]

    # Treat missing unit counts as zero units.
    for col in unit_cols:
        if col in out.columns:
            out[col] = out[col].fillna(0)

    if "NEIGHBORHOOD" in out.columns:
        out = out.rename(columns={"NEIGHBORHOOD": "neighborhood"})

    # snake_case aliases used by the modeling code downstream.
    for alias, source in (
        ("land_sqft", "LAND SQUARE FEET"),
        ("gross_sqft", "GROSS SQUARE FEET"),
        ("year_built", "YEAR BUILT"),
    ):
        if source in out.columns:
            out[alias] = out[source]
    for alias, source in (
        ("res_units", "RESIDENTIAL UNITS"),
        ("comm_units", "COMMERCIAL UNITS"),
        ("total_units", "TOTAL UNITS"),
    ):
        out[alias] = out[source] if source in out.columns else 0
    if "SALE PRICE" in out.columns:
        out["sale_price"] = out["SALE PRICE"]

    return out
|
|
|
|
|
|
# Apply the shared cleaning routine to each borough slice.
manhattan = clean_borough(manhattan_raw)
brooklyn = clean_borough(brooklyn_raw)

print(f"clean manhattan rows: {len(manhattan)}")
print(f"clean brooklyn rows: {len(brooklyn)}")
|
|
|
|
|
|
# ---- manhattan exploratory data analysis ----

# Summary statistics for the target variable.
summary_manhattan_price = manhattan["sale_price"].describe()
print("summary of manhattan sale_price:")
print(summary_manhattan_price)

quantiles_manhattan = manhattan["sale_price"].quantile(
    [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
)
print("selected quantiles for manhattan sale_price:")
print(quantiles_manhattan)

# Tukey fences built from the interquartile range.
q1 = quantiles_manhattan.loc[0.25]
q3 = quantiles_manhattan.loc[0.75]
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

print(f"manhattan iqr upper bound: {upper_bound}")
print(
    f"manhattan max sale price: {manhattan['sale_price'].max(skipna=True)}"
)

outlier_mask = (
    (manhattan["sale_price"] < lower_bound)
    | (manhattan["sale_price"] > upper_bound)
)
print(f"number of sale price outliers: {outlier_mask.sum()}")

# Pearson correlation of the target with the other numeric features.
num_cols = [
    "sale_price", "gross_sqft", "land_sqft",
    "year_built", "res_units", "comm_units", "total_units",
]
corr_manhattan = manhattan[num_cols].corr()
print("correlation with sale_price:")
print(corr_manhattan["sale_price"])
|
|
|
|
# formatter for comma-separated tick labels
|
|
|
|
|
|
def comma_format(x, pos):
    """Matplotlib tick formatter: render *x* as an integer with thousands separators.

    Parameters
    ----------
    x : float
        Tick value supplied by matplotlib.
    pos : int
        Tick position (required by the FuncFormatter signature; unused).

    Returns
    -------
    str
        e.g. 1234567 -> "1,234,567"; values int() cannot handle fall back
        to ``str(x)``.
    """
    try:
        return f"{int(x):,}"
    except (ValueError, TypeError, OverflowError):
        # Narrowed from a bare `except Exception`: int() raises ValueError
        # for NaN/non-numeric strings, OverflowError for +/-inf, TypeError
        # for non-numeric objects. Anything else should propagate.
        return str(x)
|
|
|
|
|
|
# ---- manhattan distribution plots ----

# Histogram of raw sale prices.
fig, ax = plt.subplots()
ax.hist(manhattan["sale_price"], bins=50)
ax.set(
    title="manhattan sale price distribution (raw)",
    xlabel="sale price (usd)",
    ylabel="count of sales",
)
ax.xaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()

# Histogram on the log1p scale (prices are heavily right-skewed).
fig, ax = plt.subplots()
ax.hist(np.log1p(manhattan["sale_price"]), bins=50)
ax.set(
    title="manhattan sale price distribution (log scale)",
    xlabel="log(1 + sale price)",
    ylabel="count of sales",
)
plt.tight_layout()
plt.show()

# Boxplot highlighting price outliers.
fig, ax = plt.subplots()
ax.boxplot(manhattan["sale_price"].values, vert=True)
ax.set(
    title="manhattan sale price with outliers",
    ylabel="sale price (usd)",
)
ax.set_xticks([])
ax.yaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()

# Gross square feet vs sale price on a log-scaled y axis.
fig, ax = plt.subplots()
ax.scatter(manhattan["gross_sqft"], manhattan["sale_price"], alpha=0.3)
ax.set_yscale("log")
ax.set(
    title="manhattan sale price vs gross square feet",
    xlabel="gross square feet",
    ylabel="sale price (log10-ish scale)",
)
plt.tight_layout()
plt.show()
|
|
|
|
# ---- manhattan regression analysis ----

reg_vars = [
    "land_sqft", "gross_sqft", "year_built",
    "res_units", "comm_units", "total_units",
]

# Model log(1 + price) to tame the long right tail of sale prices.
reg_df = manhattan[reg_vars + ["sale_price"]].dropna()
reg_df["log_price"] = np.log1p(reg_df["sale_price"])

X = reg_df[reg_vars]
y = reg_df["log_price"]

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# Ordinary least squares baseline on the log target.
lm = LinearRegression().fit(X_train_reg, y_train_reg)
lm_pred = lm.predict(X_test_reg)

r2_lm = r2_score(y_test_reg, lm_pred)
rmse_lm = np.sqrt(mean_squared_error(y_test_reg, lm_pred))
mae_lm = mean_absolute_error(y_test_reg, lm_pred)

print("\nlinear model (log price) metrics on manhattan:")
print(f"r2: {r2_lm:.4f} rmse: {rmse_lm:.4f} mae: {mae_lm:.4f}")
|
|
|
|
|
|
# Random forest on the same log target; capped leaves keep the trees modest.
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_features=3,
    max_leaf_nodes=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train_reg, y_train_reg)
rf_pred = rf_reg.predict(X_test_reg)

r2_rf, rmse_rf, mae_rf = (
    r2_score(y_test_reg, rf_pred),
    np.sqrt(mean_squared_error(y_test_reg, rf_pred)),
    mean_absolute_error(y_test_reg, rf_pred),
)

print("\nrandom forest (log price) metrics on manhattan:")
print(f"r2: {r2_rf:.4f} rmse: {rmse_rf:.4f} mae: {mae_rf:.4f}")
|
|
|
|
|
|
# ---- manhattan random-forest diagnostics ----

rf_diag_df = pd.DataFrame({"actual": y_test_reg.values, "predicted": rf_pred})
rf_diag_df["residual"] = rf_diag_df["actual"] - rf_diag_df["predicted"]

# Predicted vs actual with the identity line for reference.
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["actual"], rf_diag_df["predicted"], alpha=0.3)
axis_lo = min(rf_diag_df["actual"].min(), rf_diag_df["predicted"].min())
axis_hi = max(rf_diag_df["actual"].max(), rf_diag_df["predicted"].max())
ax.plot([axis_lo, axis_hi], [axis_lo, axis_hi], linestyle="--")
ax.set(
    title="random forest: predicted vs actual log(1 + sale price) (manhattan)",
    xlabel="actual log(1 + sale price)",
    ylabel="predicted log(1 + sale price)",
)
plt.tight_layout()
plt.show()

# Residuals vs predictions (should hover around zero with no trend).
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["predicted"], rf_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set(
    title="random forest residuals (manhattan)",
    xlabel="predicted log(1 + sale price)",
    ylabel="residual",
)
plt.tight_layout()
plt.show()
|
|
|
|
# ---- transfer: score the manhattan-trained forest on brooklyn ----

brook_reg_df = brooklyn[reg_vars + ["sale_price"]].dropna()
brook_reg_df["log_price"] = np.log1p(brook_reg_df["sale_price"])

X_brook = brook_reg_df[reg_vars]
y_brook = brook_reg_df["log_price"]

rf_pred_brook = rf_reg.predict(X_brook)

r2_rf_brook = r2_score(y_brook, rf_pred_brook)
rmse_rf_brook = np.sqrt(mean_squared_error(y_brook, rf_pred_brook))
mae_rf_brook = mean_absolute_error(y_brook, rf_pred_brook)

print(
    "\nrandom forest (log price) metrics on brooklyn "
    "(trained on manhattan):"
)
print(
    f"r2: {r2_rf_brook:.4f} rmse: {rmse_rf_brook:.4f} "
    f"mae: {mae_rf_brook:.4f}"
)

brook_diag_df = pd.DataFrame(
    {"actual": y_brook.values, "predicted": rf_pred_brook}
)
brook_diag_df["residual"] = (
    brook_diag_df["actual"] - brook_diag_df["predicted"]
)

# Predicted vs actual with the identity line for reference.
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["actual"], brook_diag_df["predicted"], alpha=0.3)
axis_lo = min(brook_diag_df["actual"].min(), brook_diag_df["predicted"].min())
axis_hi = max(brook_diag_df["actual"].max(), brook_diag_df["predicted"].max())
ax.plot([axis_lo, axis_hi], [axis_lo, axis_hi], linestyle="--")
ax.set(
    title="random forest: manhattan model on brooklyn (log price)",
    xlabel="actual log(1 + sale price) (brooklyn)",
    ylabel="predicted log(1 + sale price)",
)
plt.tight_layout()
plt.show()

# Residuals vs predictions on the transferred model.
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["predicted"], brook_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set(
    title="residuals on brooklyn using manhattan random forest",
    xlabel="predicted log(1 + sale price)",
    ylabel="residual",
)
plt.tight_layout()
plt.show()
|
|
|
|
|
|
# ---- classification: predict neighborhood from sale characteristics ----

clf_vars = [
    "sale_price", "land_sqft", "gross_sqft",
    "year_built", "res_units", "comm_units", "total_units",
]


def prepare_classification_df(
    df: pd.DataFrame, min_per_class: int = 100
) -> pd.DataFrame:
    """Build a modeling frame of (neighborhood, clf_vars) rows.

    Drops rows with missing labels, removes neighborhoods with fewer than
    ``min_per_class`` sales, drops rows with missing features, and casts
    the label to a pandas categorical.

    Raises
    ------
    ValueError
        If the input frame has no ``neighborhood`` column.
    """
    if "neighborhood" not in df.columns:
        raise ValueError("neighborhood column missing")

    subset = df.copy()
    subset = subset[subset["neighborhood"].notna()]

    # Keep only neighborhoods with enough sales to learn from.
    class_sizes = subset["neighborhood"].value_counts()
    frequent = class_sizes[class_sizes >= min_per_class].index
    subset = subset[subset["neighborhood"].isin(frequent)]

    subset = subset[["neighborhood"] + clf_vars].dropna()
    subset["neighborhood"] = subset["neighborhood"].astype("category")
    return subset
|
|
|
|
|
|
manhattan_clf = prepare_classification_df(manhattan, min_per_class=100)

print(f"\nmanhattan classification subset rows: {len(manhattan_clf)}")
print(
    "manhattan neighborhoods: "
    f"{manhattan_clf['neighborhood'].nunique()}"
)

X_clf = manhattan_clf[clf_vars]
y_clf = manhattan_clf["neighborhood"]

# Stratified split keeps the neighborhood mix comparable across folds.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    test_size=0.25,
    stratify=y_clf,
    random_state=RANDOM_STATE,
)
|
|
|
|
|
|
def macro_f1_score(y_true, y_pred) -> float:
    """Macro-averaged F1, mirroring caret::confusionMatrix byClass averaging."""
    scores = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    # precision_recall_fscore_support returns (precision, recall, f1, support).
    return float(scores[2])
|
|
|
|
|
|
# k-NN needs standardized features, so scaling lives inside the pipeline.
knn_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=7)),
    ]
)
knn_pipeline.fit(X_train_clf, y_train_clf)
knn_pred = knn_pipeline.predict(X_test_clf)

acc_knn = accuracy_score(y_test_clf, knn_pred)
macro_f1_knn = macro_f1_score(y_test_clf, knn_pred)

print(
    f"\nknn (manhattan) accuracy: {acc_knn:.4f} "
    f"macro f1: {macro_f1_knn:.4f}"
)
|
|
|
|
# Random forest classifier on the raw (unscaled) features.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_features=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
).fit(X_train_clf, y_train_clf)
rf_clf_pred = rf_clf.predict(X_test_clf)

acc_rf_clf, macro_f1_rf_clf = (
    accuracy_score(y_test_clf, rf_clf_pred),
    macro_f1_score(y_test_clf, rf_clf_pred),
)

print(
    "\nrandom forest classifier (manhattan) accuracy: "
    f"{acc_rf_clf:.4f} macro f1: {macro_f1_rf_clf:.4f}"
)
|
|
|
|
# Confusion matrix with a fixed label order (rows = truth, cols = prediction).
labels = sorted(y_clf.unique())
rf_cm_table = confusion_matrix(y_test_clf, rf_clf_pred, labels=labels)

print("rf confusion matrix shape:", rf_cm_table.shape)
print("rf confusion matrix (rows=true, cols=pred):")
print(rf_cm_table)
|
|
|
|
# Multinomial logistic regression baseline.
# NOTE: the explicit `multi_class="multinomial"` argument was removed — it is
# deprecated in scikit-learn >= 1.5 (scheduled for removal), and the lbfgs
# solver already fits a single multinomial model for multiclass targets by
# default, so behavior is unchanged.
logit_clf = LogisticRegression(
    max_iter=2000,   # generous cap so lbfgs converges on unscaled features
    solver="lbfgs",
    n_jobs=-1,
)
logit_clf.fit(X_train_clf, y_train_clf)
logit_pred = logit_clf.predict(X_test_clf)

acc_logit = accuracy_score(y_test_clf, logit_pred)
macro_f1_logit = macro_f1_score(y_test_clf, logit_pred)

print(
    "\nmultinomial logistic regression (manhattan) accuracy: "
    f"{acc_logit:.4f} macro f1: {macro_f1_logit:.4f}"
)
|
|
|
|
# ---- transfer: manhattan-trained classifiers applied to brooklyn ----

brooklyn_clf = prepare_classification_df(brooklyn, min_per_class=100)

print(f"\nbrooklyn classification subset rows: {len(brooklyn_clf)}")
print(
    "brooklyn neighborhoods: "
    f"{brooklyn_clf['neighborhood'].nunique()}"
)

X_brook_clf = brooklyn_clf[clf_vars]
y_brook_clf = brooklyn_clf["neighborhood"]

# Manhattan-trained models predicting on brooklyn rows.
knn_pred_brook = knn_pipeline.predict(X_brook_clf)
rf_pred_brook_clf = rf_clf.predict(X_brook_clf)
logit_pred_brook = logit_clf.predict(X_brook_clf)

# True brooklyn neighborhoods vs predicted manhattan neighborhoods.
tab_knn_brook = pd.crosstab(
    y_brook_clf, knn_pred_brook,
    rownames=["true"], colnames=["pred"],
)
tab_rf_brook = pd.crosstab(
    y_brook_clf, rf_pred_brook_clf,
    rownames=["true"], colnames=["pred"],
)
tab_logit_brook = pd.crosstab(
    y_brook_clf, logit_pred_brook,
    rownames=["true"], colnames=["pred"],
)

print("\ncontingency table dimensions (knn):", tab_knn_brook.shape)
print(tab_knn_brook)

print(
    "contingency table dimensions (random forest):",
    tab_rf_brook.shape,
)
print(tab_rf_brook)

print("contingency table dimensions (logit):", tab_logit_brook.shape)
print(tab_logit_brook)
|