# Third-party imports: numerics, dataframes, plotting, and scikit-learn
# models / metrics used throughout the script.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Single seed so train/test splits and the random forests are reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input CSV of NYC citywide annualized property sales.
file_path = "Given/NYC_Citywide_Annualized_Calendar_Sales_Update_20241107.csv"

# Columns kept from the raw file (only those actually present are selected).
cols_needed = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING CLASS CATEGORY",
    "TAX CLASS AS OF FINAL ROLL",
    "BLOCK",
    "LOT",
    "BUILDING CLASS AS OF FINAL ROLL",
    "ZIP CODE",
    "RESIDENTIAL UNITS",
    "COMMERCIAL UNITS",
    "TOTAL UNITS",
    "LAND SQUARE FEET",
    "GROSS SQUARE FEET",
    "YEAR BUILT",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASS AT TIME OF SALE",
    "SALE PRICE",
    "SALE DATE",
]
# loading...
# --- load the raw sales file and split off the two boroughs of interest ---
nyc = pd.read_csv(file_path, dtype=str, low_memory=False)
cols_present = [c for c in cols_needed if c in nyc.columns]
nyc = nyc[cols_present]

# force borough numeric so the integer comparisons below are reliable
nyc["BOROUGH"] = pd.to_numeric(nyc["BOROUGH"], errors="coerce")
manhattan_raw = nyc[nyc["BOROUGH"] == 1].copy()
brooklyn_raw = nyc[nyc["BOROUGH"] == 3].copy()
print(f"raw manhattan rows: {len(manhattan_raw)}")
print(f"raw brooklyn rows: {len(brooklyn_raw)}")


def clean_borough(df: pd.DataFrame, min_price: float = 10000.0) -> pd.DataFrame:
    """clean borough-level dataframe similarly to the r function."""
    df = df.copy()

    def parse_numeric(series: pd.Series) -> pd.Series:
        """numeric from char / factor with commas and junk values."""
        s = series.astype(str).str.strip()
        # sentinel strings that really mean "missing" in the source data
        s = s.replace(
            {
                "0": np.nan,
                "0.0": np.nan,
                "- 0": np.nan,
                "": np.nan,
                ".": np.nan,
                "NA": np.nan,
                "NaN": np.nan,
            }
        )
        # strip thousands separators before the final numeric coercion
        s = s.str.replace(",", "", regex=False)
        return pd.to_numeric(s, errors="coerce")

    unit_cols = ["RESIDENTIAL UNITS", "COMMERCIAL UNITS", "TOTAL UNITS"]

    # convert every money / size / year / unit-count column that exists
    numeric_cols = [
        "SALE PRICE",
        "LAND SQUARE FEET",
        "GROSS SQUARE FEET",
        "YEAR BUILT",
    ] + unit_cols
    for col in numeric_cols:
        if col in df.columns:
            df[col] = parse_numeric(df[col])

    # drop non-arms-length / tiny sales
    if "SALE PRICE" in df.columns:
        df = df[df["SALE PRICE"].notna()]
        df = df[df["SALE PRICE"] > min_price]

    # very old or zero years --> missing
    if "YEAR BUILT" in df.columns:
        df.loc[df["YEAR BUILT"] < 1800, "YEAR BUILT"] = np.nan

    # need usable size / year
    required_cols = ["GROSS SQUARE FEET", "LAND SQUARE FEET", "YEAR BUILT"]
    if all(c in df.columns for c in required_cols):
        usable = (
            df["GROSS SQUARE FEET"].notna()
            & df["LAND SQUARE FEET"].notna()
            & df["YEAR BUILT"].notna()
            & (df["GROSS SQUARE FEET"] > 0)
            & (df["LAND SQUARE FEET"] > 0)
        )
        df = df[usable]

    # fill missing units with 0
    for col in unit_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # rename neighborhood
    if "NEIGHBORHOOD" in df.columns:
        df = df.rename(columns={"NEIGHBORHOOD": "neighborhood"})

    # snake_case copies of the modeling columns (units default to 0 if absent)
    if "LAND SQUARE FEET" in df.columns:
        df["land_sqft"] = df["LAND SQUARE FEET"]
    if "GROSS SQUARE FEET" in df.columns:
        df["gross_sqft"] = df["GROSS SQUARE FEET"]
    if "YEAR BUILT" in df.columns:
        df["year_built"] = df["YEAR BUILT"]
    unit_aliases = [
        ("RESIDENTIAL UNITS", "res_units"),
        ("COMMERCIAL UNITS", "comm_units"),
        ("TOTAL UNITS", "total_units"),
    ]
    for src, dst in unit_aliases:
        df[dst] = df[src] if src in df.columns else 0
    if "SALE PRICE" in df.columns:
        df["sale_price"] = df["SALE PRICE"]
    return df


manhattan = clean_borough(manhattan_raw)
brooklyn = clean_borough(brooklyn_raw)
print(f"clean manhattan rows: {len(manhattan)}")
print(f"clean brooklyn rows: {len(brooklyn)}")

# manhattan exploratory data analysis

# summary stats for sale price
summary_manhattan_price = manhattan["sale_price"].describe()
print("summary of manhattan sale_price:")
print(summary_manhattan_price)

quantiles_manhattan = manhattan["sale_price"].quantile(
    [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
)
print("selected quantiles for manhattan sale_price:")
print(quantiles_manhattan)

# classic 1.5 * IQR outlier fence
q1 = quantiles_manhattan.loc[0.25]
q3 = quantiles_manhattan.loc[0.75]
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
print(f"manhattan iqr upper bound: {upper_bound}")
print(f"manhattan max sale price: {manhattan['sale_price'].max(skipna=True)}")

outlier_mask = (manhattan["sale_price"] < lower_bound) | (
    manhattan["sale_price"] > upper_bound
)
print(f"number of sale price outliers: {outlier_mask.sum()}")

# correlation with other numeric vars
num_cols = [
    "sale_price",
    "gross_sqft",
    "land_sqft",
    "year_built",
    "res_units",
    "comm_units",
    "total_units",
]
corr_manhattan = manhattan[num_cols].corr()
print("correlation with sale_price:")
print(corr_manhattan["sale_price"])


def comma_format(x, pos):
    """matplotlib tick formatter: integer with thousands separators."""
    try:
        return f"{int(x):,}"
    except Exception:
        # non-finite tick values fall back to their plain string form
        return str(x)


# histogram of raw sale prices
fig, ax = plt.subplots()
ax.hist(manhattan["sale_price"], bins=50)
ax.set_title("manhattan sale price distribution (raw)")
ax.set_xlabel("sale price (usd)")
ax.set_ylabel("count of sales")
ax.xaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()

# histogram of log(1 + sale price)
fig, ax = plt.subplots()
ax.hist(np.log1p(manhattan["sale_price"]), bins=50)
ax.set_title("manhattan sale price distribution (log scale)")
ax.set_xlabel("log(1 + sale price)")
ax.set_ylabel("count of sales")
plt.tight_layout()
plt.show()

# boxplot of sale price (for outliers)
fig, ax = plt.subplots()
ax.boxplot(manhattan["sale_price"].values, vert=True)
ax.set_title("manhattan sale price with outliers")
ax.set_ylabel("sale price (usd)")
ax.set_xticks([])
ax.yaxis.set_major_formatter(FuncFormatter(comma_format))
plt.tight_layout()
plt.show()

# scatter: gross sqft vs sale price (log y)
fig, ax = plt.subplots()
ax.scatter(manhattan["gross_sqft"], manhattan["sale_price"], alpha=0.3)
ax.set_yscale("log")
ax.set_title("manhattan sale price vs gross square feet")
ax.set_xlabel("gross square feet")
ax.set_ylabel("sale price (log10-ish scale)")
plt.tight_layout()
plt.show()

# manhattan regression analysis: model log(1 + price) from size/age/units
reg_vars = [
    "land_sqft",
    "gross_sqft",
    "year_built",
    "res_units",
    "comm_units",
    "total_units",
]
reg_df = manhattan[reg_vars + ["sale_price"]].dropna()
reg_df["log_price"] = np.log1p(reg_df["sale_price"])

X = reg_df[reg_vars]
y = reg_df["log_price"]
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# linear regression on log_price
lm = LinearRegression()
lm.fit(X_train_reg, y_train_reg)
lm_pred = lm.predict(X_test_reg)
r2_lm = r2_score(y_test_reg, lm_pred)
rmse_lm = np.sqrt(mean_squared_error(y_test_reg, lm_pred))
mae_lm = mean_absolute_error(y_test_reg, lm_pred)
print("\nlinear model (log price) metrics on manhattan:")
print(f"r2: {r2_lm:.4f} rmse: {rmse_lm:.4f} mae: {mae_lm:.4f}")

# random forest regression on log_price
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_features=3,
    max_leaf_nodes=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_reg.fit(X_train_reg, y_train_reg)
rf_pred = rf_reg.predict(X_test_reg)
r2_rf = r2_score(y_test_reg, rf_pred)
rmse_rf = np.sqrt(mean_squared_error(y_test_reg, rf_pred))
mae_rf = mean_absolute_error(y_test_reg, rf_pred)
print("\nrandom forest (log price) metrics on manhattan:")
print(f"r2: {r2_rf:.4f} rmse: {rmse_rf:.4f} mae: {mae_rf:.4f}")

# manhattan predicted vs actual plot
rf_diag_df = pd.DataFrame(
    {
        "actual": y_test_reg.values,
        "predicted": rf_pred,
    }
)
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["actual"], rf_diag_df["predicted"], alpha=0.3)
# dashed y = x reference line spanning the data range
min_val = min(rf_diag_df["actual"].min(), rf_diag_df["predicted"].min())
max_val = max(rf_diag_df["actual"].max(), rf_diag_df["predicted"].max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--")
ax.set_title(
    "random forest: predicted vs actual log(1 + sale price) (manhattan)"
)
ax.set_xlabel("actual log(1 + sale price)")
ax.set_ylabel("predicted log(1 + sale price)")
plt.tight_layout()
plt.show()

# manhattan residuals vs predicted plot
rf_diag_df["residual"] = rf_diag_df["actual"] - rf_diag_df["predicted"]
fig, ax = plt.subplots()
ax.scatter(rf_diag_df["predicted"], rf_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set_title("random forest residuals (manhattan)")
ax.set_xlabel("predicted log(1 + sale price)")
ax.set_ylabel("residual")
plt.tight_layout()
plt.show()

# use manhattan regression model on brooklyn
brook_reg_df = brooklyn[reg_vars + ["sale_price"]].dropna()
brook_reg_df["log_price"] = np.log1p(brook_reg_df["sale_price"])
X_brook = brook_reg_df[reg_vars]
y_brook = brook_reg_df["log_price"]

# score the manhattan-trained forest on brooklyn (transfer evaluation)
rf_pred_brook = rf_reg.predict(X_brook)
r2_rf_brook = r2_score(y_brook, rf_pred_brook)
rmse_rf_brook = np.sqrt(mean_squared_error(y_brook, rf_pred_brook))
mae_rf_brook = mean_absolute_error(y_brook, rf_pred_brook)
print(
    "\nrandom forest (log price) metrics on brooklyn "
    "(trained on manhattan):"
)
print(
    f"r2: {r2_rf_brook:.4f} rmse: {rmse_rf_brook:.4f} "
    f"mae: {mae_rf_brook:.4f}"
)

brook_diag_df = pd.DataFrame(
    {
        "actual": y_brook.values,
        "predicted": rf_pred_brook,
    }
)
brook_diag_df["residual"] = brook_diag_df["actual"] - brook_diag_df["predicted"]

# predicted vs actual with a dashed identity line
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["actual"], brook_diag_df["predicted"], alpha=0.3)
min_val = min(brook_diag_df["actual"].min(), brook_diag_df["predicted"].min())
max_val = max(brook_diag_df["actual"].max(), brook_diag_df["predicted"].max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--")
ax.set_title("random forest: manhattan model on brooklyn (log price)")
ax.set_xlabel("actual log(1 + sale price) (brooklyn)")
ax.set_ylabel("predicted log(1 + sale price)")
plt.tight_layout()
plt.show()

# residual plot for the transfer model
fig, ax = plt.subplots()
ax.scatter(brook_diag_df["predicted"], brook_diag_df["residual"], alpha=0.3)
ax.axhline(0.0, linestyle="--")
ax.set_title("residuals on brooklyn using manhattan random forest")
ax.set_xlabel("predicted log(1 + sale price)")
ax.set_ylabel("residual")
plt.tight_layout()
plt.show()

# classification: manhattan predict neighborhood
clf_vars = [
    "sale_price",
    "land_sqft",
    "gross_sqft",
    "year_built",
    "res_units",
    "comm_units",
    "total_units",
]


def prepare_classification_df(
    df: pd.DataFrame, min_per_class: int = 100
) -> pd.DataFrame:
    """prepare classification df similar to r code."""
    tmp = df.copy()
    if "neighborhood" not in tmp.columns:
        raise ValueError("neighborhood column missing")
    tmp = tmp[tmp["neighborhood"].notna()]
    # keep only neighborhoods with enough sales to learn from
    counts = tmp["neighborhood"].value_counts()
    keep = counts[counts >= min_per_class].index
    tmp = tmp[tmp["neighborhood"].isin(keep)]
    cols = ["neighborhood"] + clf_vars
    tmp = tmp[cols].dropna()
    tmp["neighborhood"] = tmp["neighborhood"].astype("category")
    return tmp
manhattan_clf = prepare_classification_df(manhattan, min_per_class=100)
print(f"\nmanhattan classification subset rows: {len(manhattan_clf)}")
print(
    "manhattan neighborhoods: "
    f"{manhattan_clf['neighborhood'].nunique()}"
)

X_clf = manhattan_clf[clf_vars]
y_clf = manhattan_clf["neighborhood"]
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf,
    y_clf,
    test_size=0.25,
    stratify=y_clf,
    random_state=RANDOM_STATE,
)


def macro_f1_score(y_true, y_pred) -> float:
    """macro f1 similar to caret::confusionMatrix byClass averaging."""
    _, _, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    return float(f1)


# k-nn classifier with scaling (knn is distance-based, so features are standardized)
knn_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=7)),
    ]
)
knn_pipeline.fit(X_train_clf, y_train_clf)
knn_pred = knn_pipeline.predict(X_test_clf)
acc_knn = accuracy_score(y_test_clf, knn_pred)
macro_f1_knn = macro_f1_score(y_test_clf, knn_pred)
print(
    f"\nknn (manhattan) accuracy: {acc_knn:.4f} "
    f"macro f1: {macro_f1_knn:.4f}"
)

# random forest classifier
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_features=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_clf.fit(X_train_clf, y_train_clf)
rf_clf_pred = rf_clf.predict(X_test_clf)
acc_rf_clf = accuracy_score(y_test_clf, rf_clf_pred)
macro_f1_rf_clf = macro_f1_score(y_test_clf, rf_clf_pred)
print(
    "\nrandom forest classifier (manhattan) accuracy: "
    f"{acc_rf_clf:.4f} macro f1: {macro_f1_rf_clf:.4f}"
)

# contingency table (rf example); fixed label order keeps rows/cols aligned
labels = sorted(y_clf.unique())
rf_cm_table = confusion_matrix(y_test_clf, rf_clf_pred, labels=labels)
print("rf confusion matrix shape:", rf_cm_table.shape)
print("rf confusion matrix (rows=true, cols=pred):")
print(rf_cm_table)
# multinomial logistic regression
# NOTE: the `multi_class` argument is deprecated in scikit-learn >= 1.5 and
# removed in later releases; with solver="lbfgs" a true multinomial model is
# already fit for multi-class targets, so dropping the argument preserves the
# previous behavior while staying compatible with current scikit-learn.
logit_clf = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    n_jobs=-1,
)
logit_clf.fit(X_train_clf, y_train_clf)
logit_pred = logit_clf.predict(X_test_clf)
acc_logit = accuracy_score(y_test_clf, logit_pred)
macro_f1_logit = macro_f1_score(y_test_clf, logit_pred)
print(
    "\nmultinomial logistic regression (manhattan) accuracy: "
    f"{acc_logit:.4f} macro f1: {macro_f1_logit:.4f}"
)

# use manhattan classifiers on brooklyn
brooklyn_clf = prepare_classification_df(brooklyn, min_per_class=100)
print(f"\nbrooklyn classification subset rows: {len(brooklyn_clf)}")
print(
    "brooklyn neighborhoods: "
    f"{brooklyn_clf['neighborhood'].nunique()}"
)

X_brook_clf = brooklyn_clf[clf_vars]
y_brook_clf = brooklyn_clf["neighborhood"]

# predictions from manhattan-trained models on brooklyn data
knn_pred_brook = knn_pipeline.predict(X_brook_clf)
rf_pred_brook_clf = rf_clf.predict(X_brook_clf)
logit_pred_brook = logit_clf.predict(X_brook_clf)

# contingency tables (true brooklyn neigh vs predicted manhattan neigh)
tab_knn_brook = pd.crosstab(
    y_brook_clf, knn_pred_brook, rownames=["true"], colnames=["pred"]
)
tab_rf_brook = pd.crosstab(
    y_brook_clf, rf_pred_brook_clf, rownames=["true"], colnames=["pred"]
)
tab_logit_brook = pd.crosstab(
    y_brook_clf, logit_pred_brook, rownames=["true"], colnames=["pred"]
)

print("\ncontingency table dimensions (knn):", tab_knn_brook.shape)
print(tab_knn_brook)
print(
    "contingency table dimensions (random forest):",
    tab_rf_brook.shape,
)
print(tab_rf_brook)
print("contingency table dimensions (logit):", tab_logit_brook.shape)
print(tab_logit_brook)