import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    r2_score,
    roc_auc_score,
    root_mean_squared_error,  # requires scikit-learn >= 1.4
    silhouette_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ensure imgs dir exists
os.makedirs("imgs", exist_ok=True)

# data loading
zip_path = "news+popularity+in+multiple+social+media+platforms.zip"
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/News_Final.csv") as f:
        news = pd.read_csv(f)

# basic cleaning: the popularity columns encode "unknown" as -1,
# so treat negative values as missing
pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan

# convert publish date and add a numeric time feature
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
    news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days

# log transform facebook popularity where available
news["log_Facebook"] = np.log1p(news["Facebook"])


# eda helpers (optional plotting)
def plot_eda():
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    sample = news.dropna(
        subset=["log_Facebook", "SentimentTitle"]
    ).sample(5000, random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()


# model 1: linear regression on sentiment, publish time, and topic dummies
def run_model_1():
    # drop rows without a target; also guard against missing sentiment scores
    df = news.dropna(
        subset=["log_Facebook", "SentimentTitle", "SentimentHeadline"]
    ).copy()
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 1 – linear regression")
    print("r2:", r2)
    print("rmse:", rmse)
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=X.columns))

    # optional diagnostic plot
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()

    return linreg, (X_test, y_test, y_pred)
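
# optional robustness check for model 1 (a minimal sketch, not part of the
# original pipeline; the helper name `cross_validate_model_1` is ours and it
# is not called from __main__): a single 80/20 split can be noisy, so this
# re-scores the same linear model with k-fold cross-validation. call it
# manually, e.g. `cross_validate_model_1()` from an interactive session.
def cross_validate_model_1(n_splits=5):
    from sklearn.model_selection import KFold, cross_val_score

    df = news.dropna(
        subset=["log_Facebook", "SentimentTitle", "SentimentHeadline"]
    ).copy()
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(LinearRegression(), X, y, cv=cv, scoring="r2")
    print("model 1 cv r2 per fold:", np.round(scores, 4))
    print(f"model 1 cv r2 mean +/- std: "
          f"{scores.mean():.4f} +/- {scores.std():.4f}")
    return scores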
# prepare economy + facebook time-slice data
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)

# ensure an integer id on the news side for the join
news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)

# clean time-slice features: clip the negative "missing" codes to 0
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0

# drop rows with missing facebook target
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])

# early-window slices used as features
ts_cols_early = ts_cols[:50]


# model 2: random forest on raw early ts
def run_model_2():
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 2 – random forest on raw ts")
    print("r2:", r2)
    print("rmse:", rmse)
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))
    return rf, (X_test, y_test, y_pred)


# model 3: pca + random forest
def run_model_3():
    ts = fb_econ_merged[ts_cols_early]
    sent = fb_econ_merged[["SentimentTitle", "SentimentHeadline"]]
    X = pd.concat([ts, sent], axis=1)
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # scale + project only the time-slice block; sentiment passes through
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[ts_cols_early])
    X_test_scaled = scaler.transform(X_test[ts_cols_early])
    pca = PCA(n_components=10, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    train_sent = X_train[["SentimentTitle", "SentimentHeadline"]].values
    test_sent = X_test[["SentimentTitle", "SentimentHeadline"]].values
    X_train_final = np.hstack([X_train_pca, train_sent])
    X_test_final = np.hstack([X_test_pca, test_sent])

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 3 – random forest on pca(ts)")
    print("r2:", r2)
    print("rmse:", rmse)
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())
    return rf, (X_test, y_test, y_pred), (pca, scaler)
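
# optional diagnostic for model 3 (a minimal sketch; the helper name
# `plot_pca_scree` is ours and is not called from __main__): refits the
# scaler + pca on the full early time-slice block and saves a cumulative
# explained-variance curve, to sanity-check the n_components=10 cut-off.
def plot_pca_scree(n_components=30):
    X_scaled = StandardScaler().fit_transform(fb_econ_merged[ts_cols_early])
    pca = PCA(
        n_components=min(n_components, len(ts_cols_early)), random_state=42
    )
    pca.fit(X_scaled)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    plt.figure()
    plt.plot(range(1, len(cumvar) + 1), cumvar, marker="o")
    plt.axvline(10, linestyle="--")  # the cut-off used in model 3
    plt.xlabel("number of components")
    plt.ylabel("cumulative explained variance")
    plt.title("pca scree for early ts features")
    plt.tight_layout()
    plt.savefig("imgs/pca_scree.png")
    plt.close()
    return cumvar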
# model 4: logistic regression (viral vs non-viral)
def run_model_4():
    # keep rows with a known facebook count and sentiment scores
    df = news.dropna(
        subset=["Facebook", "SentimentTitle", "SentimentHeadline"]
    ).copy()
    threshold = df["Facebook"].quantile(0.9)  # top decile counts as "viral"
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["viral_fb"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    clf = LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    print("model 4 – logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", acc)
    print("f1 (positive class):", f1)
    print("roc auc:", auc)
    print("confusion matrix:\n", cm)
    return clf, (X_test, y_test, y_pred, y_proba)


# model 5: k-means clustering on ts shapes
def run_model_5():
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # cluster a fixed random subsample to keep the silhouette
    # computation affordable
    rng = np.random.RandomState(42)
    n_sample = min(5000, X_scaled.shape[0])
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_
    sil = silhouette_score(X_sample, labels)
    print("model 5 – kmeans on ts shapes")
    print("silhouette score:", sil)

    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))

    # map centroids back to the original scale for interpretation
    centers = scaler.inverse_transform(kmeans.cluster_centers_)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)
    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)
    return kmeans, scaler, summary


if __name__ == "__main__":
    run_model_1()
    run_model_2()
    run_model_3()
    run_model_4()
    run_model_5()
    plot_eda()
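
# optional diagnostic for model 5 (a minimal sketch; the helper name
# `kmeans_k_scan` is ours and is not called from __main__ above, so the
# script's default behaviour is unchanged): compares silhouette scores
# across several cluster counts to sanity-check the choice of k=3.
def kmeans_k_scan(k_values=(2, 3, 4, 5, 6), sample_size=5000):
    X_scaled = StandardScaler().fit_transform(
        fb_econ_merged[ts_cols_early].values
    )
    rng = np.random.RandomState(42)
    n_sample = min(sample_size, X_scaled.shape[0])
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    for k in k_values:
        labels = KMeans(
            n_clusters=k, random_state=42, n_init=10
        ).fit_predict(X_sample)
        print(f"k={k}: silhouette={silhouette_score(X_sample, labels):.3f}")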