added assignment VI
This commit is contained in:
@@ -0,0 +1,363 @@
|
||||
import zipfile
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import (
|
||||
r2_score,
|
||||
root_mean_squared_error,
|
||||
accuracy_score,
|
||||
f1_score,
|
||||
roc_auc_score,
|
||||
confusion_matrix,
|
||||
silhouette_score,
|
||||
)
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
# ensure imgs dir exists before any plot is saved
os.makedirs("imgs", exist_ok=True)

# data loading: read the main news table straight out of the zip archive
# without extracting it to disk

zip_path = "news+popularity+in+multiple+social+media+platforms.zip"

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/News_Final.csv") as f:
        news = pd.read_csv(f)

# basic cleaning

# per-platform popularity (share-count) columns
pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]

# encode -1 as missing
# NOTE(review): all negative values are masked, presumably sentinels for
# "not measured" — confirm against the dataset description
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan

# convert publishdate and add numeric time feature (integer day count)
# that the regression models can consume directly
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
    news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days

# log transform facebook popularity where available; log1p keeps rows
# with zero shares finite (NaNs propagate through)
news["log_Facebook"] = np.log1p(news["Facebook"])
|
||||
|
||||
# eda helpers (optional plotting)
|
||||
|
||||
|
||||
def plot_eda():
    """Generate and save exploratory plots for the Facebook popularity data.

    Writes four PNGs into imgs/: a raw share-count histogram (log x-axis),
    a histogram of the log1p-transformed shares, mean popularity per
    topic, and a sentiment-vs-popularity scatter over a random sample.
    Reads the module-level ``news`` DataFrame; returns nothing.
    """
    # raw share counts are heavy-tailed, so use a log x-axis
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]  # zero shares cannot appear on a log scale
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    # the log1p target used by the regression models
    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    # compare average (log) popularity across topics
    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    # scatter a random sample so the plot stays readable; cap the sample
    # size at the number of eligible rows so .sample() cannot raise
    # ValueError on small inputs
    eligible = news.dropna(subset=["log_Facebook", "SentimentTitle"])
    sample = eligible.sample(min(5000, len(eligible)), random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()
|
||||
|
||||
# model 1: linear regression
|
||||
|
||||
|
||||
def run_model_1():
    """Baseline linear regression of log1p(Facebook shares).

    Features: title/headline sentiment, days since epoch, and one-hot
    topic dummies (first level dropped). Prints R^2, RMSE and the fitted
    coefficients, saves an actual-vs-predicted scatter, and returns the
    model together with the test split and its predictions.
    """
    df = news.dropna(subset=["log_Facebook"]).copy()

    feature_cols = ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    X = pd.get_dummies(df[feature_cols], columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred = linreg.predict(X_test)

    print("model 1 – linear regression")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=X.columns))

    # optional diagnostic plot
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()

    return linreg, (X_test, y_test, y_pred)
|
||||
|
||||
# prepare economy + facebook time-slice data

# read the per-article facebook time-slice table for the economy topic
# straight from the zip

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)

# ensure integer id for join
news["IDLink_int"] = news["IDLink"].astype(int)

news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)

# inner-join time slices onto article metadata; both frames carry an
# "IDLink" column, so pandas suffixes that overlapping name (_x/_y)
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)

# clean time-slice features: clamp negative slice values to zero
# NOTE(review): negatives presumably mark missing measurements — confirm
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0

# drop rows with missing facebook target, then recompute the log target
# on the merged frame
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])

# first 50 time slices = the "early popularity" signal for models 2/3/5
ts_cols_early = ts_cols[:50]
|
||||
|
||||
# model 2: random forest on raw early ts
|
||||
|
||||
|
||||
def run_model_2():
    """Random forest regression on the raw early time-slice features.

    Predicts log1p(Facebook shares) for economy articles from the first
    50 TS columns plus the two sentiment scores. Prints R^2, RMSE and
    the top-10 feature importances, and returns the fitted forest with
    the test split and its predictions.

    Bug fix: the previous version fitted ``rf`` on the raw features but
    then built and evaluated a separate scaler+PCA+forest pipeline, so
    the metrics printed under "random forest on raw ts" came from a
    different model (duplicating model 3) while ``rf``'s predictions
    were never used. Metrics now come from ``rf`` itself.
    """
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,  # mild regularisation against overfitting
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print("model 2 – random forest on raw ts")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))

    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))

    return rf, (X_test, y_test, y_pred)
|
||||
|
||||
# model 3: pca + random forest
|
||||
|
||||
|
||||
def run_model_3():
    """Random forest regression on PCA-compressed time-slice features.

    Standardises the first 50 TS columns, projects them onto 10
    principal components, appends the raw sentiment scores, and fits a
    random forest on the result. Prints R^2, RMSE and the PCA
    explained-variance ratios, and returns the model, the test split
    with predictions, and the fitted (pca, scaler) pair.
    """
    features = pd.concat(
        [
            fb_econ_merged[ts_cols_early],
            fb_econ_merged[["SentimentTitle", "SentimentHeadline"]],
        ],
        axis=1,
    )
    target = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # scale + project only the time-slice columns; sentiment scores are
    # passed through untransformed
    scaler = StandardScaler()
    pca = PCA(n_components=10, random_state=42)
    train_pca = pca.fit_transform(scaler.fit_transform(X_train[ts_cols_early]))
    test_pca = pca.transform(scaler.transform(X_test[ts_cols_early]))

    sent_cols = ["SentimentTitle", "SentimentHeadline"]
    X_train_final = np.hstack([train_pca, X_train[sent_cols].values])
    X_test_final = np.hstack([test_pca, X_test[sent_cols].values])

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)

    print("model 3 – random forest on pca(ts)")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())

    return rf, (X_test, y_test, y_pred), (pca, scaler)
|
||||
|
||||
# model 4: logistic regression (viral vs non-viral)
|
||||
|
||||
|
||||
def run_model_4():
    """Logistic regression classifying viral vs non-viral articles.

    An article counts as viral when its Facebook share count reaches the
    90th percentile of observed counts. Features mirror model 1
    (sentiment, publish date, topic dummies). Prints accuracy, F1, ROC
    AUC and the confusion matrix, and returns the classifier with the
    test split, predictions and positive-class probabilities.
    """
    df = news[news["Facebook"].notna()].copy()

    # label the top decile of share counts as "viral"
    threshold = df["Facebook"].quantile(0.9)
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)

    feature_cols = ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    X = pd.get_dummies(df[feature_cols], columns=["Topic"], drop_first=True)
    y = df["viral_fb"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,  # keep the skewed class ratio identical in both splits
    )

    # balanced class weights compensate for the ~9:1 label imbalance
    clf = LogisticRegression(max_iter=500, class_weight="balanced")
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    print("model 4 – logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", accuracy_score(y_test, y_pred))
    print("f1 (positive class):", f1_score(y_test, y_pred))
    print("roc auc:", roc_auc_score(y_test, y_proba))
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return clf, (X_test, y_test, y_pred, y_proba)
|
||||
|
||||
# model 5: k-means clustering on ts shapes
|
||||
|
||||
|
||||
def run_model_5():
    """K-means clustering of early time-slice share trajectories.

    Standardises the first 50 TS columns, clusters a random sample of
    rows into 3 groups, and prints the silhouette score, per-cluster
    Facebook popularity statistics, and a centroid summary. Returns the
    fitted KMeans model, the scaler, and the centroid summary frame.

    Robustness fix: the sample size is capped at the number of available
    rows, since ``choice(..., replace=False)`` raises ValueError when
    asked for more samples than exist.
    """
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    rng = np.random.RandomState(42)
    n_sample = min(5000, X_scaled.shape[0])  # cap: replace=False needs size <= n
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_

    sil = silhouette_score(X_sample, labels)
    print("model 5 – kmeans on ts shapes")
    print("silhouette score:", sil)

    # how does raw facebook popularity differ between the shape clusters?
    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))

    # map centroids back to the original (unscaled) share scale
    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)

    # a few representative slices per centroid for quick inspection
    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)

    return kmeans, scaler, summary
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # run every model in sequence, then produce the EDA figures
    steps = (run_model_1, run_model_2, run_model_3, run_model_4,
             run_model_5, plot_eda)
    for step in steps:
        step()
|
||||
Reference in New Issue
Block a user