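"""Exploratory analysis and baseline models for the "News Popularity in
Multiple Social Media Platforms" dataset.

The script reads News_Final.csv and Facebook_Economy.csv directly from the
downloaded zip archive, cleans the popularity columns, and fits five models:

1. linear regression of log1p(Facebook shares) on sentiment, date, and topic
2. random forest regression on the raw early time slices (economy articles)
3. random forest regression on PCA components of those time slices
4. logistic regression separating viral (top 10%) from non-viral articles
5. k-means clustering of early time-slice popularity shapes

All plots are written to the imgs/ directory.
"""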
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    silhouette_score,
)
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# ensure imgs dir exists
os.makedirs("imgs", exist_ok=True)

# data loading

zip_path = "news+popularity+in+multiple+social+media+platforms.zip"

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/News_Final.csv") as f:
        news = pd.read_csv(f)

# basic cleaning

pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]

# encode -1 as missing
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan

# convert publishdate and add numeric time feature
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
    news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days

# log transform facebook popularity where available
news["log_Facebook"] = np.log1p(news["Facebook"])

# eda helpers (optional plotting)

def plot_eda():
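    """Save four exploratory plots to imgs/: raw and log-transformed Facebook
    share histograms, mean popularity by topic, and a title-sentiment vs
    popularity scatter on a 5,000-row sample."""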
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    sample = news.dropna(
        subset=["log_Facebook", "SentimentTitle"]
    ).sample(5000, random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()


# model 1: linear regression

def run_model_1():
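    """Linear regression of log1p(Facebook shares) on title/headline
    sentiment, publish date (days since epoch), and one-hot topic dummies;
    prints R^2, RMSE, and the fitted coefficients."""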
    df = news.dropna(subset=["log_Facebook"]).copy()

    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    print("model 1 – linear regression")
    print("r2:", r2)
    print("rmse:", rmse)
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=X.columns))

    # optional diagnostic plot
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()

    return linreg, (X_test, y_test, y_pred)


# prepare economy + facebook time-slice data

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)

# ensure integer id for join
news["IDLink_int"] = news["IDLink"].astype(int)

news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)

fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)

# clean time-slice features
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0

# drop rows with missing facebook target
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])

ts_cols_early = ts_cols[:50]
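# note: ts_cols_early keeps only the first 50 TS* time-slice columns, i.e. the
# early part of each article's popularity measurements; the remaining slices
# are not used as features in the models below.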


# model 2: random forest on raw early ts

def run_model_2():
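    """Random forest regression of log1p(Facebook shares) on the raw early
    time slices plus sentiment (economy articles); a scaler + PCA pipeline
    is also fit for comparison."""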
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    print("model 2 – random forest on raw ts")
    print("r2:", r2)
    print("rmse:", rmse)

    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))

    # for comparison only: the same forest behind a scaler + pca pipeline
    # (model 3 below builds the pca version explicitly)
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=10, random_state=42)),
        ("rf", RandomForestRegressor(
            n_estimators=120,
            random_state=42,
            n_jobs=-1,
            max_depth=None,
            min_samples_leaf=2,
        )),
    ])
    pipe.fit(X_train, y_train)
    y_pred_pipe = pipe.predict(X_test)
    print("pipeline (scaler + pca + rf) r2:", r2_score(y_test, y_pred_pipe))
    print("pipeline (scaler + pca + rf) rmse:",
          root_mean_squared_error(y_test, y_pred_pipe))

    return rf, (X_test, y_test, y_pred)


# model 3: pca + random forest

def run_model_3():
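    """Random forest regression on the first 10 principal components of the
    standardised early time slices, with the two sentiment columns appended
    untransformed."""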
    ts = fb_econ_merged[ts_cols_early]
    sent = fb_econ_merged[["SentimentTitle", "SentimentHeadline"]]
    X = pd.concat([ts, sent], axis=1)
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[ts_cols_early])
    X_test_scaled = scaler.transform(X_test[ts_cols_early])

    pca = PCA(n_components=10, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    train_sent = X_train[["SentimentTitle", "SentimentHeadline"]].values
    test_sent = X_test[["SentimentTitle", "SentimentHeadline"]].values

    X_train_final = np.hstack([X_train_pca, train_sent])
    X_test_final = np.hstack([X_test_pca, test_sent])

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    print("model 3 – random forest on pca(ts)")
    print("r2:", r2)
    print("rmse:", rmse)
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())

    return rf, (X_test, y_test, y_pred), (pca, scaler)


# model 4: logistic regression (viral vs non-viral)

def run_model_4():
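    """Logistic regression separating viral articles (top 10% of Facebook
    shares) from the rest using sentiment, publish date, and topic dummies;
    class weights are balanced because the positive class is rare."""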
    df = news.copy()
    df = df[df["Facebook"].notna()].copy()

    threshold = df["Facebook"].quantile(0.9)
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)

    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["viral_fb"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    clf = LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)

    print("model 4 – logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", acc)
    print("f1 (positive class):", f1)
    print("roc auc:", auc)
    print("confusion matrix:\n", cm)

    return clf, (X_test, y_test, y_pred, y_proba)


# model 5: k-means clustering on ts shapes

def run_model_5():
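    """K-means (k=3) on the standardised early time slices of a 5,000-article
    sample; reports the silhouette score, per-cluster Facebook popularity
    statistics, and a centroid summary at selected time slices."""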
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    rng = np.random.RandomState(42)
    idx = rng.choice(X_scaled.shape[0], size=5000, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_

    sil = silhouette_score(X_sample, labels)
    print("model 5 – kmeans on ts shapes")
    print("silhouette score:", sil)

    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))

    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)

    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)

    return kmeans, scaler, summary

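# example usage (assuming this file is saved as, e.g., analysis.py):
#   python analysis.py          # runs all five models and the eda plots
# or, interactively:
#   from analysis import run_model_1
#   linreg, (X_test, y_test, y_pred) = run_model_1()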
if __name__ == "__main__":
    run_model_1()
    run_model_2()
    run_model_3()
    run_model_4()
    run_model_5()
    plot_eda()