364 lines
9.2 KiB
Python
364 lines
9.2 KiB
Python
|
|
import zipfile
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
import matplotlib.pyplot as plt
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
from sklearn.model_selection import train_test_split
|
|||
|
|
from sklearn.linear_model import LinearRegression, LogisticRegression
|
|||
|
|
from sklearn.ensemble import RandomForestRegressor
|
|||
|
|
from sklearn.decomposition import PCA
|
|||
|
|
from sklearn.preprocessing import StandardScaler
|
|||
|
|
from sklearn.metrics import (
|
|||
|
|
r2_score,
|
|||
|
|
root_mean_squared_error,
|
|||
|
|
accuracy_score,
|
|||
|
|
f1_score,
|
|||
|
|
roc_auc_score,
|
|||
|
|
confusion_matrix,
|
|||
|
|
silhouette_score,
|
|||
|
|
)
|
|||
|
|
from sklearn.pipeline import Pipeline
|
|||
|
|
from sklearn.cluster import KMeans
|
|||
|
|
|
|||
|
|
# Ensure the output directory for generated figures exists up front;
# matplotlib's savefig does not create missing directories.
os.makedirs("imgs", exist_ok=True)
|
|||
|
|
|
|||
|
|
# ---- data loading ----

zip_path = "news+popularity+in+multiple+social+media+platforms.zip"

# Read the main article table straight out of the archive without
# extracting it to disk first.
with zipfile.ZipFile(zip_path, "r") as zf, zf.open("Data/News_Final.csv") as f:
    news = pd.read_csv(f)

# ---- basic cleaning ----

pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]

# The dataset encodes "popularity unknown" as -1; treat negatives as missing.
for platform in pop_cols:
    unknown = news[platform] < 0
    news.loc[unknown, platform] = np.nan

# Parse the publish timestamp and derive a simple numeric time feature
# (whole days since the Unix epoch) that the regression models can use.
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
epoch = pd.Timestamp("1970-01-01")
news["DaysSinceEpoch"] = (news["PublishDate"] - epoch).dt.days

# Facebook share counts are heavy-tailed; keep a log1p-transformed copy
# (rows with a missing count stay NaN under log1p).
news["log_Facebook"] = np.log1p(news["Facebook"])
|
|||
|
|
|
|||
|
|
# eda helpers (optional plotting)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def plot_eda():
    """Generate and save exploratory plots for Facebook popularity.

    Writes four PNGs into ``imgs/``: a raw share-count histogram on a
    log x-axis, the log1p-transformed histogram, mean popularity per
    topic, and a sentiment-vs-popularity scatter of a subsample.
    Reads the module-level ``news`` DataFrame; returns nothing.
    """
    # Raw share counts are heavy-tailed, so use a log x-axis; zeros are
    # dropped because a log scale cannot display them.
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    # Same distribution after log1p -- the models' actual target.
    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    # Average (log) popularity per topic, sorted for readability.
    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    # Scatter a subsample so the plot stays readable. BUG FIX: cap the
    # sample size at the number of complete rows -- DataFrame.sample
    # raises ValueError when asked for more rows than exist.
    complete = news.dropna(subset=["log_Facebook", "SentimentTitle"])
    sample = complete.sample(min(5000, len(complete)), random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()
|
|||
|
|
|
|||
|
|
# model 1: linear regression
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_model_1():
    """Baseline OLS on sentiment, publish time, and topic dummies.

    Predicts log1p(Facebook shares) from the module-level ``news``
    DataFrame, prints R^2 / RMSE / coefficients, saves an
    actual-vs-predicted scatter, and returns the fitted model plus the
    held-out data and predictions.
    """
    data = news.dropna(subset=["log_Facebook"]).copy()

    features = data[
        ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    ]
    # One-hot the topic; drop_first avoids the dummy-variable trap.
    features = pd.get_dummies(features, columns=["Topic"], drop_first=True)
    target = data["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred = linreg.predict(X_test)

    print("model 1 – linear regression")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=features.columns))

    # Diagnostic: how tight is the fit along the identity line?
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()

    return linreg, (X_test, y_test, y_pred)
|
|||
|
|
|
|||
|
|
# ---- prepare economy + facebook time-slice data ----

# Per-article Facebook popularity sampled in time slices (TS1, TS2, ...).
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)

# Ensure an integer id for the join (IDLink may be parsed as float).
news["IDLink_int"] = news["IDLink"].astype(int)

# The economy subset copied below already carries IDLink_int from the
# assignment above, so the previous per-subset recompute was redundant
# and has been removed.
news_econ = news[news["Topic"] == "economy"].copy()

# NOTE(review): both frames contain an IDLink column, so this merge emits
# suffixed IDLink_x / IDLink_y columns; nothing downstream reads them,
# but confirm if they are ever exported.
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)

# Time-slice columns use negative values for "no observation"; clamp to 0.
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0

# Drop rows with a missing Facebook target and add the log target.
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])

# Only the first 50 slices (the early-popularity signal) feed the models.
ts_cols_early = ts_cols[:50]
|
|||
|
|
|
|||
|
|
# model 2: random forest on raw early ts
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_model_2():
    """Random forest regression on the raw early time-slice features.

    Uses the first 50 Facebook time slices plus the two sentiment scores
    to predict log1p(final Facebook shares) for economy articles. Prints
    R^2, RMSE, and the top-10 feature importances; returns the fitted
    forest plus held-out data and predictions.
    """
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)

    # BUG FIX: the reported metrics previously came from a separate
    # scaler+PCA+RF pipeline (a duplicate of model 3) that was fit here
    # and used for prediction, so the printed R^2/RMSE did not describe
    # the raw-TS forest whose importances are printed below. Predict
    # with the raw-TS forest itself.
    y_pred = rf.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    print("model 2 – random forest on raw ts")
    print("r2:", r2)
    print("rmse:", rmse)

    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))

    return rf, (X_test, y_test, y_pred)
|
|||
|
|
|
|||
|
|
# model 3: pca + random forest
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_model_3():
    """Random forest on PCA-compressed time slices plus raw sentiment.

    The first 50 time slices are standardized and reduced to 10
    principal components; the two sentiment columns are passed through
    unchanged. Prints R^2, RMSE, and the PCA explained-variance profile;
    returns the forest, held-out data, and the fitted (pca, scaler).
    """
    ts_block = fb_econ_merged[ts_cols_early]
    sent_block = fb_econ_merged[["SentimentTitle", "SentimentHeadline"]]
    X = pd.concat([ts_block, sent_block], axis=1)
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Standardize only the time-slice columns; sentiment stays raw.
    scaler = StandardScaler()
    ts_train = scaler.fit_transform(X_train[ts_cols_early])
    ts_test = scaler.transform(X_test[ts_cols_early])

    pca = PCA(n_components=10, random_state=42)
    pcs_train = pca.fit_transform(ts_train)
    pcs_test = pca.transform(ts_test)

    # Re-attach the untouched sentiment features to the components.
    sent_train = X_train[["SentimentTitle", "SentimentHeadline"]].values
    sent_test = X_test[["SentimentTitle", "SentimentHeadline"]].values
    X_train_final = np.hstack([pcs_train, sent_train])
    X_test_final = np.hstack([pcs_test, sent_test])

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)

    print("model 3 – random forest on pca(ts)")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())

    return rf, (X_test, y_test, y_pred), (pca, scaler)
|
|||
|
|
|
|||
|
|
# model 4: logistic regression (viral vs non-viral)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_model_4():
    """Logistic regression classifying viral vs non-viral articles.

    An article counts as "viral" when its Facebook share count reaches
    the 90th percentile. Features are the two sentiment scores, the
    publish-time feature, and topic dummies. Prints accuracy, F1,
    ROC AUC, and the confusion matrix; returns the classifier plus the
    held-out data, predictions, and probabilities.
    """
    labelled = news[news["Facebook"].notna()].copy()

    # Top decile of share counts defines the positive class.
    threshold = labelled["Facebook"].quantile(0.9)
    labelled["viral_fb"] = (labelled["Facebook"] >= threshold).astype(int)

    X = labelled[
        ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    ]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = labelled["viral_fb"]

    # Stratify so the ~10% positive rate is preserved in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # class_weight="balanced" counteracts the heavy class imbalance.
    clf = LogisticRegression(max_iter=500, class_weight="balanced")
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    print("model 4 – logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", accuracy_score(y_test, y_pred))
    print("f1 (positive class):", f1_score(y_test, y_pred))
    print("roc auc:", roc_auc_score(y_test, y_proba))
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return clf, (X_test, y_test, y_pred, y_proba)
|
|||
|
|
|
|||
|
|
# model 5: k-means clustering on ts shapes
|
|||
|
|
|
|||
|
|
|
|||
|
|
def run_model_5():
    """K-means clustering of early time-slice popularity shapes.

    Clusters a sample (up to 5000 rows) of standardized early time
    slices into 3 groups, reports the silhouette score, per-cluster
    Facebook popularity statistics, and a centroid summary. Returns the
    fitted KMeans, the scaler, and the summary DataFrame.
    """
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Subsample to keep the silhouette computation tractable. BUG FIX:
    # cap the sample size at the number of available rows -- choice()
    # with replace=False raises ValueError when size exceeds the
    # population.
    rng = np.random.RandomState(42)
    n_sample = min(5000, X_scaled.shape[0])
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_

    sil = silhouette_score(X_sample, labels)
    print("model 5 – kmeans on ts shapes")
    print("silhouette score:", sil)

    # Raw Facebook popularity broken down by cluster membership.
    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))

    # Map the centroids back to the original (unscaled) units so the
    # cluster shapes are interpretable as share counts.
    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)

    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)

    return kmeans, scaler, summary
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Run every model in order, then produce the EDA figures.
    for step in (
        run_model_1,
        run_model_2,
        run_model_3,
        run_model_4,
        run_model_5,
        plot_eda,
    ):
        step()
|