added assignment VI
This commit is contained in:
@@ -0,0 +1,363 @@
|
||||
import zipfile
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression, LogisticRegression
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.metrics import (
|
||||
r2_score,
|
||||
root_mean_squared_error,
|
||||
accuracy_score,
|
||||
f1_score,
|
||||
roc_auc_score,
|
||||
confusion_matrix,
|
||||
silhouette_score,
|
||||
)
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
# ensure imgs dir exists before any plot is saved
os.makedirs("imgs", exist_ok=True)

# data loading: read the main news table straight out of the zip archive
# without extracting it to disk

zip_path = "news+popularity+in+multiple+social+media+platforms.zip"

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/News_Final.csv") as f:
        news = pd.read_csv(f)

# basic cleaning

# per-platform popularity (share-count) columns
pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]

# encode -1 as missing
# NOTE(review): all negative values are masked, presumably sentinels for
# "not measured" — confirm against the dataset description
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan

# convert publishdate and add numeric time feature (integer day count)
# that the regression models can consume directly
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
    news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days

# log transform facebook popularity where available; log1p keeps rows
# with zero shares finite (NaNs propagate through)
news["log_Facebook"] = np.log1p(news["Facebook"])
|
||||
|
||||
# eda helpers (optional plotting)
|
||||
|
||||
|
||||
def plot_eda():
    """Generate and save exploratory plots for the Facebook popularity data.

    Writes four PNGs into imgs/: a raw share-count histogram (log x-axis),
    a histogram of the log1p-transformed shares, mean popularity per
    topic, and a sentiment-vs-popularity scatter over a random sample.
    Reads the module-level ``news`` DataFrame; returns nothing.
    """
    # raw share counts are heavy-tailed, so use a log x-axis
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]  # zero shares cannot appear on a log scale
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    # the log1p target used by the regression models
    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    # compare average (log) popularity across topics
    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    # scatter a random sample so the plot stays readable; cap the sample
    # size at the number of eligible rows so .sample() cannot raise
    # ValueError on small inputs
    eligible = news.dropna(subset=["log_Facebook", "SentimentTitle"])
    sample = eligible.sample(min(5000, len(eligible)), random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()
|
||||
|
||||
# model 1: linear regression
|
||||
|
||||
|
||||
def run_model_1():
    """Baseline linear regression of log1p(Facebook shares).

    Features: title/headline sentiment, days since epoch, and one-hot
    topic dummies (first level dropped). Prints R^2, RMSE and the fitted
    coefficients, saves an actual-vs-predicted scatter, and returns the
    model together with the test split and its predictions.
    """
    df = news.dropna(subset=["log_Facebook"]).copy()

    feature_cols = ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    X = pd.get_dummies(df[feature_cols], columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    linreg = LinearRegression().fit(X_train, y_train)
    y_pred = linreg.predict(X_test)

    print("model 1 – linear regression")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=X.columns))

    # optional diagnostic plot
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()

    return linreg, (X_test, y_test, y_pred)
|
||||
|
||||
# prepare economy + facebook time-slice data

# read the per-article facebook time-slice table for the economy topic
# straight from the zip

with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)

# ensure integer id for join
news["IDLink_int"] = news["IDLink"].astype(int)

news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)

# inner-join time slices onto article metadata; both frames carry an
# "IDLink" column, so pandas suffixes that overlapping name (_x/_y)
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)

# clean time-slice features: clamp negative slice values to zero
# NOTE(review): negatives presumably mark missing measurements — confirm
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0

# drop rows with missing facebook target, then recompute the log target
# on the merged frame
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])

# first 50 time slices = the "early popularity" signal for models 2/3/5
ts_cols_early = ts_cols[:50]
|
||||
|
||||
# model 2: random forest on raw early ts
|
||||
|
||||
|
||||
def run_model_2():
    """Random forest regression on the raw early time-slice features.

    Predicts log1p(Facebook shares) for economy articles from the first
    50 TS columns plus the two sentiment scores. Prints R^2, RMSE and
    the top-10 feature importances, and returns the fitted forest with
    the test split and its predictions.

    Bug fix: the previous version fitted ``rf`` on the raw features but
    then built and evaluated a separate scaler+PCA+forest pipeline, so
    the metrics printed under "random forest on raw ts" came from a
    different model (duplicating model 3) while ``rf``'s predictions
    were never used. Metrics now come from ``rf`` itself.
    """
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,  # mild regularisation against overfitting
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    print("model 2 – random forest on raw ts")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))

    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))

    return rf, (X_test, y_test, y_pred)
|
||||
|
||||
# model 3: pca + random forest
|
||||
|
||||
|
||||
def run_model_3():
    """Random forest regression on PCA-compressed time-slice features.

    Standardises the first 50 TS columns, projects them onto 10
    principal components, appends the raw sentiment scores, and fits a
    random forest on the result. Prints R^2, RMSE and the PCA
    explained-variance ratios, and returns the model, the test split
    with predictions, and the fitted (pca, scaler) pair.
    """
    features = pd.concat(
        [
            fb_econ_merged[ts_cols_early],
            fb_econ_merged[["SentimentTitle", "SentimentHeadline"]],
        ],
        axis=1,
    )
    target = fb_econ_merged["log_Facebook"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # scale + project only the time-slice columns; sentiment scores are
    # passed through untransformed
    scaler = StandardScaler()
    pca = PCA(n_components=10, random_state=42)
    train_pca = pca.fit_transform(scaler.fit_transform(X_train[ts_cols_early]))
    test_pca = pca.transform(scaler.transform(X_test[ts_cols_early]))

    sent_cols = ["SentimentTitle", "SentimentHeadline"]
    X_train_final = np.hstack([train_pca, X_train[sent_cols].values])
    X_test_final = np.hstack([test_pca, X_test[sent_cols].values])

    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)

    print("model 3 – random forest on pca(ts)")
    print("r2:", r2_score(y_test, y_pred))
    print("rmse:", root_mean_squared_error(y_test, y_pred))
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())

    return rf, (X_test, y_test, y_pred), (pca, scaler)
|
||||
|
||||
# model 4: logistic regression (viral vs non-viral)
|
||||
|
||||
|
||||
def run_model_4():
    """Logistic regression classifying viral vs non-viral articles.

    An article counts as viral when its Facebook share count reaches the
    90th percentile of observed counts. Features mirror model 1
    (sentiment, publish date, topic dummies). Prints accuracy, F1, ROC
    AUC and the confusion matrix, and returns the classifier with the
    test split, predictions and positive-class probabilities.
    """
    df = news[news["Facebook"].notna()].copy()

    # label the top decile of share counts as "viral"
    threshold = df["Facebook"].quantile(0.9)
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)

    feature_cols = ["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]
    X = pd.get_dummies(df[feature_cols], columns=["Topic"], drop_first=True)
    y = df["viral_fb"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,  # keep the skewed class ratio identical in both splits
    )

    # balanced class weights compensate for the ~9:1 label imbalance
    clf = LogisticRegression(max_iter=500, class_weight="balanced")
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]

    print("model 4 – logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", accuracy_score(y_test, y_pred))
    print("f1 (positive class):", f1_score(y_test, y_pred))
    print("roc auc:", roc_auc_score(y_test, y_proba))
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))

    return clf, (X_test, y_test, y_pred, y_proba)
|
||||
|
||||
# model 5: k-means clustering on ts shapes
|
||||
|
||||
|
||||
def run_model_5():
    """K-means clustering of early time-slice share trajectories.

    Standardises the first 50 TS columns, clusters a random sample of
    rows into 3 groups, and prints the silhouette score, per-cluster
    Facebook popularity statistics, and a centroid summary. Returns the
    fitted KMeans model, the scaler, and the centroid summary frame.

    Robustness fix: the sample size is capped at the number of available
    rows, since ``choice(..., replace=False)`` raises ValueError when
    asked for more samples than exist.
    """
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    rng = np.random.RandomState(42)
    n_sample = min(5000, X_scaled.shape[0])  # cap: replace=False needs size <= n
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_

    sil = silhouette_score(X_sample, labels)
    print("model 5 – kmeans on ts shapes")
    print("silhouette score:", sil)

    # how does raw facebook popularity differ between the shape clusters?
    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))

    # map centroids back to the original (unscaled) share scale
    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)

    # a few representative slices per centroid for quick inspection
    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)

    return kmeans, scaler, summary
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # run every model in sequence, then produce the EDA figures
    steps = (run_model_1, run_model_2, run_model_3, run_model_4,
             run_model_5, plot_eda)
    for step in steps:
        step()
|
||||
Reference in New Issue
Block a user