# Data-Analytics/Assignment VI/code.py
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    silhouette_score,
)
from sklearn.cluster import KMeans
# ensure imgs dir exists
os.makedirs("imgs", exist_ok=True)
# data loading
zip_path = "news+popularity+in+multiple+social+media+platforms.zip"
with zipfile.ZipFile(zip_path, "r") as zf:
with zf.open("Data/News_Final.csv") as f:
news = pd.read_csv(f)
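# News_Final.csv holds one row per article; per the dataset description it
# carries title/headline sentiment scores, Source, Topic, PublishDate, and the
# final popularity per platform (the column names used below are the source of
# truth)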
# basic cleaning
pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]
# encode -1 as missing
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan
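# optional sanity check (a sketch, not required output): fraction of articles
# with unknown popularity per platform after recoding -1 as missing
# print(news[pop_cols].isna().mean().round(3))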
# convert publishdate and add numeric time feature
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days
# log transform facebook popularity where available
news["log_Facebook"] = np.log1p(news["Facebook"])
# eda helpers (optional plotting)
def plot_eda():
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()

    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()

    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()

    sample = news.dropna(
        subset=["log_Facebook", "SentimentTitle"]
    ).sample(5000, random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("sentimenttitle")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()
# model 1: linear regression
def run_model_1():
    df = news.dropna(subset=["log_Facebook"]).copy()
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 1 linear regression")
    print("r2:", r2)
    print("rmse:", rmse)
    print("coefficients:")
    print(pd.Series(linreg.coef_, index=X.columns))
    # optional diagnostic plot
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.3)
    plt.xlabel("actual log1p(facebook)")
    plt.ylabel("predicted log1p(facebook)")
    plt.title("model 1: actual vs predicted")
    plt.tight_layout()
    plt.savefig("imgs/model1_actual_vs_predicted.png")
    plt.close()
    return linreg, (X_test, y_test, y_pred)
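
# interpretation sketch (an optional helper, not part of the assignment spec):
# with a log1p target, a coefficient b on a dummy feature corresponds roughly
# to a multiplicative factor of exp(b) on (1 + shares)
def coef_as_multipliers(model, columns):
    # hypothetical helper name; exponentiates coefficients for readability
    return pd.Series(np.exp(model.coef_), index=columns).sort_values()
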
# prepare economy + facebook time-slice data
with zipfile.ZipFile(zip_path, "r") as zf:
with zf.open("Data/Facebook_Economy.csv") as f:
fb_econ = pd.read_csv(f)
# ensure an integer id on the news side so the join keys line up
news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)
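# optional sanity check (a sketch): the inner join should keep only economy
# articles that have time-slice rows
# print(fb_econ.shape, news_econ.shape, fb_econ_merged.shape)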
# clean time-slice features
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0
# drop rows with missing facebook target
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])
ts_cols_early = ts_cols[:50]
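# per the dataset description the TS columns are 144 time slices of 20 minutes
# each (about two days of tracking), so TS1..TS50 cover roughly the first
# 16-17 hours after publication; treat the exact slice width as an assumption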
# model 2: random forest on raw early ts
def run_model_2():
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)
    # evaluate the raw-feature forest on held-out data (pca comes in model 3)
    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 2 random forest on raw ts")
    print("r2:", r2)
    print("rmse:", rmse)
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))
    return rf, (X_test, y_test, y_pred)
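
# caveat: impurity-based importances get diluted across the highly correlated
# TS columns; a minimal sketch of a less biased check, assuming the fitted
# forest and held-out split returned by run_model_2
def print_permutation_importance(rf, X_test, y_test):
    from sklearn.inspection import permutation_importance
    result = permutation_importance(
        rf, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1
    )
    imp = pd.Series(result.importances_mean, index=X_test.columns)
    print(imp.sort_values(ascending=False).head(10))
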
# model 3: pca + random forest
def run_model_3():
    ts = fb_econ_merged[ts_cols_early]
    sent = fb_econ_merged[["SentimentTitle", "SentimentHeadline"]]
    X = pd.concat([ts, sent], axis=1)
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # scale and reduce only the time-slice block; pass sentiment through as-is
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[ts_cols_early])
    X_test_scaled = scaler.transform(X_test[ts_cols_early])
    pca = PCA(n_components=10, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    train_sent = X_train[["SentimentTitle", "SentimentHeadline"]].values
    test_sent = X_test[["SentimentTitle", "SentimentHeadline"]].values
    X_train_final = np.hstack([X_train_pca, train_sent])
    X_test_final = np.hstack([X_test_pca, test_sent])
    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 3 random forest on pca(ts)")
    print("r2:", r2)
    print("rmse:", rmse)
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())
    return rf, (X_test, y_test, y_pred), (pca, scaler)
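
# optional sketch (not part of the assignment output): visualize cumulative
# variance to sanity-check the n_components=10 choice; pass the fitted PCA
# returned by run_model_3
def plot_pca_variance(pca):
    plt.figure()
    plt.plot(np.cumsum(pca.explained_variance_ratio_), marker="o")
    plt.xlabel("number of components")
    plt.ylabel("cumulative variance explained")
    plt.title("model 3: pca cumulative variance")
    plt.tight_layout()
    plt.savefig("imgs/model3_pca_variance.png")
    plt.close()
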
# model 4: logistic regression (viral vs non-viral)
def run_model_4():
    df = news.copy()
    df = df[df["Facebook"].notna()].copy()
    threshold = df["Facebook"].quantile(0.9)
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["viral_fb"]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )
    clf = LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    print("model 4 logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", acc)
    print("f1 (positive class):", f1)
    print("roc auc:", auc)
    print("confusion matrix:\n", cm)
    return clf, (X_test, y_test, y_pred, y_proba)
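
# note: with the cutoff at the 90th percentile, roughly 10% of articles are
# labeled viral, so a majority-class baseline already scores ~0.9 accuracy;
# f1 and roc auc above are the more informative numbers here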
# model 5: k-means clustering on ts shapes
def run_model_5():
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # cluster a fixed-size random sample to keep the silhouette computation cheap
    rng = np.random.RandomState(42)
    n_sample = min(5000, X_scaled.shape[0])
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_
    sil = silhouette_score(X_sample, labels)
    print("model 5 kmeans on ts shapes")
    print("silhouette score:", sil)
    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))
    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)
    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)
    return kmeans, scaler, summary
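
# optional sketch for choosing k, assuming a scaled sample like the one built
# in run_model_5 (k=3 above is an assumption, not a tuned value)
def sweep_kmeans_k(X_sample, ks=range(2, 7)):
    for k in ks:
        km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_sample)
        print(k, round(silhouette_score(X_sample, km.labels_), 3))
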
if __name__ == "__main__":
    run_model_1()
    run_model_2()
    run_model_3()
    run_model_4()
    run_model_5()
    plot_eda()