# Data-Analytics/Assignment VI/code.py
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    r2_score,
    root_mean_squared_error,
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    silhouette_score,
)
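# note: root_mean_squared_error was added in scikit-learn 1.4; on older
# versions, mean_squared_error(y_true, y_pred, squared=False) is equivalent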
from sklearn.cluster import KMeans
# ensure imgs dir exists
os.makedirs("imgs", exist_ok=True)
# data loading
zip_path = "news+popularity+in+multiple+social+media+platforms.zip"
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/News_Final.csv") as f:
        news = pd.read_csv(f)
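# News_Final.csv (UCI "News Popularity in Multiple Social Media Platforms")
# holds one row per article: IDLink, Title, Headline, Source, Topic,
# PublishDate, SentimentTitle/SentimentHeadline, and final Facebook,
# GooglePlus, and LinkedIn popularity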
# basic cleaning
pop_cols = ["Facebook", "GooglePlus", "LinkedIn"]
# encode -1 as missing
for col in pop_cols:
    news.loc[news[col] < 0, col] = np.nan
# convert publishdate and add numeric time feature
news["PublishDate"] = pd.to_datetime(news["PublishDate"])
news["DaysSinceEpoch"] = (
news["PublishDate"] - pd.Timestamp("1970-01-01")
).dt.days
# log transform facebook popularity where available
news["log_Facebook"] = np.log1p(news["Facebook"])
# eda helpers (optional plotting)
def plot_eda():
    plt.figure()
    vals = news["Facebook"].dropna()
    vals = vals[vals > 0]
    vals.plot.hist(bins=50)
    plt.xlabel("facebook shares")
    plt.ylabel("count")
    plt.title("distribution of facebook popularity")
    plt.xscale("log")
    plt.tight_layout()
    plt.savefig("imgs/eda_facebook_hist.png")
    plt.close()
    plt.figure()
    news["log_Facebook"].dropna().plot.hist(bins=50)
    plt.xlabel("log1p(facebook shares)")
    plt.ylabel("count")
    plt.title("distribution of log-transformed facebook popularity")
    plt.tight_layout()
    plt.savefig("imgs/eda_log_facebook_hist.png")
    plt.close()
    mean_by_topic = (
        news.groupby("Topic")["log_Facebook"].mean().sort_values()
    )
    plt.figure()
    mean_by_topic.plot(kind="bar")
    plt.ylabel("mean log1p(facebook shares)")
    plt.title("average facebook popularity by topic")
    plt.tight_layout()
    plt.savefig("imgs/eda_mean_by_topic.png")
    plt.close()
    sample = news.dropna(
        subset=["log_Facebook", "SentimentTitle"]
    ).sample(5000, random_state=42)
    plt.figure()
    plt.scatter(
        sample["SentimentTitle"],
        sample["log_Facebook"],
        alpha=0.3,
    )
    plt.xlabel("title sentiment")
    plt.ylabel("log1p(facebook shares)")
    plt.title("title sentiment vs facebook popularity (sample)")
    plt.tight_layout()
    plt.savefig("imgs/eda_sentiment_vs_popularity.png")
    plt.close()
# model 1: linear regression
def run_model_1():
    df = news.dropna(subset=["log_Facebook"]).copy()
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 1 linear regression")
    print("r2:", r2)
    print("rmse:", rmse)
print("coefficients:")
print(pd.Series(linreg.coef_, index=X.columns))
# optional diagnostic plot
plt.figure()
plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("actual log1p(facebook)")
plt.ylabel("predicted log1p(facebook)")
plt.title("model 1: actual vs predicted")
plt.tight_layout()
plt.savefig("imgs/model1_actual_vs_predicted.png")
plt.close()
return linreg, (X_test, y_test, y_pred)
# prepare economy + facebook time-slice data
with zipfile.ZipFile(zip_path, "r") as zf:
    with zf.open("Data/Facebook_Economy.csv") as f:
        fb_econ = pd.read_csv(f)
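# Facebook_Economy.csv carries IDLink plus TS1..TS144: the article's Facebook
# popularity sampled over its first two days (per the UCI dataset description)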
# ensure integer id on the news side for the join
news_econ = news[news["Topic"] == "economy"].copy()
news_econ["IDLink_int"] = news_econ["IDLink"].astype(int)
fb_econ_merged = fb_econ.merge(
    news_econ, left_on="IDLink", right_on="IDLink_int", how="inner"
)
# clean time-slice features
ts_cols = [c for c in fb_econ.columns if c.startswith("TS")]
for col in ts_cols:
    fb_econ_merged.loc[fb_econ_merged[col] < 0, col] = 0
# drop rows with missing facebook target
fb_econ_merged = fb_econ_merged[fb_econ_merged["Facebook"].notna()].copy()
fb_econ_merged["log_Facebook"] = np.log1p(fb_econ_merged["Facebook"])
ts_cols_early = ts_cols[:50]
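# assuming the documented 20-minute slicing, the first 50 slices cover
# roughly the first 16-17 hours after publication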
# model 2: random forest on raw early ts
def run_model_2():
    X = fb_econ_merged[ts_cols_early + ["SentimentTitle", "SentimentHeadline"]]
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train, y_train)
    # evaluate the forest on the raw early time slices; the scaled
    # PCA variant is model 3
    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 2 random forest on raw ts")
    print("r2:", r2)
    print("rmse:", rmse)
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    print("top importances:")
    print(importances.sort_values(ascending=False).head(10))
    return rf, (X_test, y_test, y_pred)
# model 3: pca + random forest
def run_model_3():
    ts = fb_econ_merged[ts_cols_early]
    sent = fb_econ_merged[["SentimentTitle", "SentimentHeadline"]]
    X = pd.concat([ts, sent], axis=1)
    y = fb_econ_merged["log_Facebook"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
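    # the scaler and pca are fit on the training split only, so no
    # test-set information leaks into the transforms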
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[ts_cols_early])
    X_test_scaled = scaler.transform(X_test[ts_cols_early])
    pca = PCA(n_components=10, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    train_sent = X_train[["SentimentTitle", "SentimentHeadline"]].values
    test_sent = X_test[["SentimentTitle", "SentimentHeadline"]].values
    X_train_final = np.hstack([X_train_pca, train_sent])
    X_test_final = np.hstack([X_test_pca, test_sent])
    rf = RandomForestRegressor(
        n_estimators=120,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=2,
    )
    rf.fit(X_train_final, y_train)
    y_pred = rf.predict(X_test_final)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print("model 3 random forest on pca(ts)")
    print("r2:", r2)
    print("rmse:", rmse)
    print("pca variance explained (first 10):", pca.explained_variance_ratio_)
    print("total variance explained:", pca.explained_variance_ratio_.sum())
    return rf, (X_test, y_test, y_pred), (pca, scaler)
# model 4: logistic regression (viral vs non-viral)
def run_model_4():
    df = news.copy()
    df = df[df["Facebook"].notna()].copy()
    threshold = df["Facebook"].quantile(0.9)
    df["viral_fb"] = (df["Facebook"] >= threshold).astype(int)
    X = df[["SentimentTitle", "SentimentHeadline", "DaysSinceEpoch", "Topic"]]
    X = pd.get_dummies(X, columns=["Topic"], drop_first=True)
    y = df["viral_fb"]
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )
    clf = LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    print("model 4 logistic regression (viral vs non-viral)")
    print("threshold (shares):", threshold)
    print("accuracy:", acc)
    print("f1 (positive class):", f1)
    print("roc auc:", auc)
    print("confusion matrix:\n", cm)
    return clf, (X_test, y_test, y_pred, y_proba)
# model 5: k-means clustering on ts shapes
def run_model_5():
    X = fb_econ_merged[ts_cols_early].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    rng = np.random.RandomState(42)
    # cap the sample at the number of available rows so the
    # no-replacement draw cannot fail on a small merge
    n_sample = min(5000, X_scaled.shape[0])
    idx = rng.choice(X_scaled.shape[0], size=n_sample, replace=False)
    X_sample = X_scaled[idx]
    fb_sample = fb_econ_merged["Facebook"].values[idx]
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(X_sample)
    labels = kmeans.labels_
    sil = silhouette_score(X_sample, labels)
    print("model 5 kmeans on ts shapes")
    print("silhouette score:", sil)
    cluster_df = pd.DataFrame(
        {"cluster": labels, "Facebook": fb_sample}
    )
    print(cluster_df.groupby("cluster")["Facebook"].agg(
        ["count", "mean", "median", "max"]
    ))
    centers_scaled = kmeans.cluster_centers_
    centers = scaler.inverse_transform(centers_scaled)
    centers_df = pd.DataFrame(centers, columns=ts_cols_early)
    summary = pd.DataFrame({
        "cluster": list(range(centers_df.shape[0])),
        "avg_ts": centers_df.mean(axis=1),
        "ts1": centers_df["TS1"],
        "ts10": centers_df["TS10"],
        "ts25": centers_df["TS25"],
        "ts50": centers_df["TS50"],
    })
    print("cluster centroid summary:\n", summary)
    return kmeans, scaler, summary
if __name__ == "__main__":
    run_model_1()
    run_model_2()
    run_model_3()
    run_model_4()
    run_model_5()
    plot_eda()