initial code commit

This commit is contained in:
2024-12-23 17:45:16 +02:00
commit 8bcd22e8c9
12 changed files with 3972 additions and 0 deletions
+9
View File
@@ -0,0 +1,9 @@
node_modules/
.venv/
secret/
data/
bdata/
temp_audio/
.env
temp.*
*.parquet
Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

+263
View File
@@ -0,0 +1,263 @@
// express server that walks the user through a YouTube OAuth flow,
// lists their playlists, and dumps a chosen playlist's tracks to JSON.
import express from 'express';
import { google } from 'googleapis';
import open from 'open';
import fs from 'fs';
import path from 'path';
import { tokenManager } from './tokenManager.js';
// load CLIENT_ID / CLIENT_SECRET / REDIRECT_URI from the secrets file
// (top-level await is fine: this file is an ES module)
(await import('dotenv')).config({
    path: './Downloader/secret/config.env',
    debug: true
});
const app = express();
const port = 3000;
// manager owns the oauth2 client and token persistence (tokenManager.js)
const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env,
    manager = new tokenManager({ clientId: CLIENT_ID, clientSecret: CLIENT_SECRET, redirectUri: REDIRECT_URI, tokenPath: 'Downloader/secret/token.json' });
const oauth2Client = manager.getAuthClient();
// scope to read playlist items/liked videos
const SCOPES = ['https://www.googleapis.com/auth/youtube.readonly'];
// shared across routes: /download-playlist writes it, /status reads it
let downloadStatus = 'idle'; // can be: 'idle', 'in-progress', 'completed', 'error'
//#region oauth flow
/**
 * Start the OAuth flow: reuse a saved token if one exists, otherwise
 * open the Google consent page in the user's default browser and wait
 * for that browser process to close before redirecting onward.
 */
app.get('/auth', async (_req, res) => {
    // a previously saved token means we can skip the consent screen
    const t = manager.loadToken();
    if (t) return res.redirect('/choose-playlist');
    // generate auth url (offline => we also receive a refresh token)
    const authUrl = oauth2Client.generateAuthUrl({
        access_type: 'offline',
        scope: SCOPES
    });
    // automatically open the url in the default browser.
    // FIX: the old .catch handler returned the express Response, and
    // execution then fell through to `.on('close', ...)` on that Response,
    // scheduling a redirect after the 500 had already been sent. Return
    // early on failure instead.
    let child;
    try {
        child = await open(authUrl);
    } catch (err) {
        console.error('error opening browser:', err);
        return res.status(500).send('failed to open browser for oauth.');
    }
    // when the spawned browser process exits, move on to playlist choice
    child.on('close', () => res.redirect('/choose-playlist'));
});
/**
 * OAuth2 redirect target: exchange the authorization code for tokens,
 * persist them via the manager, and answer 200 so the popup can close.
 */
app.get('/oauth2callback', async (req, res) => {
    const { code } = req.query;
    try {
        const { tokens } = await oauth2Client.getToken(code);
        oauth2Client.setCredentials(tokens);
        manager.saveToken(tokens);
        // nothing to render — the browser window is disposable
        res.sendStatus(200);
    } catch (err) {
        console.error('error retrieving token:', err);
        res.status(500).send('error retrieving token.');
    }
});
//#endregion
//#region youtube stuffs
/**
 * Fetch every playlist owned by the authenticated user, following
 * nextPageToken until the API reports no further pages.
 * @param auth - an authorized oauth2 client
 * @returns {Promise<Array>} playlist resources (snippet part only)
 */
async function getAllPlaylists(auth) {
    const youtube = google.youtube('v3');
    const collected = [];
    let pageToken = null;
    do {
        const { data } = await youtube.playlists.list({
            auth,
            part: 'snippet',
            mine: true,
            maxResults: 50,
            pageToken,
        });
        collected.push(...(data.items ?? []));
        pageToken = data.nextPageToken;
    } while (pageToken);
    return collected;
}
/**
 * Fetch every item of a playlist and map each to its YouTube Music URL.
 * @param {string} playlistId - id of the playlist to read
 * @param auth - an authorized oauth2 client
 * @returns {Promise<string[]>} music.youtube.com watch URLs
 */
async function getPlaylistItems(playlistId, auth) {
    const youtube = google.youtube('v3');
    let items = [];
    let nextPageToken = null;
    do {
        const response = await youtube.playlistItems.list({
            auth,
            part: 'snippet,contentDetails',
            playlistId,
            maxResults: 50,
            pageToken: nextPageToken
        });
        if (response.data.items) {
            items = items.concat(response.data.items);
        }
        nextPageToken = response.data.nextPageToken;
    } while (nextPageToken);
    // FIX: a playlistItem's `id` is the playlist-membership resource id,
    // NOT the video id, so the old `?v=${o.id}` URLs were broken. The
    // video id lives at contentDetails.videoId (requested above via
    // part=contentDetails).
    return items.map(o => `https://music.youtube.com/watch?v=${o.contentDetails.videoId}`);
}
//#endregion
//#region routes
// minimal HTML escaper for values interpolated into the page below;
// playlist titles come back from the API and can contain markup
// characters, which previously landed in the document unescaped (XSS).
const escapeHtml = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => (
        { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[ch]
    ));
/**
 * Render a page listing the user's playlists with a download button.
 * Redirects to /auth when no credentials are available.
 */
app.get('/choose-playlist', async (_req, res) => {
    try {
        // no live credentials -> try the saved token, else restart oauth
        if (!oauth2Client.credentials || !oauth2Client.credentials.access_token) {
            const t = manager.loadToken();
            if (!t) return res.redirect('/auth');
        }
        const playlists = await getAllPlaylists(oauth2Client);
        let html = `
<html>
<head>
<title>choose playlist</title>
<style>
body { font-family: sans-serif; }
#container { margin: 20px; }
select, button { margin-top: 10px; }
</style>
</head>
<body>
<div id="container">
<h1>choose a playlist to download</h1>
<select id="playlistSelect">
${playlists
    .map(
        (pl) =>
            `<option value="${escapeHtml(pl.id)}">${escapeHtml(pl.snippet.title)}</option>`
    )
    .join('')
}
</select>
<br/>
<button id="downloadBtn">download playlist</button>
</div>
<script>
// when the button is clicked, we'll navigate to /download-playlist?playlistId=...
const downloadBtn = document.querySelector('#downloadBtn');
const select = document.querySelector('#playlistSelect');
downloadBtn.addEventListener('click', () => {
    const chosenId = select.value;
    if (!chosenId) {
        alert('no playlist selected!');
        return;
    }
    // encode so ids survive as a single query parameter
    window.location.href = '/download-playlist?playlistId=' + encodeURIComponent(chosenId);
});
</script>
</body>
</html>
`;
        res.send(html);
    } catch (err) {
        console.error('error fetching playlists:', err);
        res.status(500).send('error fetching playlists.');
    }
});
/**
 * called when the user has selected a playlist from the popup
 * fetch all items, write them to a json file, and update the status
 */
app.get('/download-playlist', async (req, res) => {
    try {
        if (!oauth2Client.credentials || !oauth2Client.credentials.access_token) {
            return res
                .status(401)
                .send('error: oauth2 client not authorized. go to /auth first.');
        }
        const { playlistId } = req.query;
        if (!playlistId) {
            return res
                .status(400)
                .send('missing playlist id. please choose a playlist.');
        }
        // SECURITY FIX: playlistId is attacker-controlled and is
        // interpolated into a filesystem path below; restrict it to the
        // characters real playlist ids use so "../" can't escape data/.
        if (!/^[A-Za-z0-9_-]+$/.test(playlistId)) {
            return res.status(400).send('invalid playlist id.');
        }
        // set status to in-progress (read by /status)
        downloadStatus = 'in-progress';
        // fetch the playlist items
        const items = await getPlaylistItems(playlistId, oauth2Client);
        // create a data folder if it doesn't exist
        const dataDir = path.join(process.cwd(), 'data');
        if (!fs.existsSync(dataDir)) {
            fs.mkdirSync(dataDir);
        }
        const outFile = path.join(dataDir, `playlist_${playlistId}.json`);
        fs.writeFileSync(outFile, JSON.stringify(items, null, 2), 'utf8');
        downloadStatus = 'completed';
        res.send(`
<html>
<head><title>download complete</title></head>
<body>
<h1>download complete!</h1>
<p>downloaded ${items.length} items to <strong>${outFile}</strong></p>
<p><a href="/status" target="_blank">check status</a></p>
<script>window.close()</script>
</body>
</html>
`);
    } catch (err) {
        console.error('error downloading playlist:', err);
        downloadStatus = 'error';
        res.status(500).send('error downloading playlist.');
    }
});
// report the current module-level download state as a tiny html page;
// downloadStatus only ever holds one of the four known constants set in
// this file, so interpolating it directly is safe.
app.get('/status', (_req, res) => {
    res.send(`
<html>
<head>
<title>download status</title>
</head>
<body>
<h1>current status: ${downloadStatus}</h1>
</body>
</html>
`);
});
//#endregion
// boot the http server; the user starts the flow at /auth
app.listen(port, () => {
    console.log(`server listening on http://localhost:${port}`);
    console.log(`go to http://localhost:${port}/auth to start oauth flow`);
});
+54
View File
@@ -0,0 +1,54 @@
// Scratch/debug script: probes the YouTube Data API for liked videos,
// dumps channel info, and collects liked *music* titles into temp.json.
// NOTE(review): contains one-off debugging (a break on a specific song
// title) — looks intentional for diagnosing the API, not production code.
import { google } from "googleapis";
import fs from 'fs';
import { tokenManager } from "./tokenManager.js";
(await import('dotenv')).config({
    path: './secret/config.env',
    debug: true
});
const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env,
    manager = new tokenManager({ clientId: CLIENT_ID, clientSecret: CLIENT_SECRET, redirectUri: REDIRECT_URI, tokenPath: 'secret/token.json' });
// no saved token -> nothing to do; NOTE(review): throws a string, not an Error
if (!manager.loadToken()) throw 'LOAD TOKEN FAILED!';
// NOTE(review): `video` (single liked-video probe) is never used below
const youtube = google.youtube('v3'),
    video = await youtube.videos.list({
        auth: manager.getAuthClient(),
        part: 'snippet,contentDetails',
        myRating: 'like',
        maxResults: 1,
        // pageToken: nextPageToken
    });
// dump the authenticated user's channel info for inspection
const channelsinfo = (await (youtube.channels.list({ auth: manager.getAuthClient(), mine: true, part: 'snippet,contentDetails,statistics' }))).data;
fs.writeFileSync('channels.json', JSON.stringify(channelsinfo));
let likedMusic = [];
let nextPageToken = null;
// first, retrieve *all* liked videos
do {
    const response = await youtube.videos.list({
        auth: manager.getAuthClient(),
        part: 'snippet,contentDetails',
        myRating: 'like',
        maxResults: 50,
        pageToken: nextPageToken
    });
    if (response.data.items) {
        // categoryId '10' is the page's filter for music videos
        likedMusic = likedMusic.concat(response.data.items.filter(o => o.snippet?.categoryId === '10').map(o => o.snippet.title))
        // snippet.categoryId should be present under `video.snippet`
        // debug probe: stop early once a specific known song is found and
        // dump its full resource for inspection
        const t = response.data.items.find(video => video.snippet.title === 'Peeping Tom (feat. Rosie Harte)')
        if (t) {
            fs.writeFileSync('temp.json', JSON.stringify(t));
            break;
        }
        // likedMusic = likedMusic.concat(response.data.items.filter((video) => video.snippet?.categoryId === '10'));
    }
    nextPageToken = response.data.nextPageToken;
} while (nextPageToken);
// console.log('not found!');
// if the debug probe above hit, this overwrites temp.json with the titles
fs.writeFileSync('temp.json', JSON.stringify(likedMusic))
+94
View File
@@ -0,0 +1,94 @@
// APPARENTLY the youtube api just....doesn't return all likes for some reason.....
import { chromium } from 'playwright';
import fs from 'fs';
import dotenv from 'dotenv';
dotenv.config();
// Parse a string into a URL object, or return null when it is not a
// valid absolute URL. (Name kept for compatibility with callers.)
const urltostr = (input) => {
    try {
        return new URL(input);
    } catch {
        return null;
    }
}
/**
 * Scrape all liked-video URLs from the YouTube Music "Liked Music"
 * playlist (list=LM) via a persistent, headed Playwright session.
 * Writes the deduplicated URLs to liked_videos.json and returns them.
 * NOTE(review): selectors ('.title .yt-simple-endpoint', '#contents')
 * depend on YouTube Music's current DOM — verify when the site changes.
 */
async function scrapeLikedVideos() {
    // persistent context keeps cookies/login between runs in ./bdata
    const browser = await chromium.launchPersistentContext('bdata', {
        headless: false, // youtube breaks in headless
        args: ['--disable-blink-features=AutomationControlled']
    });
    const page = await browser.newPage();
    console.log("Opening YouTube...");
    await page.goto('https://music.youtube.com/', { waitUntil: 'networkidle' });
    // Step Log in or die
    if (await page.locator('[aria-label="Sign in"]').isVisible()) {
        console.log("Logging in...");
        await page.click('[aria-label="Sign in"]');
        // wait for the login redirect, then for the return to music.youtube.com
        await page.waitForNavigation({ waitUntil: 'networkidle' });
        console.log(page.url());
        await page.waitForURL('https://music.youtube.com/').catch(console.error);
        console.log("Login successful");
    } else {
        console.log("Already logged in");
    }
    // Navigate to "Liked Videos" playlist
    console.log("Navigating to Liked Videos...");
    await page.goto('https://music.youtube.com/playlist?list=LM', { waitUntil: 'domcontentloaded' });
    // Scroll to load all liked videos; stop when the page height stops growing
    console.log("Scrolling through Liked Videos...");
    const s = new Set();
    let previousHeight = 0;
    while (true) {
        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        if (currentHeight === previousHeight) break;
        previousHeight = currentHeight;
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000); // Wait for new content to load
        // sloppy and repetative to do it every time, but otherwise it won't work as the incoming videos won't all appear
        // collect hrefs on every pass; the Set dedupes across passes, and
        // '&list=LM' is stripped so URLs point at the bare video
        (await page.evaluate(() => {
            const videos = Array.from(document.querySelector('#contents').querySelectorAll('.title .yt-simple-endpoint'));
            return videos.map(video => video.href.replace('&list=LM', ''));
        })).map(u => s.add(u));
    }
    // // Scrape video data
    // console.log("Scraping liked videos...");
    // const likedVideos = await page.evaluate(() => {
    //     const videos = Array.from(document.querySelector('#contents').querySelectorAll('.title .yt-simple-endpoint'));
    //     return videos.map(video => video.href);
    // });
    // console.log(`Found ${likedVideos.length} liked videos.`);
    // console.log(likedVideos);
    // Close the browser
    await browser.close();
    // Save the results to a JSON file
    fs.writeFileSync('liked_videos.json', JSON.stringify([...s], null, 2));
    console.log("Liked videos saved to liked_videos.json");
    return [...s];
}
// Run the scraper
// Entry point: kick off the scraper and surface any failure on stderr.
scrapeLikedVideos().catch((error) => {
    console.error("Error scraping liked videos:", error);
});
+61
View File
@@ -0,0 +1,61 @@
import fs from 'fs';
import { google } from 'googleapis';
/**
 * Thin wrapper around google.auth.OAuth2 that persists tokens to a
 * JSON file on disk and keeps the client's credentials in sync.
 */
export class tokenManager {
    /**
     * @param {object} opts
     * @param {string} opts.clientId - google oauth client id
     * @param {string} opts.clientSecret - google oauth client secret
     * @param {string} opts.redirectUri - registered oauth redirect uri
     * @param {string} [opts.tokenPath='token.json'] - token storage path
     */
    constructor({
        clientId,
        clientSecret,
        redirectUri,
        tokenPath = 'token.json'
    }) {
        // store options
        this.clientId = clientId;
        this.clientSecret = clientSecret;
        this.redirectUri = redirectUri;
        this.tokenPath = tokenPath;
        // create oauth2 client
        this.oauth2Client = new google.auth.OAuth2(
            this.clientId,
            this.clientSecret,
            this.redirectUri
        );
    }
    /**
     * Load a previously saved token from disk and apply it to the client.
     * @returns {object|null} the token, or null when missing or unreadable
     */
    loadToken() {
        if (!fs.existsSync(this.tokenPath)) {
            return null;
        }
        try {
            const token = JSON.parse(fs.readFileSync(this.tokenPath, 'utf-8'));
            this.oauth2Client.setCredentials(token);
            return token;
        } catch (err) {
            // FIX: a corrupt or truncated token file used to crash the
            // process here; treat it as "no token" so callers re-auth.
            console.error(`failed to read token file ${this.tokenPath}:`, err);
            return null;
        }
    }
    /**
     * Persist a token to disk and apply it to the client.
     * @param {object} token - credentials object from the oauth flow
     */
    saveToken(token) {
        fs.writeFileSync(this.tokenPath, JSON.stringify(token, null, 2), 'utf-8');
        this.oauth2Client.setCredentials(token);
    }
    /**
     * Refresh the access token using the stored refresh token and
     * persist the renewed credentials.
     * @returns {Promise<object>} the refreshed credentials
     * @throws {Error} when no refresh token is available
     */
    async refreshAccessToken() {
        // if no refresh token is present, we can't refresh
        if (!this.oauth2Client.credentials.refresh_token) {
            throw new Error('no refresh token is available');
        }
        // use the googleapis refresh method
        const { credentials } = await this.oauth2Client.refreshAccessToken();
        // save the new token info
        this.saveToken(credentials);
        return credentials;
    }
    /** @returns the underlying OAuth2 client (shared, not a copy) */
    getAuthClient() {
        return this.oauth2Client;
    }
}
+12
View File
@@ -0,0 +1,12 @@
# YouTube Music Processing
A simple project for processing and analyzing YouTube Music data. Includes tools for metadata extraction, audio analysis, and visualization.
### Current Features:
* Fetch YouTube Music metadata.
* Fetch YouTube Music audio metadata (tempo, MFCCs, spectral features).
### Planned Features:
* Implement ML to build a personal recommendation algorithm.
* Maybe see if I can generate a new song spanning all my genres using Whisper or something similar.
+117
View File
@@ -0,0 +1,117 @@
# Single-song pipeline: download audio from YouTube, extract librosa
# features, look up metadata on MusicBrainz, and save one parquet row.
import yt_dlp
import librosa
import numpy as np
import os
import requests
import pandas as pd
import numpy as np  # NOTE(review): duplicate import — numpy is already imported above
# Constants
COOKIES_PATH = "youtube_cookies.txt" # Path to your cookies file
OUTPUT_AUDIO = "audio.wav" # Output audio file for Librosa processing
# Step 1: Download audio from YouTube
def download_audio(video_url, output_path, cookies_path):
    """Download a video's audio track and convert it to WAV via yt-dlp.

    video_url: YouTube URL to download.
    output_path: yt-dlp output template passed WITHOUT the ".wav"
        extension (the caller strips it); presumably the FFmpeg
        postprocessor appends ".wav" so the result lands at OUTPUT_AUDIO
        — TODO confirm against yt-dlp's outtmpl behavior.
    cookies_path: cookies file for authenticated/age-gated downloads.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "cookiefile": cookies_path,
        "postprocessors": [
            { # Convert audio to WAV format for Librosa
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        "outtmpl": output_path,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    print(f"Downloaded and converted audio to {output_path}")
# Step 2: Extract audio features using Librosa
def extract_audio_features(audio_path):
    """Load an audio file and summarize it as a fixed-size feature dict.

    Returns a dict with a scalar tempo (BPM) plus time-averaged MFCC,
    spectral-contrast, and chroma vectors.
    """
    def _time_mean(matrix):
        # collapse the time axis so each feature becomes a fixed-length vector
        return np.mean(matrix.T, axis=0)

    signal, rate = librosa.load(audio_path, sr=None)  # keep native sample rate
    features = {
        "tempo": librosa.feature.tempo(y=signal, sr=rate)[0],  # Tempo in BPM
        "mfcc": _time_mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)),
        "spectral_contrast": _time_mean(librosa.feature.spectral_contrast(y=signal, sr=rate)),
        "chroma_stft": _time_mean(librosa.feature.chroma_stft(y=signal, sr=rate)),
    }
    print("Extracted audio features:", features)
    return features
# Step 3: Query MusicBrainz or Discogs for metadata
def fetch_metadata(title, artist):
    """Look up a recording on MusicBrainz by title and artist.

    Returns a dict with title/artist/release_date/genres on success,
    or None when nothing matched or the request failed — callers must
    handle a None result.
    """
    # Example: Fetch metadata from MusicBrainz
    base_url = "https://musicbrainz.org/ws/2/recording/"
    params = {
        "query": f"{title} AND artist:{artist}",
        "fmt": "json",
    }
    # FIX: requests.get has no default timeout, so a stalled connection
    # would hang the pipeline forever; also handle connection errors
    # instead of crashing.
    try:
        response = requests.get(base_url, params=params, timeout=10)
    except requests.RequestException as e:
        print(f"MusicBrainz request failed: {e}")
        return None
    if response.status_code == 200:
        results = response.json().get("recordings", [])
        if results:
            metadata = {
                "title": results[0].get("title"),
                "artist": results[0]
                .get("artist-credit", [{}])[0]
                .get("artist", {})
                .get("name"),
                "release_date": results[0].get("first-release-date"),
                "genres": results[0].get("tags", []),
            }
            print("Fetched metadata from MusicBrainz:", metadata)
            return metadata
        else:
            print("No results found on MusicBrainz.")
    else:
        print(f"MusicBrainz API error: {response.status_code}")
    return None
# Main pipeline (one at a time)
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=UoCxdh7qQHE"
    # Step 1: Download audio (output template passed without ".wav")
    download_audio(video_url, OUTPUT_AUDIO.replace(".wav", ""), COOKIES_PATH)
    # Step 2: Extract audio features
    audio_features = extract_audio_features(OUTPUT_AUDIO)
    # Step 3: Fetch metadata
    youtube_title = "Turning Into Night" # Example, fetch dynamically from yt-dlp metadata if needed
    youtube_artist = "Jamie Berry"
    metadata = fetch_metadata(youtube_title, youtube_artist)
    # FIX: fetch_metadata returns None on a miss or API error, which used
    # to crash the `**metadata` unpacking below; fall back to the known
    # title/artist so the row is still written.
    if metadata is None:
        metadata = {
            "title": youtube_title,
            "artist": youtube_artist,
            "release_date": None,
            "genres": [],
        }
    # flatten metadata + per-dimension feature columns into one row
    data = {
        **metadata,
        **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
        **{
            f"spectral_contrast_{i}": val
            for i, val in enumerate(audio_features["spectral_contrast"])
        },
        **{
            f"chroma_stft_{i}": val
            for i, val in enumerate(audio_features["chroma_stft"])
        },
        "tempo": audio_features["tempo"],
    }
    # Convert to a DataFrame
    df = pd.DataFrame([data])
    # Save to Parquet
    output_file = "output.parquet"
    df.to_parquet(output_file, engine="pyarrow", index=False)
    # Clean up downloaded audio (optional)
    os.remove(OUTPUT_AUDIO)
    print("Pipeline complete.")
+146
View File
@@ -0,0 +1,146 @@
import yt_dlp
import librosa
import numpy as np
import os
import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
# Constants
COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"  # cookies for yt-dlp downloads
TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in
OUTPUT_FILE = "output.parquet"  # final feature table written at the end
ERROR_LOG_FILE = "error_log.txt"  # append-only failure log (see log_error)
MAX_WORKERS = 6  # thread-pool size for parallel song processing
# Ensure temporary directory exists
os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
# Function to log errors
def log_error(message: str):
    """Append a timestamped message line to the shared error log file."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(ERROR_LOG_FILE, "a") as log_file:
        log_file.write(f"[{stamp}] {message}\n")
def get_youtube_music_title(url: str) -> str:
    """Fetch a video's title via yt-dlp metadata extraction (no download).

    Returns "Unknown Title" (and logs the failure) if extraction raises.
    """
    try:
        ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'skip_download': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            # 'No title found' covers the case where extraction succeeds
            # but the metadata has no title field
            return info.get('title', 'No title found')
    except Exception as e:
        log_error(f"Failed to retrieve title for URL {url}: {e}")
        return "Unknown Title"
def download_audio(video_url, output_path, cookies_path):
    """Download a video's audio track and convert it to WAV via yt-dlp.

    output_path is the yt-dlp output template WITHOUT the ".wav"
    extension (the caller strips it); presumably the FFmpeg
    postprocessor adds ".wav" — TODO confirm against yt-dlp outtmpl
    behavior. Failures are logged and re-raised so the caller can skip
    the song.
    """
    try:
        ydl_opts = {
            "format": "bestaudio/best",
            "cookiefile": cookies_path,
            "postprocessors": [
                {"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
            ],
            "outtmpl": output_path,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
        print(f"Downloaded and converted audio to {output_path}")
    except Exception as e:
        log_error(f"Failed to download audio for {video_url}: {e}")
        raise
def extract_audio_features(audio_path):
    """Load an audio file and summarize it as a fixed-size feature dict.

    Returns a scalar tempo (BPM) plus time-averaged MFCC,
    spectral-contrast, and chroma vectors. Failures are logged and
    re-raised so the caller can skip the song.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        features = {
            # FIX: librosa.beat.tempo is deprecated in librosa 0.10
            # (pinned in requirements); use librosa.feature.tempo, which
            # is also what the single-song pipeline file uses.
            "tempo": librosa.feature.tempo(y=y, sr=sr)[0],
            "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
            "spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
            "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
        }
        print("Extracted audio features:", features)
        return features
    except Exception as e:
        log_error(f"Failed to extract features from {audio_path}: {e}")
        raise
def fetch_metadata(title, artist="Unknown"):
    """Look up a recording on MusicBrainz by title.

    The artist parameter is only used in the fallback result and in log
    messages (the query itself searches by title alone). Always returns
    a dict; on any failure a stub with empty release_date/genres.
    """
    try:
        base_url = "https://musicbrainz.org/ws/2/recording/"
        params = {"query": title, "fmt": "json"}
        # FIX: requests.get has no default timeout — a stalled connection
        # would hang one worker thread indefinitely.
        response = requests.get(base_url, params=params, timeout=10)
        if response.status_code == 200:
            results = response.json().get("recordings", [])
            if results:
                metadata = {
                    "title": results[0].get("title"),
                    "artist": results[0].get("artist-credit", [{}])[0].get("artist", {}).get("name"),
                    "release_date": results[0].get("first-release-date"),
                    "genres": results[0].get("tags", []),
                }
                print("Fetched metadata from MusicBrainz:", metadata)
                return metadata
            log_error(f"No results from MusicBrainz for {title} by {artist}")
        else:
            log_error(f"MusicBrainz API error {response.status_code} for {title}")
    except Exception as e:
        log_error(f"Failed to fetch metadata for {title}: {e}")
    return {"title": title, "artist": artist, "release_date": None, "genres": []}
def process_song(video_url):
    """Download one song, extract its features, and attach metadata.

    Returns a flat dict (metadata + per-dimension feature columns) ready
    for a DataFrame row, or None if any stage failed (the failure is
    appended to the error log). The temporary WAV is removed either way.
    """
    title = get_youtube_music_title(video_url)
    # NOTE(review): only spaces are sanitized — a title containing an os
    # path separator (e.g. "/") would break this path; confirm real data.
    audio_path = os.path.join(TEMP_AUDIO_DIR, f"{title.replace(' ', '_')}.wav")
    try:
        # outtmpl is passed without ".wav"; presumably the FFmpeg
        # postprocessor writes the .wav at audio_path — TODO confirm
        # yt-dlp's naming matches exactly.
        download_audio(video_url, audio_path.replace(".wav", ""), COOKIES_PATH)
        audio_features = extract_audio_features(audio_path)
        metadata = fetch_metadata(title)
        # flatten metadata + per-dimension feature columns into one row
        data = {
            **metadata,
            **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
            **{f"spectral_contrast_{i}": val for i, val in enumerate(audio_features["spectral_contrast"])},
            **{f"chroma_stft_{i}": val for i, val in enumerate(audio_features["chroma_stft"])},
            "tempo": audio_features["tempo"],
        }
        return data
    except Exception as e:
        log_error(f"Failed to process song {title} from URL {video_url}: {e}")
        return None
    finally:
        # always clean up the temp WAV, success or failure
        if os.path.exists(audio_path):
            os.remove(audio_path)
def read_urls_from_json(data_dir):
    """Collect song URLs from every .json file in data_dir.

    Each file may contain either a list of URL strings or a dict with a
    "url" key. Unreadable or malformed files are logged and skipped.
    Returns the URLs with falsy entries filtered out, in deterministic
    (filename-sorted) order.
    """
    urls = []
    # FIX: os.listdir order is platform-dependent; sort for a
    # deterministic, reproducible processing order.
    for filename in sorted(os.listdir(data_dir)):
        if filename.endswith(".json"):
            file_path = os.path.join(data_dir, filename)
            try:
                with open(file_path, "r") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    urls.extend(data)
                elif isinstance(data, dict) and "url" in data:
                    urls.append(data["url"])
            # FIX: also catch OSError — an unreadable file used to crash
            # the whole pipeline instead of being logged and skipped.
            except (json.JSONDecodeError, OSError) as e:
                log_error(f"Failed to read JSON file {file_path}: {e}")
    return [url for url in urls if url]
if __name__ == "__main__":
    try:
        # read every queued URL, then process songs in parallel
        songs = read_urls_from_json('data')
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            results = list(executor.map(process_song, songs))
        # drop songs that failed anywhere in the pipeline
        processed_data = [result for result in results if result is not None]
        df = pd.DataFrame(processed_data)
        df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
        print(f"Data saved to {OUTPUT_FILE}")
    except Exception as e:
        log_error(f"Pipeline failed: {e}")
    finally:
        # FIX: os.rmdir raises OSError if any temp file was left behind
        # (e.g. a worker died mid-download), which used to crash cleanup
        # after the results were already saved; log and continue instead.
        try:
            if os.path.exists(TEMP_AUDIO_DIR):
                os.rmdir(TEMP_AUDIO_DIR)
        except OSError as e:
            log_error(f"Could not remove {TEMP_AUDIO_DIR}: {e}")
        print("Pipeline complete.")
+3158
View File
File diff suppressed because it is too large Load Diff
+22
View File
@@ -0,0 +1,22 @@
{
"name": "music-ml",
"version": "1.0.0",
"main": "Downloader/main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"description": "",
"dependencies": {
"dotenv": "^16.4.7",
"express": "^4.21.2",
"googleapis": "^144.0.0",
"open": "^10.1.0",
"playwright": "^1.49.1",
"puppeteer": "^23.11.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"
}
}
+36
View File
@@ -0,0 +1,36 @@
audioread==3.0.1
certifi==2024.12.14
cffi==1.17.1
charset-normalizer==3.4.0
decorator==5.1.1
idna==3.10
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.43.0
msgpack==1.1.0
numba==0.60.0
numpy==2.0.2
packaging==24.2
pafy==0.5.5
pandas==2.2.3
platformdirs==4.3.6
pooch==1.8.2
pyarrow==18.1.0
pycparser==2.22
python-dateutil==2.9.0.post0
pytube==15.0.0
pytz==2024.2
requests==2.32.3
scikit-learn==1.6.0
scipy==1.14.1
six==1.17.0
soundfile==0.12.1
soxr==0.5.0.post1
threadpoolctl==3.5.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
youtube-dl==2021.12.17
yt-dlp==2024.12.13
ytmusicapi==1.9.0