initial code commit

2026-07-14 00:38:06 +00:00 · 2024-12-23 17:45:16 +02:00
commit 8bcd22e8c9
12 changed files with 3972 additions and 0 deletions
@@ -0,0 +1,9 @@
 node_modules/
 .venv/
 secret/
 data/
 bdata/
 temp_audio/
 .env
 temp.*
 *.parquet
@@ -0,0 +1,263 @@
 import express from 'express';
 import { google } from 'googleapis';
 import open from 'open';
 import fs from 'fs';
 import path from 'path';
 import { tokenManager } from './tokenManager.js';
 // Read the OAuth client settings from the env file before process.env is consulted below.
 const dotenvModule = await import('dotenv');
 dotenvModule.config({ path: './Downloader/secret/config.env', debug: true });
 // Local port and the Express application served on it.
 const port = 3000;
 const app = express();
 // A single token manager owns the OAuth2 client that every route shares.
 const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env;
 const manager = new tokenManager({
    clientId: CLIENT_ID,
    clientSecret: CLIENT_SECRET,
    redirectUri: REDIRECT_URI,
    tokenPath: 'Downloader/secret/token.json'
 });
 const oauth2Client = manager.getAuthClient();
 // scope to read playlist items/liked videos
 const SCOPES = ['https://www.googleapis.com/auth/youtube.readonly'];
 // can be: 'idle', 'in-progress', 'completed', 'error'
 let downloadStatus = 'idle';
 //#region oauth flow
 /**
  * GET /auth — start the OAuth consent flow.
  *
  * If a token is already stored on disk the user goes straight to the playlist
  * picker. Otherwise the Google consent URL is opened in the default browser and,
  * once the launcher process closes, the request is redirected to the picker.
  */
 app.get('/auth', async (_req, res) => {
    // a stored token means there is nothing to authorise
    if (manager.loadToken()) return res.redirect('/choose-playlist');
    // generate auth url
    const authUrl = oauth2Client.generateAuthUrl({
        access_type: 'offline',
        scope: SCOPES
    });
    // automatically open the url in the default browser
    let launcher;
    try {
        launcher = await open(authUrl);
    } catch (err) {
        console.error('error opening browser:', err);
        // stop here: the response is finished, so no 'close' redirect may be attached to it
        return res.status(500).send('failed to open browser for oauth.');
    }
    launcher.on('close', () => {
        // the client may already have gone away or been answered; never write twice
        if (!res.headersSent) res.redirect('/choose-playlist');
    });
 });
 /**
  * GET /oauth2callback — OAuth redirect target.
  *
  * Exchanges the `code` query parameter for tokens, installs them on the shared
  * OAuth2 client and persists them through the token manager.
  * Responds 400 when consent was denied or no code was supplied, 500 when the
  * exchange itself fails, and 200 on success.
  */
 app.get('/oauth2callback', async (req, res) => {
    const { code, error } = req.query;
    // the provider reports a refused consent through ?error=...; there is no code to exchange
    if (error) {
        console.error('oauth authorization was not granted:', error);
        return res.status(400).send('oauth authorization was not granted.');
    }
    if (typeof code !== 'string' || code.length === 0) {
        return res.status(400).send('missing authorization code.');
    }
    try {
        const { tokens } = await oauth2Client.getToken(code);
        oauth2Client.setCredentials(tokens);
        manager.saveToken(tokens);
        // close the window
        res.sendStatus(200);
    } catch (err) {
        console.error('error retrieving token:', err);
        res.status(500).send('error retrieving token.');
    }
 });
 //#endregion
 //#region youtube stuffs
 /**
  * Collect every playlist of the authorised account, following pagination.
  * @param auth auth client handed through to each API call.
  * @returns {Promise<Array>} playlist resources in the order the API returned them.
  */
 async function getAllPlaylists(auth) {
    const api = google.youtube('v3');
    const collected = [];
    let cursor = null;
    for (;;) {
        const { data } = await api.playlists.list({
            auth,
            part: 'snippet',
            mine: true,
            maxResults: 50,
            pageToken: cursor
        });
        if (data.items) {
            collected.push(...data.items);
        }
        cursor = data.nextPageToken;
        // no further page token means the listing is exhausted
        if (!cursor) break;
    }
    return collected;
 }
 /**
  * List every entry of a playlist and turn each into a YouTube Music watch URL.
  *
  * The URL is built from the video id (`contentDetails.videoId`, falling back to
  * `snippet.resourceId.videoId`), not from the playlist-item `id`, which identifies
  * the playlist entry rather than the video. Entries without a video id are skipped.
  *
  * @param {string} playlistId playlist to enumerate.
  * @param auth auth client handed through to each API call.
  * @returns {Promise<string[]>} watch URLs in playlist order.
  */
 async function getPlaylistItems(playlistId, auth) {
    const youtube = google.youtube('v3');
    const items = [];
    let nextPageToken = null;
    do {
        const response = await youtube.playlistItems.list({
            auth,
            part: 'snippet,contentDetails',
            playlistId,
            maxResults: 50,
            pageToken: nextPageToken
        });
        if (response.data.items) {
            items.push(...response.data.items);
        }
        nextPageToken = response.data.nextPageToken;
    } while (nextPageToken);
    return items
        .map((item) => item.contentDetails?.videoId ?? item.snippet?.resourceId?.videoId)
        .filter((videoId) => typeof videoId === 'string' && videoId.length > 0)
        .map((videoId) => `https://music.youtube.com/watch?v=${videoId}`);
 }
 //#endregion
 //#region routes
 /**
  * GET /choose-playlist — render a page with a <select> of the user's playlists.
  *
  * Redirects to /auth when the OAuth2 client has no access token and none can be
  * loaded from disk. Playlist ids and titles are HTML-escaped before being
  * embedded, and the chosen id is URL-encoded before navigation.
  */
 app.get('/choose-playlist', async (_req, res) => {
    // playlist titles are free text typed by users; escape before embedding in markup
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);
    try {
        if (!oauth2Client.credentials || !oauth2Client.credentials.access_token) {
            const t = manager.loadToken();
            if (!t) return res.redirect('/auth');
        }
        const playlists = await getAllPlaylists(oauth2Client);
        const options = playlists
            .map((pl) => `<option value="${escapeHtml(pl.id)}">${escapeHtml(pl.snippet?.title ?? pl.id)}</option>`)
            .join('');
        const html = `
    <html>
    <head>
      <title>choose playlist</title>
      <style>
        body { font-family: sans-serif; }
        #container { margin: 20px; }
        select, button { margin-top: 10px; }
      </style>
    </head>
    <body>
      <div id="container">
        <h1>choose a playlist to download</h1>
        <select id="playlistSelect">
          ${options}
        </select>
        <br/>
        <button id="downloadBtn">download playlist</button>
      </div>
      <script>
        // when the button is clicked, we'll navigate to /download-playlist?playlistId=...
        const downloadBtn = document.querySelector('#downloadBtn');
        const select = document.querySelector('#playlistSelect');
        downloadBtn.addEventListener('click', () => {
          const chosenId = select.value;
          if (!chosenId) {
            alert('no playlist selected!');
            return;
          }
          window.location.href = '/download-playlist?playlistId=' + encodeURIComponent(chosenId);
        });
      </script>
    </body>
    </html>
    `;
        res.send(html);
    } catch (err) {
        console.error('error fetching playlists:', err);
        res.status(500).send('error fetching playlists.');
    }
 });
 /**
  * GET /download-playlist?playlistId=... — called when the user has selected a
  * playlist from the popup.
  *
  * Fetches all items of the playlist, writes their URLs to
  * `<cwd>/data/playlist_<id>.json`, and updates `downloadStatus`
  * ('in-progress' -> 'completed' | 'error').
  *
  * Responds 401 when no access token is set and none can be loaded from disk,
  * 400 when the playlist id is missing or is not a plain id token (it becomes
  * part of a file name, so path separators and the like are rejected), and 500
  * when fetching or writing fails.
  */
 app.get('/download-playlist', async (req, res) => {
    try {
        const hasAccessToken = Boolean(oauth2Client.credentials?.access_token);
        // same fallback as /choose-playlist: a token saved by an earlier run is good enough
        if (!hasAccessToken && !manager.loadToken()) {
            return res
                .status(401)
                .send('error: oauth2 client not authorized. go to /auth first.');
        }
        const { playlistId } = req.query;
        if (!playlistId) {
            return res
                .status(400)
                .send('missing playlist id. please choose a playlist.');
        }
        // repeated query keys arrive as arrays, and '../' would escape the data dir
        if (typeof playlistId !== 'string' || !/^[\w-]+$/.test(playlistId)) {
            return res.status(400).send('invalid playlist id.');
        }
        // set status to in-progress
        downloadStatus = 'in-progress';
        // fetch the playlist items
        const items = await getPlaylistItems(playlistId, oauth2Client);
        // create a data folder if it doesn't exist
        const dataDir = path.join(process.cwd(), 'data');
        fs.mkdirSync(dataDir, { recursive: true });
        const outFile = path.join(dataDir, `playlist_${playlistId}.json`);
        fs.writeFileSync(outFile, JSON.stringify(items, null, 2), 'utf8');
        downloadStatus = 'completed';
        res.send(`
      <html>
      <head><title>download complete</title></head>
      <body>
        <h1>download complete!</h1>
        <p>downloaded ${items.length} items to <strong>${outFile}</strong></p>
        <p><a href="/status" target="_blank">check status</a></p>
        <script>window.close()</script>
      </body>
      </html>
    `);
    } catch (err) {
        console.error('error downloading playlist:', err);
        downloadStatus = 'error';
        res.status(500).send('error downloading playlist.');
    }
 });
 // GET /status — tiny page showing the module-level downloadStatus at request time
 app.get('/status', (_req, res) => {
    res.send(`
    <html>
    <head>
      <title>download status</title>
    </head>
    <body>
      <h1>current status: ${downloadStatus}</h1>
    </body>
    </html>
  `);
 });
 //#endregion
 // start the http server and print where the oauth flow begins
 app.listen(port, function onListening() {
    const origin = `http://localhost:${port}`;
    console.log(`server listening on ${origin}`);
    console.log(`go to ${origin}/auth to start oauth flow`);
 });
@@ -0,0 +1,54 @@
 import { google } from "googleapis";
 import fs from 'fs';
 import { tokenManager } from "./tokenManager.js";
 // Exploration script: dumps the authorised channel's info to channels.json, then
 // pages through the account's liked videos looking for one specific title.
 // temp.json ends up holding the matching video resource when it is found, otherwise
 // the titles of all liked videos whose snippet.categoryId is '10'.
 (await import('dotenv')).config({
    path: './secret/config.env',
    debug: true
 });
 const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env;
 const manager = new tokenManager({
    clientId: CLIENT_ID,
    clientSecret: CLIENT_SECRET,
    redirectUri: REDIRECT_URI,
    tokenPath: 'secret/token.json'
 });
 // throw a real Error (not a bare string) so the failure carries a stack trace
 if (!manager.loadToken()) throw new Error('LOAD TOKEN FAILED!');
 const youtube = google.youtube('v3');
 const auth = manager.getAuthClient();
 const channelsinfo = (await youtube.channels.list({
    auth,
    mine: true,
    part: 'snippet,contentDetails,statistics'
 })).data;
 fs.writeFileSync('channels.json', JSON.stringify(channelsinfo));
 // title of the liked video this script is hunting for
 const TARGET_TITLE = 'Peeping Tom (feat. Rosie Harte)';
 let likedMusic = [];
 let nextPageToken = null;
 let targetVideo = null;
 // page through the liked videos until the target shows up or the list ends
 do {
    const response = await youtube.videos.list({
        auth,
        part: 'snippet,contentDetails',
        myRating: 'like',
        maxResults: 50,
        pageToken: nextPageToken
    });
    const pageItems = response.data.items ?? [];
    // snippet.categoryId should be present under `video.snippet`
    likedMusic = likedMusic.concat(
        pageItems.filter((o) => o.snippet?.categoryId === '10').map((o) => o.snippet.title)
    );
    targetVideo = pageItems.find((o) => o.snippet?.title === TARGET_TITLE) ?? null;
    if (targetVideo) break;
    nextPageToken = response.data.nextPageToken;
 } while (nextPageToken);
 // a single write: previously the matched video was written and then immediately
 // overwritten by the title list
 fs.writeFileSync('temp.json', JSON.stringify(targetVideo ?? likedMusic));
@@ -0,0 +1,94 @@
 // APPARENTLY the youtube api just....doesn't return all likes for some reason.....
 import { chromium } from 'playwright';
 import fs from 'fs';
 import dotenv from 'dotenv';
 dotenv.config();
 /**
  * Parse `u` as an absolute URL.
  * @param {string} u candidate URL text.
  * @returns {URL|null} the parsed URL object, or null when `u` cannot be parsed.
  */
 const urltostr = (u) => {
    let parsed = null;
    try {
        parsed = new URL(u);
    } catch (err) {
        // unparsable input is reported as null rather than as an exception
        parsed = null;
    }
    return parsed;
 };
 /**
  * Scrape the "Liked Music" playlist (list=LM) of YouTube Music with a persistent
  * Playwright browser profile stored in `bdata`.
  *
  * Steps: open music.youtube.com, click "Sign in" and wait for the return to the
  * site if the button is visible, open the LM playlist, then scroll until the page
  * height stops growing, collecting track links after every scroll step.
  *
  * The browser context is closed even when a step throws. On success the unique
  * links (with `&list=LM` stripped) are written to liked_videos.json.
  *
  * @returns {Promise<string[]>} the unique track URLs in first-seen order.
  */
 async function scrapeLikedVideos() {
    const browser = await chromium.launchPersistentContext('bdata', {
        headless: false, // youtube breaks in headless
        args: ['--disable-blink-features=AutomationControlled']
    });
    const s = new Set();
    try {
        const page = await browser.newPage();
        console.log("Opening YouTube...");
        await page.goto('https://music.youtube.com/', { waitUntil: 'networkidle' });
        // Step Log in or die
        if (await page.locator('[aria-label="Sign in"]').isVisible()) {
            console.log("Logging in...");
            await page.click('[aria-label="Sign in"]');
            await page.waitForNavigation({ waitUntil: 'networkidle' });
            console.log(page.url());
            await page.waitForURL('https://music.youtube.com/').catch(console.error);
            console.log("Login successful");
        } else {
            console.log("Already logged in");
        }
        // Navigate to "Liked Videos" playlist
        console.log("Navigating to Liked Videos...");
        await page.goto('https://music.youtube.com/playlist?list=LM', { waitUntil: 'domcontentloaded' });
        // Scroll to load all liked videos
        console.log("Scrolling through Liked Videos...");
        let previousHeight = 0;
        while (true) {
            const currentHeight = await page.evaluate(() => document.body.scrollHeight);
            // the height stopped growing: nothing more was loaded by the last scroll
            if (currentHeight === previousHeight) break;
            previousHeight = currentHeight;
            await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
            await page.waitForTimeout(2000); // Wait for new content to load
            // sloppy and repetitive to do it every time, but otherwise it won't work as the incoming videos won't all appear
            const hrefs = await page.evaluate(() => {
                const container = document.querySelector('#contents');
                // the list container may not be rendered yet; treat that as "no links so far"
                if (!container) return [];
                return Array.from(
                    container.querySelectorAll('.title .yt-simple-endpoint'),
                    (link) => link.href.replace('&list=LM', '')
                );
            });
            for (const href of hrefs) s.add(href);
        }
    } finally {
        // Close the browser, also on failure, so the profile directory is not left locked
        await browser.close();
    }
    const likedVideos = [...s];
    // Save the results to a JSON file
    fs.writeFileSync('liked_videos.json', JSON.stringify(likedVideos, null, 2));
    console.log("Liked videos saved to liked_videos.json");
    return likedVideos;
 }
 // Entry point: run the scraper once and report any failure on stderr
 const runScraper = async () => {
    try {
        await scrapeLikedVideos();
    } catch (error) {
        console.error("Error scraping liked videos:", error);
    }
 };
 runScraper();
@@ -0,0 +1,61 @@
 import fs from 'fs';
 import { google } from 'googleapis';
 /**
  * Owns a Google OAuth2 client together with the JSON file its tokens are
  * persisted in.
  */
 export class tokenManager {
    /**
     * @param {object} options
     * @param {string} options.clientId OAuth client id.
     * @param {string} options.clientSecret OAuth client secret.
     * @param {string} options.redirectUri redirect URI registered for the client.
     * @param {string} [options.tokenPath='token.json'] file the tokens are read from / written to.
     */
    constructor({
        clientId,
        clientSecret,
        redirectUri,
        tokenPath = 'token.json'
    }) {
        // store options
        this.clientId = clientId;
        this.clientSecret = clientSecret;
        this.redirectUri = redirectUri;
        this.tokenPath = tokenPath;
        // create oauth2 client
        this.oauth2Client = new google.auth.OAuth2(
            this.clientId,
            this.clientSecret,
            this.redirectUri
        );
    }
    /**
     * Read the stored token and install it on the OAuth2 client.
     * @returns {object|null} the token, or null when the file is missing or does
     *   not contain a JSON object (so callers fall back to a fresh authorisation).
     * @throws when the file exists but cannot be read.
     */
    loadToken() {
        let tokenData;
        try {
            // read directly instead of exists-then-read, which can race with a delete
            tokenData = fs.readFileSync(this.tokenPath, 'utf-8');
        } catch (err) {
            if (err.code === 'ENOENT') return null;
            throw err;
        }
        let token;
        try {
            token = JSON.parse(tokenData);
        } catch (err) {
            // a truncated or hand-edited file is treated like "no token"
            console.error(`ignoring unreadable token file ${this.tokenPath}:`, err.message);
            return null;
        }
        if (token === null || typeof token !== 'object') {
            return null;
        }
        this.oauth2Client.setCredentials(token);
        return token;
    }
    /**
     * Persist `token` and install it on the OAuth2 client.
     * @param {object} token credentials object to store.
     */
    saveToken(token) {
        // the file holds secrets: create it readable by the owner only
        fs.writeFileSync(this.tokenPath, JSON.stringify(token, null, 2), {
            encoding: 'utf-8',
            mode: 0o600
        });
        this.oauth2Client.setCredentials(token);
    }
    /**
     * Refresh the access token using the stored refresh token and persist the result.
     * @returns {Promise<object>} the refreshed credentials.
     * @throws {Error} when the client holds no refresh token.
     */
    async refreshAccessToken() {
        // if no refresh token is present, we can't refresh
        if (!this.oauth2Client.credentials.refresh_token) {
            throw new Error('no refresh token is available');
        }
        // use the googleapis refresh method
        const { credentials } = await this.oauth2Client.refreshAccessToken();
        // save the new token info
        this.saveToken(credentials);
        return credentials;
    }
    /** @returns the OAuth2 client owned by this manager. */
    getAuthClient() {
        return this.oauth2Client;
    }
 }
@@ -0,0 +1,12 @@
 # YouTube Music Processing
 A simple project for processing and analyzing YouTube Music data. Includes tools for metadata extraction, audio analysis, and visualization.
 ### Current Features:
 * Fetch YouTube Music metadata.
 * Fetch YouTube Music audio metadata.
 ### Planned Features:
 * Implement ML to build a personal recommendation algorithm.
 * Possibly generate a new song from all of my genres, using Whisper or something similar.
@@ -0,0 +1,117 @@
 import yt_dlp
 import librosa
 import numpy as np
 import os
 import requests
 import pandas as pd
 import numpy as np
 # Constants
 # Cookie file handed to yt-dlp through its "cookiefile" option.
 COOKIES_PATH = "youtube_cookies.txt"  # Path to your cookies file
 # WAV file the main pipeline loads with Librosa and deletes at the end.
 OUTPUT_AUDIO = "audio.wav"  # Output audio file for Librosa processing
 # Step 1: Download audio from YouTube
 def download_audio(video_url, output_path, cookies_path):
    """Download the best available audio stream of `video_url` and convert it to WAV.

    `output_path` is used as yt-dlp's output template and `cookies_path` as its
    cookie file. Returns None.
    """
    # Convert audio to WAV format for Librosa
    wav_extractor = {"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
    options = dict(
        format="bestaudio/best",
        cookiefile=cookies_path,
        postprocessors=[wav_extractor],
        outtmpl=output_path,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([video_url])
    print(f"Downloaded and converted audio to {output_path}")
 # Step 2: Extract audio features using Librosa
 def extract_audio_features(audio_path):
    """Load `audio_path` and summarise it as a dict of audio features.

    Keys: "tempo" (first element of the tempo estimate), and "mfcc" (13
    coefficients), "spectral_contrast" and "chroma_stft", each averaged over
    the second axis of the matrix Librosa returns.
    """
    signal, sample_rate = librosa.load(audio_path, sr=None)  # Load audio

    def frame_mean(matrix):
        # collapse the per-frame axis into one mean value per row of `matrix`
        return np.mean(matrix.T, axis=0)

    features = {}
    features["tempo"] = librosa.feature.tempo(y=signal, sr=sample_rate)[0]  # Tempo in BPM
    features["mfcc"] = frame_mean(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13))
    features["spectral_contrast"] = frame_mean(
        librosa.feature.spectral_contrast(y=signal, sr=sample_rate)
    )
    features["chroma_stft"] = frame_mean(librosa.feature.chroma_stft(y=signal, sr=sample_rate))
    print("Extracted audio features:", features)
    return features
 # Step 3: Query MusicBrainz for metadata
 def fetch_metadata(title, artist):
    """Look up a recording on MusicBrainz and return the first match.

    Args:
        title: recording title to search for.
        artist: artist name, added to the query as an `artist:` term.

    Returns:
        A dict with "title", "artist", "release_date" and "genres" taken from the
        first recording in the response, or None when the request fails, the API
        answers with a non-200 status, or nothing matches.
    """
    base_url = "https://musicbrainz.org/ws/2/recording/"
    params = {
        "query": f"{title} AND artist:{artist}",
        "fmt": "json",
    }
    try:
        response = requests.get(
            base_url,
            params=params,
            # identify the application instead of sending the library default
            headers={"User-Agent": "music-ml/1.0.0"},
            # without a timeout a stalled connection blocks the pipeline forever
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f"MusicBrainz request failed: {exc}")
        return None
    if response.status_code != 200:
        print(f"MusicBrainz API error: {response.status_code}")
        return None
    results = response.json().get("recordings", [])
    if not results:
        print("No results found on MusicBrainz.")
        return None
    best = results[0]
    # "artist-credit" may be present but empty; indexing [0] on it would raise
    credits = best.get("artist-credit") or [{}]
    metadata = {
        "title": best.get("title"),
        "artist": credits[0].get("artist", {}).get("name"),
        "release_date": best.get("first-release-date"),
        "genres": best.get("tags", []),
    }
    print("Fetched metadata from MusicBrainz:", metadata)
    return metadata
 # Main pipeline (one at a time)
 if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=UoCxdh7qQHE"

    # Step 1: Download audio. yt-dlp gets the path without extension; the WAV
    # post-processor is expected to produce OUTPUT_AUDIO.
    download_audio(video_url, os.path.splitext(OUTPUT_AUDIO)[0], COOKIES_PATH)

    try:
        # Step 2: Extract audio features
        audio_features = extract_audio_features(OUTPUT_AUDIO)

        # Step 3: Fetch metadata
        youtube_title = "Turning Into Night"  # Example, fetch dynamically from yt-dlp metadata if needed
        youtube_artist = "Jamie Berry"
        metadata = fetch_metadata(youtube_title, youtube_artist)
        if metadata is None:
            # fetch_metadata returns None on API errors / no match; `**None` would raise
            metadata = {
                "title": youtube_title,
                "artist": youtube_artist,
                "release_date": None,
                "genres": [],
            }

        data = {
            **metadata,
            **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
            **{
                f"spectral_contrast_{i}": val
                for i, val in enumerate(audio_features["spectral_contrast"])
            },
            **{
                f"chroma_stft_{i}": val
                for i, val in enumerate(audio_features["chroma_stft"])
            },
            "tempo": audio_features["tempo"],
        }

        # Convert to a DataFrame
        df = pd.DataFrame([data])

        # Save to Parquet
        output_file = "output.parquet"
        df.to_parquet(output_file, engine="pyarrow", index=False)
    finally:
        # Clean up downloaded audio, also when a later step failed
        if os.path.exists(OUTPUT_AUDIO):
            os.remove(OUTPUT_AUDIO)

    print("Pipeline complete.")
@@ -0,0 +1,146 @@
 import yt_dlp
 import librosa
 import numpy as np
 import os
 import json
 import requests
 import pandas as pd
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 # Constants
 # Cookie file passed to yt-dlp ("cookiefile") when downloading audio.
 COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"
 TEMP_AUDIO_DIR = "temp_audio"  # dir to store temporary audio files in
 # Parquet file the collected per-song rows are written to.
 OUTPUT_FILE = "output.parquet"
 # Text file log_error() appends timestamped messages to.
 ERROR_LOG_FILE = "error_log.txt"
 # Size of the thread pool that processes songs concurrently.
 MAX_WORKERS = 6
 # Ensure temporary directory exists
 os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)
 # Function to log errors
 def log_error(message: str):
    """Append `message`, prefixed with a local timestamp, to ERROR_LOG_FILE.

    The file is opened as UTF-8 explicitly: messages embed song titles and URLs,
    and a non-ASCII title must not make the logger itself raise on platforms
    whose default encoding cannot represent it.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(ERROR_LOG_FILE, "a", encoding="utf-8") as log_file:
        log_file.write(f"[{timestamp}] {message}\n")
 def get_youtube_music_title(url: str) -> str:
    """Return the title yt-dlp reports for `url` without downloading anything.

    Yields 'No title found' when the extracted info has no title, and
    "Unknown Title" (after logging the error) when extraction raises.
    """
    probe_options = dict(quiet=True, no_warnings=True, skip_download=True)
    try:
        with yt_dlp.YoutubeDL(probe_options) as ydl:
            details = ydl.extract_info(url, download=False)
            return details.get('title', 'No title found')
    except Exception as e:
        log_error(f"Failed to retrieve title for URL {url}: {e}")
    return "Unknown Title"
 def download_audio(video_url, output_path, cookies_path):
    """Download the best audio stream of `video_url` and convert it to WAV.

    `output_path` is handed to yt-dlp as the output template, `cookies_path` as
    the cookie file. Any failure is logged and then re-raised.
    """
    wav_extractor = {"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
    options = {
        "format": "bestaudio/best",
        "cookiefile": cookies_path,
        "postprocessors": [wav_extractor],
        "outtmpl": output_path,
    }
    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([video_url])
        print(f"Downloaded and converted audio to {output_path}")
    except Exception as e:
        log_error(f"Failed to download audio for {video_url}: {e}")
        raise
 def extract_audio_features(audio_path):
    """Load `audio_path` and summarise it as a dict of audio features.

    Keys: "tempo" (first element of the tempo estimate), and "mfcc" (13
    coefficients), "spectral_contrast" and "chroma_stft", each averaged over
    the second axis of the matrix Librosa returns.

    Any failure is logged and then re-raised.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        features = {
            # librosa.beat.tempo is the deprecated location of this estimator;
            # librosa.feature.tempo is what the single-song pipeline already uses
            "tempo": librosa.feature.tempo(y=y, sr=sr)[0],
            "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
            "spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
            "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
        }
        print("Extracted audio features:", features)
        return features
    except Exception as e:
        log_error(f"Failed to extract features from {audio_path}: {e}")
        raise
 def fetch_metadata(title, artist="Unknown"):
    """Look up `title` on MusicBrainz and return the first matching recording.

    Args:
        title: text used as the search query.
        artist: only used for the fallback record and log messages.

    Returns:
        A dict with "title", "artist", "release_date" and "genres". When the
        request fails, the API answers with a non-200 status, or nothing matches,
        the problem is logged and a fallback dict built from the arguments is
        returned instead. Never raises.
    """
    fallback = {"title": title, "artist": artist, "release_date": None, "genres": []}
    try:
        response = requests.get(
            "https://musicbrainz.org/ws/2/recording/",
            params={"query": title, "fmt": "json"},
            # identify the application instead of sending the library default
            headers={"User-Agent": "music-ml/1.0.0"},
            # several workers call this concurrently; a stalled socket must not hang one forever
            timeout=10,
        )
        if response.status_code != 200:
            # keep HTTP failures distinguishable from genuine "no match" results
            log_error(f"MusicBrainz API error {response.status_code} for {title} by {artist}")
            return fallback
        results = response.json().get("recordings", [])
        if not results:
            log_error(f"No results from MusicBrainz for {title} by {artist}")
            return fallback
        best = results[0]
        # "artist-credit" may be present but empty; indexing [0] on it would raise
        credits = best.get("artist-credit") or [{}]
        metadata = {
            "title": best.get("title"),
            "artist": credits[0].get("artist", {}).get("name"),
            "release_date": best.get("first-release-date"),
            "genres": best.get("tags", []),
        }
        print("Fetched metadata from MusicBrainz:", metadata)
        return metadata
    except Exception as e:
        log_error(f"Failed to fetch metadata for {title}: {e}")
    return fallback
 def process_song(video_url):
    """Download one song, extract its audio features and merge them with metadata.

    The temporary audio file name is derived from a sanitised title plus a random
    suffix: raw titles can contain path separators or '%' (which yt-dlp reads as
    an output-template field), and several workers may process songs with the
    same title (for example "Unknown Title") at the same time.

    Returns:
        A flat dict (metadata fields, mfcc_i / spectral_contrast_i / chroma_stft_i
        columns and "tempo"), or None when any step fails (the error is logged).
        The temporary WAV file is removed in either case.
    """
    import re
    import uuid

    title = get_youtube_music_title(video_url)
    safe_name = re.sub(r"[^\w-]+", "_", title).strip("_")[:80] or "track"
    # path without extension for yt-dlp; the WAV post-processor appends ".wav"
    audio_base = os.path.join(TEMP_AUDIO_DIR, f"{safe_name}_{uuid.uuid4().hex[:8]}")
    audio_path = f"{audio_base}.wav"
    try:
        download_audio(video_url, audio_base, COOKIES_PATH)
        audio_features = extract_audio_features(audio_path)
        metadata = fetch_metadata(title)
        data = {
            **metadata,
            **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
            **{f"spectral_contrast_{i}": val for i, val in enumerate(audio_features["spectral_contrast"])},
            **{f"chroma_stft_{i}": val for i, val in enumerate(audio_features["chroma_stft"])},
            "tempo": audio_features["tempo"],
        }
        return data
    except Exception as e:
        log_error(f"Failed to process song {title} from URL {video_url}: {e}")
        return None
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)
 def read_urls_from_json(data_dir):
    """Gather URLs from every `.json` file directly inside `data_dir`.

    A file holding a list contributes all of its elements; a file holding a dict
    contributes its "url" value when that key exists. Files that are not valid
    JSON are logged and skipped. Falsy entries are dropped from the result.
    """
    collected = []
    json_names = (name for name in os.listdir(data_dir) if name.endswith(".json"))
    for name in json_names:
        file_path = os.path.join(data_dir, name)
        try:
            with open(file_path, "r") as handle:
                payload = json.load(handle)
        except json.JSONDecodeError as e:
            log_error(f"Failed to read JSON file {file_path}: {e}")
            continue
        if isinstance(payload, list):
            collected += payload
        elif isinstance(payload, dict) and "url" in payload:
            collected.append(payload["url"])
    return list(filter(None, collected))
 # Batch pipeline: process every URL found in data/*.json on a thread pool and
 # store the successfully processed rows in OUTPUT_FILE.
 if __name__ == "__main__":
    import shutil

    try:
        songs = read_urls_from_json('data')
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            results = list(executor.map(process_song, songs))
        # process_song returns None for songs that failed; keep only real rows
        processed_data = [result for result in results if result is not None]
        df = pd.DataFrame(processed_data)
        df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
        print(f"Data saved to {OUTPUT_FILE}")
    except Exception as e:
        log_error(f"Pipeline failed: {e}")
        print(f"Pipeline failed: {e}")
    finally:
        # os.rmdir raises when any leftover file (e.g. a partial download) remains,
        # which would surface as an unhandled error from this finally block
        shutil.rmtree(TEMP_AUDIO_DIR, ignore_errors=True)
        print("Pipeline complete.")
@@ -0,0 +1,22 @@
 {
  "name": "music-ml",
  "version": "1.0.0",
  "main": "Downloader/main.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "dotenv": "^16.4.7",
    "express": "^4.21.2",
    "googleapis": "^144.0.0",
    "open": "^10.1.0",
    "playwright": "^1.49.1",
    "puppeteer": "^23.11.1",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2"
  }
 }
@@ -0,0 +1,36 @@
 audioread==3.0.1
 certifi==2024.12.14
 cffi==1.17.1
 charset-normalizer==3.4.0
 decorator==5.1.1
 idna==3.10
 joblib==1.4.2
 lazy_loader==0.4
 librosa==0.10.2.post1
 llvmlite==0.43.0
 msgpack==1.1.0
 numba==0.60.0
 numpy==2.0.2
 packaging==24.2
 pafy==0.5.5
 pandas==2.2.3
 platformdirs==4.3.6
 pooch==1.8.2
 pyarrow==18.1.0
 pycparser==2.22
 python-dateutil==2.9.0.post0
 pytube==15.0.0
 pytz==2024.2
 requests==2.32.3
 scikit-learn==1.6.0
 scipy==1.14.1
 six==1.17.0
 soundfile==0.12.1
 soxr==0.5.0.post1
 threadpoolctl==3.5.0
 typing_extensions==4.12.2
 tzdata==2024.2
 urllib3==2.3.0
 youtube-dl==2021.12.17
 yt-dlp==2024.12.13
 ytmusicapi==1.9.0