Mirror of https://github.com/ION606/youtube-music-meta-extract.git (synced 2026-05-14 13:56:57 +00:00)
initial code commit
@@ -0,0 +1,9 @@
node_modules/
.venv/
secret/
data/
bdata/
temp_audio/
.env
temp.*
*.parquet
Binary file not shown (image, 20 KiB).
@@ -0,0 +1,263 @@
import express from 'express';
import { google } from 'googleapis';
import open from 'open';
import fs from 'fs';
import path from 'path';
import { tokenManager } from './tokenManager.js';

(await import('dotenv')).config({
    path: './Downloader/secret/config.env',
    debug: true
});


const app = express();
const port = 3000;


const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env,
    manager = new tokenManager({ clientId: CLIENT_ID, clientSecret: CLIENT_SECRET, redirectUri: REDIRECT_URI, tokenPath: 'Downloader/secret/token.json' });

const oauth2Client = manager.getAuthClient();


// scope to read playlist items/liked videos
const SCOPES = ['https://www.googleapis.com/auth/youtube.readonly'];

let downloadStatus = 'idle'; // can be: 'idle', 'in-progress', 'completed', 'error'


//#region oauth flow

app.get('/auth', async (_req, res) => {
    const t = manager.loadToken();
    if (t) return res.redirect('/choose-playlist');

    // generate auth url
    const authUrl = oauth2Client.generateAuthUrl({
        access_type: 'offline',
        scope: SCOPES
    });

    // automatically open the url in the default browser
    const c = await open(authUrl).catch((err) => {
        console.error('error opening browser:', err);
        return res.status(500).send('failed to open browser for oauth.');
    });

    c.on('close', () => res.redirect('/choose-playlist'));
});


app.get('/oauth2callback', async (req, res) => {
    try {
        const code = req.query.code;
        const { tokens } = await oauth2Client.getToken(code);
        oauth2Client.setCredentials(tokens);

        manager.saveToken(tokens);

        // close the window
        res.sendStatus(200);
    } catch (err) {
        console.error('error retrieving token:', err);
        res.status(500).send('error retrieving token.');
    }
});

//#endregion


//#region youtube stuffs

async function getAllPlaylists(auth) {
    const youtube = google.youtube('v3');
    let playlists = [];
    let nextPageToken = null;

    do {
        const response = await youtube.playlists.list({
            auth,
            part: 'snippet',
            mine: true,
            maxResults: 50,
            pageToken: nextPageToken
        });

        if (response.data.items) {
            playlists = playlists.concat(response.data.items);
        }

        nextPageToken = response.data.nextPageToken;
    } while (nextPageToken);

    return playlists;
}

async function getPlaylistItems(playlistId, auth) {
    const youtube = google.youtube('v3');
    let items = [];
    let nextPageToken = null;

    do {
        const response = await youtube.playlistItems.list({
            auth,
            part: 'snippet,contentDetails',
            playlistId,
            maxResults: 50,
            pageToken: nextPageToken
        });

        if (response.data.items) {
            items = items.concat(response.data.items);
        }

        nextPageToken = response.data.nextPageToken;
    } while (nextPageToken);

    // a playlistItem's own `id` is the playlist-item id, not the video id;
    // the watchable video id lives under `contentDetails.videoId`
    return items.map(o => `https://music.youtube.com/watch?v=${o.contentDetails.videoId}`);
}

//#endregion


//#region routes

app.get('/choose-playlist', async (_req, res) => {
    try {
        if (!oauth2Client.credentials || !oauth2Client.credentials.access_token) {
            const t = manager.loadToken();
            if (!t) return res.redirect('/auth');
        }

        const playlists = await getAllPlaylists(oauth2Client);

        let html = `
            <html>
            <head>
                <title>choose playlist</title>
                <style>
                    body { font-family: sans-serif; }
                    #container { margin: 20px; }
                    select, button { margin-top: 10px; }
                </style>
            </head>
            <body>
                <div id="container">
                    <h1>choose a playlist to download</h1>
                    <select id="playlistSelect">
                        ${playlists
                            .map(
                                (pl) =>
                                    `<option value="${pl.id}">${pl.snippet.title}</option>`
                            )
                            .join('')
                        }
                    </select>
                    <br/>
                    <button id="downloadBtn">download playlist</button>
                </div>

                <script>
                    // when the button is clicked, navigate to /download-playlist?playlistId=...
                    const downloadBtn = document.querySelector('#downloadBtn');
                    const select = document.querySelector('#playlistSelect');

                    downloadBtn.addEventListener('click', () => {
                        const chosenId = select.value;
                        if (!chosenId) {
                            alert('no playlist selected!');
                            return;
                        }
                        window.location.href = '/download-playlist?playlistId=' + chosenId;
                    });
                </script>
            </body>
            </html>
        `;
        res.send(html);
    } catch (err) {
        console.error('error fetching playlists:', err);
        res.status(500).send('error fetching playlists.');
    }
});

/**
 * called when the user has selected a playlist from the popup
 * fetch all items, write them to a json file, and update the status
 */
app.get('/download-playlist', async (req, res) => {
    try {
        if (!oauth2Client.credentials || !oauth2Client.credentials.access_token) {
            return res
                .status(401)
                .send('error: oauth2 client not authorized. go to /auth first.');
        }

        const { playlistId } = req.query;
        if (!playlistId) {
            return res
                .status(400)
                .send('missing playlist id. please choose a playlist.');
        }

        // set status to in-progress
        downloadStatus = 'in-progress';

        // fetch the playlist items
        const items = await getPlaylistItems(playlistId, oauth2Client);

        // create a data folder if it doesn't exist
        const dataDir = path.join(process.cwd(), 'data');
        if (!fs.existsSync(dataDir)) {
            fs.mkdirSync(dataDir);
        }

        const outFile = path.join(dataDir, `playlist_${playlistId}.json`);

        fs.writeFileSync(outFile, JSON.stringify(items, null, 2), 'utf8');

        downloadStatus = 'completed';

        res.send(`
            <html>
            <head><title>download complete</title></head>
            <body>
                <h1>download complete!</h1>
                <p>downloaded ${items.length} items to <strong>${outFile}</strong></p>
                <p><a href="/status" target="_blank">check status</a></p>
                <script>window.close()</script>
            </body>
            </html>
        `);
    } catch (err) {
        console.error('error downloading playlist:', err);
        downloadStatus = 'error';
        res.status(500).send('error downloading playlist.');
    }
});


app.get('/status', (_req, res) => {
    let html = `
        <html>
        <head>
            <title>download status</title>
        </head>
        <body>
            <h1>current status: ${downloadStatus}</h1>
        </body>
        </html>
    `;
    res.send(html);
});


//#endregion


app.listen(port, () => {
    console.log(`server listening on http://localhost:${port}`);
    console.log(`go to http://localhost:${port}/auth to start oauth flow`);
});
@@ -0,0 +1,54 @@
import { google } from "googleapis";
import fs from 'fs';
import { tokenManager } from "./tokenManager.js";

(await import('dotenv')).config({
    path: './secret/config.env',
    debug: true
});

const { CLIENT_ID, CLIENT_SECRET, REDIRECT_URI } = process.env,
    manager = new tokenManager({ clientId: CLIENT_ID, clientSecret: CLIENT_SECRET, redirectUri: REDIRECT_URI, tokenPath: 'secret/token.json' });

if (!manager.loadToken()) throw new Error('LOAD TOKEN FAILED!');

const youtube = google.youtube('v3'),
    video = await youtube.videos.list({
        auth: manager.getAuthClient(),
        part: 'snippet,contentDetails',
        myRating: 'like',
        maxResults: 1,
        // pageToken: nextPageToken
    });

const channelsinfo = (await (youtube.channels.list({ auth: manager.getAuthClient(), mine: true, part: 'snippet,contentDetails,statistics' }))).data;
fs.writeFileSync('channels.json', JSON.stringify(channelsinfo));

let likedMusic = [];
let nextPageToken = null;

// first, retrieve *all* liked videos
do {
    const response = await youtube.videos.list({
        auth: manager.getAuthClient(),
        part: 'snippet,contentDetails',
        myRating: 'like',
        maxResults: 50,
        pageToken: nextPageToken
    });

    if (response.data.items) {
        // categoryId '10' is "Music"; keep only the titles of liked music videos
        likedMusic = likedMusic.concat(response.data.items.filter(o => o.snippet?.categoryId === '10').map(o => o.snippet.title));
        // snippet.categoryId should be present under `video.snippet`
        const t = response.data.items.find(video => video.snippet.title === 'Peeping Tom (feat. Rosie Harte)');
        if (t) {
            fs.writeFileSync('temp.json', JSON.stringify(t));
            break;
        }
        // likedMusic = likedMusic.concat(response.data.items.filter((video) => video.snippet?.categoryId === '10'));
    }

    nextPageToken = response.data.nextPageToken;
} while (nextPageToken);

// console.log('not found!');
fs.writeFileSync('temp.json', JSON.stringify(likedMusic));
@@ -0,0 +1,94 @@
// APPARENTLY the YouTube API just... doesn't return all likes for some reason...

import { chromium } from 'playwright';
import fs from 'fs';
import dotenv from 'dotenv';

dotenv.config();

const urltostr = (u) => {
    try {
        return new URL(u);
    }
    catch (err) {
        return null;
    }
};

async function scrapeLikedVideos() {
    const browser = await chromium.launchPersistentContext('bdata', {
        headless: false, // youtube breaks in headless
        args: ['--disable-blink-features=AutomationControlled']
    });

    const page = await browser.newPage();

    console.log("Opening YouTube...");
    await page.goto('https://music.youtube.com/', { waitUntil: 'networkidle' });

    // Step: log in or die
    if (await page.locator('[aria-label="Sign in"]').isVisible()) {
        console.log("Logging in...");

        await page.click('[aria-label="Sign in"]');
        await page.waitForNavigation({ waitUntil: 'networkidle' });

        console.log(page.url());
        await page.waitForURL('https://music.youtube.com/').catch(console.error);

        console.log("Login successful");
    } else {
        console.log("Already logged in");
    }

    // Navigate to the "Liked Videos" playlist
    console.log("Navigating to Liked Videos...");
    await page.goto('https://music.youtube.com/playlist?list=LM', { waitUntil: 'domcontentloaded' });

    // Scroll to load all liked videos
    console.log("Scrolling through Liked Videos...");
    const s = new Set();
    let previousHeight = 0;
    while (true) {
        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        if (currentHeight === previousHeight) break;

        previousHeight = currentHeight;
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000); // wait for new content to load

        // sloppy and repetitive to collect on every pass, but otherwise the incoming videos won't all appear
        (await page.evaluate(() => {
            const videos = Array.from(document.querySelector('#contents').querySelectorAll('.title .yt-simple-endpoint'));
            return videos.map(video => video.href.replace('&list=LM', ''));
        })).map(u => s.add(u));
    }

    // // Scrape video data
    // console.log("Scraping liked videos...");
    // const likedVideos = await page.evaluate(() => {
    //     const videos = Array.from(document.querySelector('#contents').querySelectorAll('.title .yt-simple-endpoint'));
    //     return videos.map(video => video.href);
    // });

    // console.log(`Found ${likedVideos.length} liked videos.`);
    // console.log(likedVideos);

    // Close the browser
    await browser.close();

    // Save the results to a JSON file
    fs.writeFileSync('liked_videos.json', JSON.stringify([...s], null, 2));
    console.log("Liked videos saved to liked_videos.json");

    return [...s];
}

// Run the scraper
(async () => {
    try {
        await scrapeLikedVideos();
    } catch (error) {
        console.error("Error scraping liked videos:", error);
    }
})();
@@ -0,0 +1,61 @@
import fs from 'fs';
import { google } from 'googleapis';


export class tokenManager {
    constructor({
        clientId,
        clientSecret,
        redirectUri,
        tokenPath = 'token.json'
    }) {
        // store options
        this.clientId = clientId;
        this.clientSecret = clientSecret;
        this.redirectUri = redirectUri;
        this.tokenPath = tokenPath;

        // create oauth2 client
        this.oauth2Client = new google.auth.OAuth2(
            this.clientId,
            this.clientSecret,
            this.redirectUri
        );
    }

    loadToken() {
        if (!fs.existsSync(this.tokenPath)) {
            return null;
        }

        const tokenData = fs.readFileSync(this.tokenPath, 'utf-8');
        const token = JSON.parse(tokenData);
        this.oauth2Client.setCredentials(token);

        return token;
    }

    saveToken(token) {
        fs.writeFileSync(this.tokenPath, JSON.stringify(token, null, 2), 'utf-8');
        this.oauth2Client.setCredentials(token);
    }

    async refreshAccessToken() {
        // if no refresh token is present, we can't refresh
        if (!this.oauth2Client.credentials.refresh_token) {
            throw new Error('no refresh token is available');
        }

        // use the googleapis refresh method
        const { credentials } = await this.oauth2Client.refreshAccessToken();

        // save the new token info
        this.saveToken(credentials);
        return credentials;
    }

    getAuthClient() {
        return this.oauth2Client;
    }
}
@@ -0,0 +1,12 @@
# YouTube Music Processing

A simple project for processing and analyzing YouTube Music data. It includes tools for metadata extraction, audio analysis, and visualization.

### Current Features:
* Fetch YouTube Music metadata
* Extract audio features from YouTube Music tracks

### Planned Features:
* Implement ML to build a personal recommendation algorithm
* Maybe see if I can generate a new song from all my genres using Whisper or something similar
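As a quick orientation for the data this project produces, here is a minimal sketch of inspecting the feature table the Python batch pipeline writes. It assumes the default `output.parquet` path and the column layout built in `process_song` (`title`, `artist`, `tempo`, plus flattened `mfcc_*`, `spectral_contrast_*`, and `chroma_stft_*` columns); adjust the path and column names to your own run.

```python
# minimal sketch: peek at the Parquet table written by the batch pipeline
# (assumes the default "output.parquet" and the columns built in process_song)
import pandas as pd

df = pd.read_parquet("output.parquet", engine="pyarrow")

print(df.shape)                                   # one row per processed song
print(df[["title", "artist", "tempo"]].head())    # MusicBrainz metadata plus extracted tempo

# the flattened librosa features are plain numeric columns, ready for scikit-learn
feature_cols = [c for c in df.columns
                if c.startswith(("mfcc_", "spectral_contrast_", "chroma_stft_"))]
print(df[feature_cols].describe())
```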
@@ -0,0 +1,117 @@
import yt_dlp
import librosa
import numpy as np
import os
import requests
import pandas as pd

# Constants
COOKIES_PATH = "youtube_cookies.txt"  # Path to your cookies file
OUTPUT_AUDIO = "audio.wav"  # Output audio file for Librosa processing


# Step 1: Download audio from YouTube
def download_audio(video_url, output_path, cookies_path):
    ydl_opts = {
        "format": "bestaudio/best",
        "cookiefile": cookies_path,
        "postprocessors": [
            {  # Convert audio to WAV format for Librosa
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        "outtmpl": output_path,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    print(f"Downloaded and converted audio to {output_path}")


# Step 2: Extract audio features using Librosa
def extract_audio_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)  # Load audio
    features = {
        "tempo": librosa.feature.tempo(y=y, sr=sr)[0],  # Tempo in BPM
        "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),  # MFCCs
        "spectral_contrast": np.mean(
            librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0
        ),
        "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
    }
    print("Extracted audio features:", features)
    return features


# Step 3: Query MusicBrainz or Discogs for metadata
def fetch_metadata(title, artist):
    # Example: Fetch metadata from MusicBrainz
    base_url = "https://musicbrainz.org/ws/2/recording/"
    params = {
        "query": f"{title} AND artist:{artist}",
        "fmt": "json",
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        results = response.json().get("recordings", [])
        if results:
            metadata = {
                "title": results[0].get("title"),
                "artist": results[0]
                .get("artist-credit", [{}])[0]
                .get("artist", {})
                .get("name"),
                "release_date": results[0].get("first-release-date"),
                "genres": results[0].get("tags", []),
            }
            print("Fetched metadata from MusicBrainz:", metadata)
            return metadata
        else:
            print("No results found on MusicBrainz.")
    else:
        print(f"MusicBrainz API error: {response.status_code}")
    return None


# Main pipeline (one at a time)
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=UoCxdh7qQHE"

    # Step 1: Download audio
    download_audio(video_url, OUTPUT_AUDIO.replace(".wav", ""), COOKIES_PATH)

    # Step 2: Extract audio features
    audio_features = extract_audio_features(OUTPUT_AUDIO)

    # Step 3: Fetch metadata
    youtube_title = "Turning Into Night"  # Example, fetch dynamically from yt-dlp metadata if needed
    youtube_artist = "Jamie Berry"
    metadata = fetch_metadata(youtube_title, youtube_artist)

    # fall back to the known title/artist if MusicBrainz returned nothing,
    # so the dict merge below doesn't fail on None
    if metadata is None:
        metadata = {"title": youtube_title, "artist": youtube_artist, "release_date": None, "genres": []}

    data = {
        **metadata,
        **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
        **{
            f"spectral_contrast_{i}": val
            for i, val in enumerate(audio_features["spectral_contrast"])
        },
        **{
            f"chroma_stft_{i}": val
            for i, val in enumerate(audio_features["chroma_stft"])
        },
        "tempo": audio_features["tempo"],
    }

    # Convert to a DataFrame
    df = pd.DataFrame([data])

    # Save to Parquet
    output_file = "output.parquet"
    df.to_parquet(output_file, engine="pyarrow", index=False)

    # Clean up downloaded audio (optional)
    os.remove(OUTPUT_AUDIO)
    print("Pipeline complete.")
@@ -0,0 +1,146 @@
import yt_dlp
import librosa
import numpy as np
import os
import json
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

# Constants
COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"
TEMP_AUDIO_DIR = "temp_audio"  # dir to store temporary audio files in
OUTPUT_FILE = "output.parquet"
ERROR_LOG_FILE = "error_log.txt"
MAX_WORKERS = 6

# Ensure temporary directory exists
os.makedirs(TEMP_AUDIO_DIR, exist_ok=True)


# Function to log errors
def log_error(message: str):
    with open(ERROR_LOG_FILE, "a") as log_file:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_file.write(f"[{timestamp}] {message}\n")


def get_youtube_music_title(url: str) -> str:
    try:
        ydl_opts = {
            'quiet': True,
            'no_warnings': True,
            'skip_download': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            return info.get('title', 'No title found')
    except Exception as e:
        log_error(f"Failed to retrieve title for URL {url}: {e}")
        return "Unknown Title"


def download_audio(video_url, output_path, cookies_path):
    try:
        ydl_opts = {
            "format": "bestaudio/best",
            "cookiefile": cookies_path,
            "postprocessors": [
                {"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
            ],
            "outtmpl": output_path,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
        print(f"Downloaded and converted audio to {output_path}")
    except Exception as e:
        log_error(f"Failed to download audio for {video_url}: {e}")
        raise


def extract_audio_features(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        features = {
            "tempo": librosa.beat.tempo(y=y, sr=sr)[0],
            "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
            "spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
            "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
        }
        print("Extracted audio features:", features)
        return features
    except Exception as e:
        log_error(f"Failed to extract features from {audio_path}: {e}")
        raise


def fetch_metadata(title, artist="Unknown"):
    try:
        base_url = "https://musicbrainz.org/ws/2/recording/"
        params = {"query": title, "fmt": "json"}
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            results = response.json().get("recordings", [])
            if results:
                metadata = {
                    "title": results[0].get("title"),
                    "artist": results[0].get("artist-credit", [{}])[0].get("artist", {}).get("name"),
                    "release_date": results[0].get("first-release-date"),
                    "genres": results[0].get("tags", []),
                }
                print("Fetched metadata from MusicBrainz:", metadata)
                return metadata
            log_error(f"No results from MusicBrainz for {title} by {artist}")
    except Exception as e:
        log_error(f"Failed to fetch metadata for {title}: {e}")
    return {"title": title, "artist": artist, "release_date": None, "genres": []}


def process_song(video_url):
    title = get_youtube_music_title(video_url)
    audio_path = os.path.join(TEMP_AUDIO_DIR, f"{title.replace(' ', '_')}.wav")
    try:
        download_audio(video_url, audio_path.replace(".wav", ""), COOKIES_PATH)
        audio_features = extract_audio_features(audio_path)
        metadata = fetch_metadata(title)
        data = {
            **metadata,
            **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])},
            **{f"spectral_contrast_{i}": val for i, val in enumerate(audio_features["spectral_contrast"])},
            **{f"chroma_stft_{i}": val for i, val in enumerate(audio_features["chroma_stft"])},
            "tempo": audio_features["tempo"],
        }
        return data
    except Exception as e:
        log_error(f"Failed to process song {title} from URL {video_url}: {e}")
        return None
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)


def read_urls_from_json(data_dir):
    urls = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            file_path = os.path.join(data_dir, filename)
            try:
                with open(file_path, "r") as f:
                    data = json.load(f)
                    if isinstance(data, list):
                        urls.extend(data)
                    elif isinstance(data, dict) and "url" in data:
                        urls.append(data["url"])
            except json.JSONDecodeError as e:
                log_error(f"Failed to read JSON file {file_path}: {e}")
    return [url for url in urls if url]


if __name__ == "__main__":
    try:
        songs = read_urls_from_json('data')
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            results = list(executor.map(process_song, songs))
        processed_data = [result for result in results if result is not None]
        df = pd.DataFrame(processed_data)
        df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
        print(f"Data saved to {OUTPUT_FILE}")
    except Exception as e:
        log_error(f"Pipeline failed: {e}")
    finally:
        if os.path.exists(TEMP_AUDIO_DIR):
            os.rmdir(TEMP_AUDIO_DIR)
        print("Pipeline complete.")
Generated file (+3158 lines): diff suppressed because it is too large.
@@ -0,0 +1,22 @@
{
  "name": "music-ml",
  "version": "1.0.0",
  "main": "Downloader/main.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "dotenv": "^16.4.7",
    "express": "^4.21.2",
    "googleapis": "^144.0.0",
    "open": "^10.1.0",
    "playwright": "^1.49.1",
    "puppeteer": "^23.11.1",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2"
  }
}
@@ -0,0 +1,36 @@
audioread==3.0.1
certifi==2024.12.14
cffi==1.17.1
charset-normalizer==3.4.0
decorator==5.1.1
idna==3.10
joblib==1.4.2
lazy_loader==0.4
librosa==0.10.2.post1
llvmlite==0.43.0
msgpack==1.1.0
numba==0.60.0
numpy==2.0.2
packaging==24.2
pafy==0.5.5
pandas==2.2.3
platformdirs==4.3.6
pooch==1.8.2
pyarrow==18.1.0
pycparser==2.22
python-dateutil==2.9.0.post0
pytube==15.0.0
pytz==2024.2
requests==2.32.3
scikit-learn==1.6.0
scipy==1.14.1
six==1.17.0
soundfile==0.12.1
soxr==0.5.0.post1
threadpoolctl==3.5.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
youtube-dl==2021.12.17
yt-dlp==2024.12.13
ytmusicapi==1.9.0