import yt_dlp import librosa import numpy as np import os import json import requests import pandas as pd from concurrent.futures import ThreadPoolExecutor from datetime import datetime # Constants COOKIES_PATH = "Downloader/secret/youtube_cookies.txt" TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in OUTPUT_FILE = "output.parquet" ERROR_LOG_FILE = "error_log.txt" MAX_WORKERS = 6 # Ensure temporary directory exists os.makedirs(TEMP_AUDIO_DIR, exist_ok=True) # Function to log errors def log_error(message: str): with open(ERROR_LOG_FILE, "a") as log_file: timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_file.write(f"[{timestamp}] {message}\n") def get_youtube_music_title(url: str) -> str: try: ydl_opts = { 'quiet': True, 'no_warnings': True, 'skip_download': True, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, download=False) return info.get('title', 'No title found') except Exception as e: log_error(f"Failed to retrieve title for URL {url}: {e}") return "Unknown Title" def download_audio(video_url, output_path, cookies_path): try: ydl_opts = { "format": "bestaudio/best", "cookiefile": cookies_path, "postprocessors": [ {"key": "FFmpegExtractAudio", "preferredcodec": "wav"} ], "outtmpl": output_path, } with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([video_url]) print(f"Downloaded and converted audio to {output_path}") except Exception as e: log_error(f"Failed to download audio for {video_url}: {e}") raise def extract_audio_features(audio_path): try: y, sr = librosa.load(audio_path, sr=None) features = { "tempo": librosa.beat.tempo(y=y, sr=sr)[0], "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0), "spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0), "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0), } print("Extracted audio features:", features) return features except Exception as e: log_error(f"Failed to extract features from {audio_path}: {e}") raise def fetch_metadata(title, artist="Unknown"): try: base_url = "https://musicbrainz.org/ws/2/recording/" params = {"query": title, "fmt": "json"} response = requests.get(base_url, params=params) if response.status_code == 200: results = response.json().get("recordings", []) if results: metadata = { "title": results[0].get("title"), "artist": results[0].get("artist-credit", [{}])[0].get("artist", {}).get("name"), "release_date": results[0].get("first-release-date"), "genres": results[0].get("tags", []), } print("Fetched metadata from MusicBrainz:", metadata) return metadata log_error(f"No results from MusicBrainz for {title} by {artist}") except Exception as e: log_error(f"Failed to fetch metadata for {title}: {e}") return {"title": title, "artist": artist, "release_date": None, "genres": []} def process_song(video_url): title = get_youtube_music_title(video_url) audio_path = os.path.join(TEMP_AUDIO_DIR, f"{title.replace(' ', '_')}.wav") try: download_audio(video_url, audio_path.replace(".wav", ""), COOKIES_PATH) audio_features = extract_audio_features(audio_path) metadata = fetch_metadata(title) data = { **metadata, **{f"mfcc_{i}": val for i, val in enumerate(audio_features["mfcc"])}, **{f"spectral_contrast_{i}": val for i, val in enumerate(audio_features["spectral_contrast"])}, **{f"chroma_stft_{i}": val for i, val in enumerate(audio_features["chroma_stft"])}, "tempo": audio_features["tempo"], } return data except Exception as e: log_error(f"Failed to process song {title} from URL {video_url}: {e}") return None finally: if os.path.exists(audio_path): os.remove(audio_path) def read_urls_from_json(data_dir): urls = [] for filename in os.listdir(data_dir): if filename.endswith(".json"): file_path = os.path.join(data_dir, filename) try: with open(file_path, "r") as f: data = json.load(f) if isinstance(data, list): urls.extend(data) elif isinstance(data, dict) and "url" in data: urls.append(data["url"]) except json.JSONDecodeError as e: log_error(f"Failed to read JSON file {file_path}: {e}") return [url for url in urls if url] if __name__ == "__main__": try: songs = read_urls_from_json('data') with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: results = list(executor.map(process_song, songs)) processed_data = [result for result in results if result is not None] df = pd.DataFrame(processed_data) df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False) print(f"Data saved to {OUTPUT_FILE}") except Exception as e: log_error(f"Pipeline failed: {e}") finally: if os.path.exists(TEMP_AUDIO_DIR): os.rmdir(TEMP_AUDIO_DIR) print("Pipeline complete.")