bug fixes and status bar

This commit is contained in:
2024-12-23 18:21:10 +02:00
parent 319ce65d71
commit 33b550e7de
3 changed files with 29 additions and 18 deletions
+1
View File
@@ -7,3 +7,4 @@ temp_audio/
.env .env
temp.* temp.*
*.parquet *.parquet
err.log
-5
View File
@@ -1,5 +0,0 @@
[2024-12-23 17:50:29] No results from MusicBrainz for コインロッカーベイビー by Unknown
[2024-12-23 17:50:34] No results from MusicBrainz for いみごのたまご by Unknown
[2024-12-23 17:55:13] No results from MusicBrainz for Hello by Unknown
[2024-12-23 17:55:22] Failed to download audio for https://music.youtube.com/watch?v=SOP8opBgvAY: ERROR: 'Downloader/secret/youtube_cookies.txt' does not look like a Netscape format cookies file
[2024-12-23 17:55:22] Failed to process song Trauma Team (2020 Version) from URL https://music.youtube.com/watch?v=SOP8opBgvAY: ERROR: 'Downloader/secret/youtube_cookies.txt' does not look like a Netscape format cookies file
+28 -13
View File
@@ -1,5 +1,6 @@
import yt_dlp import yt_dlp
import librosa import librosa
from librosa.feature.rhythm import tempo
import numpy as np import numpy as np
import os import os
import json import json
@@ -7,13 +8,26 @@ import requests
import pandas as pd import pandas as pd
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
from tqdm import tqdm
class NoOpLogger:
    """Logger stand-in whose methods silently discard every message.

    Instances are handed to ``yt_dlp.YoutubeDL`` through the ``logger``
    option so that the library's own output does not reach the console.
    Each method accepts a single message and returns ``None``.
    """

    def debug(self, msg):
        """Ignore a debug-level message."""
        return None

    def warning(self, msg):
        """Ignore a warning-level message."""
        return None

    def error(self, msg):
        """Ignore an error-level message."""
        return None
# Constants # Constants
COOKIES_PATH = "Downloader/secret/youtube_cookies.txt" COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"
TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in
OUTPUT_FILE = "output.parquet" OUTPUT_FILE = "output.parquet"
ERROR_LOG_FILE = "error_log.txt" ERROR_LOG_FILE = "err.log"
MAX_WORKERS = 6 MAX_WORKERS = 10
DOWNLOAD_LONG = False # Set to True to allow downloading songs over 15 minutes DOWNLOAD_LONG = False # Set to True to allow downloading songs over 15 minutes
# Ensure temporary directory exists # Ensure temporary directory exists
@@ -25,7 +39,7 @@ def log_error(message: str):
with open(ERROR_LOG_FILE, "a") as log_file: with open(ERROR_LOG_FILE, "a") as log_file:
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_file.write(f"[{timestamp}] {message}\n") log_file.write(f"[{timestamp}] {message}\n")
def get_youtube_music_info(url: str): def get_youtube_music_info(url: str):
try: try:
@@ -33,6 +47,7 @@ def get_youtube_music_info(url: str):
'quiet': True, 'quiet': True,
'no_warnings': True, 'no_warnings': True,
'skip_download': True, 'skip_download': True,
'logger': NoOpLogger(), # Suppress all yt_dlp logs
} }
with yt_dlp.YoutubeDL(ydl_opts) as ydl: with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(url, download=False) info = ydl.extract_info(url, download=False)
@@ -43,7 +58,7 @@ def get_youtube_music_info(url: str):
except Exception as e: except Exception as e:
log_error(f"Failed to retrieve info for URL {url}: {e}") log_error(f"Failed to retrieve info for URL {url}: {e}")
return {'title': 'Unknown Title', 'duration': 0} return {'title': 'Unknown Title', 'duration': 0}
def download_audio(video_url, output_path, cookies_path): def download_audio(video_url, output_path, cookies_path):
try: try:
@@ -54,25 +69,25 @@ def download_audio(video_url, output_path, cookies_path):
{"key": "FFmpegExtractAudio", "preferredcodec": "wav"} {"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
], ],
"outtmpl": output_path, "outtmpl": output_path,
"logger": NoOpLogger(), # Suppress all yt_dlp logs
} }
with yt_dlp.YoutubeDL(ydl_opts) as ydl: with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([video_url]) ydl.download([video_url])
print(f"Downloaded and converted audio to {output_path}")
except Exception as e: except Exception as e:
log_error(f"Failed to download audio for {video_url}: {e}") log_error(f"Failed to download audio for {video_url}: {e}")
raise raise
def extract_audio_features(audio_path): def extract_audio_features(audio_path):
try: try:
y, sr = librosa.load(audio_path, sr=None) y, sr = librosa.load(audio_path, sr=None)
features = { features = {
"tempo": librosa.beat.tempo(y=y, sr=sr)[0], "tempo": tempo(y=y, sr=sr)[0],
"mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0), "mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
"spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0), "spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
"chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0), "chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
} }
print("Extracted audio features:", features)
return features return features
except Exception as e: except Exception as e:
log_error(f"Failed to extract features from {audio_path}: {e}") log_error(f"Failed to extract features from {audio_path}: {e}")
@@ -93,7 +108,6 @@ def fetch_metadata(title, artist="Unknown"):
"release_date": results[0].get("first-release-date"), "release_date": results[0].get("first-release-date"),
"genres": results[0].get("tags", []), "genres": results[0].get("tags", []),
} }
print("Fetched metadata from MusicBrainz:", metadata)
return metadata return metadata
log_error(f"No results from MusicBrainz for {title} by {artist}") log_error(f"No results from MusicBrainz for {title} by {artist}")
except Exception as e: except Exception as e:
@@ -108,7 +122,6 @@ def process_song(video_url):
# Check if the song exceeds the allowed length # Check if the song exceeds the allowed length
if not DOWNLOAD_LONG and duration > 15 * 60: if not DOWNLOAD_LONG and duration > 15 * 60:
print(f"Skipping {title} (Duration: {duration / 60:.2f} minutes) as it exceeds 15 minutes.")
log_error(f"Skipped {title} (Duration: {duration / 60:.2f} minutes) - too long.") log_error(f"Skipped {title} (Duration: {duration / 60:.2f} minutes) - too long.")
with open(ERROR_LOG_FILE, "a") as log_file: with open(ERROR_LOG_FILE, "a") as log_file:
log_file.write(f"{video_url},") log_file.write(f"{video_url},")
@@ -156,14 +169,16 @@ if __name__ == "__main__":
try: try:
songs = read_urls_from_json('data') songs = read_urls_from_json('data')
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
results = list(executor.map(process_song, songs)) with tqdm(total=len(songs), desc="Processing songs", unit="song") as pbar:
results = []
for result in executor.map(process_song, songs):
results.append(result)
pbar.update(1)
processed_data = [result for result in results if result is not None] processed_data = [result for result in results if result is not None]
df = pd.DataFrame(processed_data) df = pd.DataFrame(processed_data)
df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False) df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
print(f"Data saved to {OUTPUT_FILE}")
except Exception as e: except Exception as e:
log_error(f"Pipeline failed: {e}") log_error(f"Pipeline failed: {e}")
finally: finally:
if os.path.exists(TEMP_AUDIO_DIR): if os.path.exists(TEMP_AUDIO_DIR):
os.rmdir(TEMP_AUDIO_DIR) os.rmdir(TEMP_AUDIO_DIR)
print("Pipeline complete.")