mirror of
https://github.com/ION606/youtube-music-meta-extract.git
synced 2026-05-14 22:06:56 +00:00
bug fixes and status bar
This commit is contained in:
@@ -7,3 +7,4 @@ temp_audio/
|
|||||||
.env
|
.env
|
||||||
temp.*
|
temp.*
|
||||||
*.parquet
|
*.parquet
|
||||||
|
err.log
|
||||||
|
|||||||
@@ -1,5 +0,0 @@
|
|||||||
[2024-12-23 17:50:29] No results from MusicBrainz for コインロッカーベイビー by Unknown
|
|
||||||
[2024-12-23 17:50:34] No results from MusicBrainz for いみごのたまご by Unknown
|
|
||||||
[2024-12-23 17:55:13] No results from MusicBrainz for Hello by Unknown
|
|
||||||
[2024-12-23 17:55:22] Failed to download audio for https://music.youtube.com/watch?v=SOP8opBgvAY: ERROR: 'Downloader/secret/youtube_cookies.txt' does not look like a Netscape format cookies file
|
|
||||||
[2024-12-23 17:55:22] Failed to process song Trauma Team (2020 Version) from URL https://music.youtube.com/watch?v=SOP8opBgvAY: ERROR: 'Downloader/secret/youtube_cookies.txt' does not look like a Netscape format cookies file
|
|
||||||
+28
-13
@@ -1,5 +1,6 @@
|
|||||||
import yt_dlp
|
import yt_dlp
|
||||||
import librosa
|
import librosa
|
||||||
|
from librosa.feature.rhythm import tempo
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
@@ -7,13 +8,26 @@ import requests
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
class NoOpLogger:
    """Logger stand-in whose methods accept a message and discard it.

    An instance is passed as the ``logger`` entry of the yt_dlp option
    dictionaries in this file so that yt_dlp log output is suppressed.
    Every method takes the message text and returns ``None`` without
    writing anything.
    """

    def debug(self, msg):
        """Discard a debug-level message."""
        pass

    def warning(self, msg):
        """Discard a warning-level message."""
        pass

    def error(self, msg):
        """Discard an error-level message."""
        pass
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"
|
COOKIES_PATH = "Downloader/secret/youtube_cookies.txt"
|
||||||
TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in
|
TEMP_AUDIO_DIR = "temp_audio" # dir to store temporary audio files in
|
||||||
OUTPUT_FILE = "output.parquet"
|
OUTPUT_FILE = "output.parquet"
|
||||||
ERROR_LOG_FILE = "error_log.txt"
|
ERROR_LOG_FILE = "err.log"
|
||||||
MAX_WORKERS = 6
|
MAX_WORKERS = 10
|
||||||
DOWNLOAD_LONG = False # Set to True to allow downloading songs over 15 minutes
|
DOWNLOAD_LONG = False # Set to True to allow downloading songs over 15 minutes
|
||||||
|
|
||||||
# Ensure temporary directory exists
|
# Ensure temporary directory exists
|
||||||
@@ -25,7 +39,7 @@ def log_error(message: str):
|
|||||||
with open(ERROR_LOG_FILE, "a") as log_file:
|
with open(ERROR_LOG_FILE, "a") as log_file:
|
||||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
log_file.write(f"[{timestamp}] {message}\n")
|
log_file.write(f"[{timestamp}] {message}\n")
|
||||||
|
|
||||||
|
|
||||||
def get_youtube_music_info(url: str):
|
def get_youtube_music_info(url: str):
|
||||||
try:
|
try:
|
||||||
@@ -33,6 +47,7 @@ def get_youtube_music_info(url: str):
|
|||||||
'quiet': True,
|
'quiet': True,
|
||||||
'no_warnings': True,
|
'no_warnings': True,
|
||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
|
'logger': NoOpLogger(), # Suppress all yt_dlp logs
|
||||||
}
|
}
|
||||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||||
info = ydl.extract_info(url, download=False)
|
info = ydl.extract_info(url, download=False)
|
||||||
@@ -43,7 +58,7 @@ def get_youtube_music_info(url: str):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
log_error(f"Failed to retrieve info for URL {url}: {e}")
|
log_error(f"Failed to retrieve info for URL {url}: {e}")
|
||||||
return {'title': 'Unknown Title', 'duration': 0}
|
return {'title': 'Unknown Title', 'duration': 0}
|
||||||
|
|
||||||
|
|
||||||
def download_audio(video_url, output_path, cookies_path):
|
def download_audio(video_url, output_path, cookies_path):
|
||||||
try:
|
try:
|
||||||
@@ -54,25 +69,25 @@ def download_audio(video_url, output_path, cookies_path):
|
|||||||
{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
|
{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}
|
||||||
],
|
],
|
||||||
"outtmpl": output_path,
|
"outtmpl": output_path,
|
||||||
|
"logger": NoOpLogger(), # Suppress all yt_dlp logs
|
||||||
}
|
}
|
||||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
||||||
ydl.download([video_url])
|
ydl.download([video_url])
|
||||||
print(f"Downloaded and converted audio to {output_path}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log_error(f"Failed to download audio for {video_url}: {e}")
|
log_error(f"Failed to download audio for {video_url}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_audio_features(audio_path):
|
def extract_audio_features(audio_path):
|
||||||
try:
|
try:
|
||||||
y, sr = librosa.load(audio_path, sr=None)
|
y, sr = librosa.load(audio_path, sr=None)
|
||||||
features = {
|
features = {
|
||||||
"tempo": librosa.beat.tempo(y=y, sr=sr)[0],
|
"tempo": tempo(y=y, sr=sr)[0],
|
||||||
"mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
|
"mfcc": np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0),
|
||||||
"spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
|
"spectral_contrast": np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0),
|
||||||
"chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
|
"chroma_stft": np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
|
||||||
}
|
}
|
||||||
print("Extracted audio features:", features)
|
|
||||||
return features
|
return features
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log_error(f"Failed to extract features from {audio_path}: {e}")
|
log_error(f"Failed to extract features from {audio_path}: {e}")
|
||||||
@@ -93,7 +108,6 @@ def fetch_metadata(title, artist="Unknown"):
|
|||||||
"release_date": results[0].get("first-release-date"),
|
"release_date": results[0].get("first-release-date"),
|
||||||
"genres": results[0].get("tags", []),
|
"genres": results[0].get("tags", []),
|
||||||
}
|
}
|
||||||
print("Fetched metadata from MusicBrainz:", metadata)
|
|
||||||
return metadata
|
return metadata
|
||||||
log_error(f"No results from MusicBrainz for {title} by {artist}")
|
log_error(f"No results from MusicBrainz for {title} by {artist}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -108,7 +122,6 @@ def process_song(video_url):
|
|||||||
|
|
||||||
# Check if the song exceeds the allowed length
|
# Check if the song exceeds the allowed length
|
||||||
if not DOWNLOAD_LONG and duration > 15 * 60:
|
if not DOWNLOAD_LONG and duration > 15 * 60:
|
||||||
print(f"Skipping {title} (Duration: {duration / 60:.2f} minutes) as it exceeds 15 minutes.")
|
|
||||||
log_error(f"Skipped {title} (Duration: {duration / 60:.2f} minutes) - too long.")
|
log_error(f"Skipped {title} (Duration: {duration / 60:.2f} minutes) - too long.")
|
||||||
with open(ERROR_LOG_FILE, "a") as log_file:
|
with open(ERROR_LOG_FILE, "a") as log_file:
|
||||||
log_file.write(f"{video_url},")
|
log_file.write(f"{video_url},")
|
||||||
@@ -156,14 +169,16 @@ if __name__ == "__main__":
|
|||||||
try:
|
try:
|
||||||
songs = read_urls_from_json('data')
|
songs = read_urls_from_json('data')
|
||||||
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
||||||
results = list(executor.map(process_song, songs))
|
with tqdm(total=len(songs), desc="Processing songs", unit="song") as pbar:
|
||||||
|
results = []
|
||||||
|
for result in executor.map(process_song, songs):
|
||||||
|
results.append(result)
|
||||||
|
pbar.update(1)
|
||||||
processed_data = [result for result in results if result is not None]
|
processed_data = [result for result in results if result is not None]
|
||||||
df = pd.DataFrame(processed_data)
|
df = pd.DataFrame(processed_data)
|
||||||
df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
|
df.to_parquet(OUTPUT_FILE, engine="pyarrow", index=False)
|
||||||
print(f"Data saved to {OUTPUT_FILE}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log_error(f"Pipeline failed: {e}")
|
log_error(f"Pipeline failed: {e}")
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(TEMP_AUDIO_DIR):
|
if os.path.exists(TEMP_AUDIO_DIR):
|
||||||
os.rmdir(TEMP_AUDIO_DIR)
|
os.rmdir(TEMP_AUDIO_DIR)
|
||||||
print("Pipeline complete.")
|
|
||||||
|
|||||||
Reference in New Issue
Block a user