"""Download media files using yt-dlp with support for direct file downloads.

Lean, focused downloader without event infrastructure overhead.

- yt-dlp integration for streaming sites
- Direct file download fallback for PDFs, images, documents
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via helper.logger.log()
"""
from __future__ import annotations

import re  # noqa: F401
import sys
import time
import traceback
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import parse_qs, unquote, urljoin, urlparse

import httpx

from helper.logger import log, debug
from .utils import ensure_directory, sha256_file
from .http_client import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar

try:
    import yt_dlp  # type: ignore
    from yt_dlp.extractor import gen_extractors  # type: ignore
except Exception as exc:
    yt_dlp = None  # type: ignore
    YTDLP_IMPORT_ERROR = exc
else:
    YTDLP_IMPORT_ERROR = None

try:
    from metadata import extract_ytdlp_tags
except ImportError:
    extract_ytdlp_tags = None

# Lazily populated list of yt-dlp extractor instances (see is_url_supported_by_ytdlp).
_EXTRACTOR_CACHE: List[Any] | None = None

# Keys under which a SHA-256 digest may appear in yt-dlp metadata.
_HASH_KEYS = ("sha256", "sha-256", "sha_256")
# Keys under which yt-dlp metadata may carry the downloaded file's path.
_PATH_KEYS = ("filepath", "_filename", "filename")
# Name used when no usable filename can be derived for a direct download.
_FALLBACK_FILENAME = "downloaded_file.bin"
# Link suffixes treated as direct book downloads on LibGen pages.
_LIBGEN_FILE_SUFFIXES = ('.pdf', '.epub', '.djvu', '.mobi')

# RFC 5987 extended parameter: filename*=charset'lang'percent-encoded-value
_CD_FILENAME_EXT_RE = re.compile(r'filename\*\s*=\s*([^;]+)', re.IGNORECASE)
# Plain parameter: filename="name.pdf" or filename=name.pdf
_CD_FILENAME_RE = re.compile(r'filename\s*=\s*(?:"([^"]*)"|([^;\s]*))', re.IGNORECASE)


def _ensure_yt_dlp_ready() -> None:
    """Verify yt-dlp is available.

    Raises:
        DownloadError: If the yt-dlp import at module load failed.
    """
    if yt_dlp is not None:
        return
    detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
    raise DownloadError(f"yt-dlp module not available: {detail}")


def _progress_callback(status: Dict[str, Any]) -> None:
    """Progress hook for yt-dlp that forwards status updates to ``debug``."""
    event = status.get("status")
    if event == "downloading":
        percent = status.get("_percent_str", "?")
        speed = status.get("_speed_str", "?")
        debug(f"Downloading {percent} at {speed}")
    elif event == "finished":
        debug(f"✓ Download finished: {status.get('filename')}")
    elif event in ("postprocessing", "processing"):
        debug(f"Post-processing: {status.get('postprocessor')}")


def is_url_supported_by_ytdlp(url: str) -> bool:
    """Check whether a non-generic yt-dlp extractor claims the URL.

    Args:
        url: URL to test.

    Returns:
        True if any extractor other than the "generic" one reports the URL as
        suitable; False otherwise, including when yt-dlp is not importable.
    """
    if yt_dlp is None:
        return False

    global _EXTRACTOR_CACHE
    if _EXTRACTOR_CACHE is None:
        try:
            _EXTRACTOR_CACHE = list(gen_extractors())  # type: ignore[arg-type]
        except Exception:
            # Cache the failure as "no extractors" so it is not retried per call.
            _EXTRACTOR_CACHE = []

    for extractor in _EXTRACTOR_CACHE:
        try:
            if not extractor.suitable(url):
                continue
        except Exception:
            continue
        # The generic extractor accepts almost anything, so it does not count.
        name = getattr(extractor, "IE_NAME", "")
        if str(name).lower() == "generic":
            continue
        return True
    return False


def list_formats(
    url: str,
    no_playlist: bool = False,
    playlist_items: Optional[str] = None,
) -> Optional[List[Dict[str, Any]]]:
    """Get list of available formats for a URL using yt-dlp.

    Args:
        url: URL to get formats for
        no_playlist: If True, ignore playlists and list formats for single video
        playlist_items: If specified, only list formats for these playlist
            items (e.g., "1,3,5-8")

    Returns:
        List of format dictionaries with keys: format_id, format, ext,
        resolution, width, height, fps, vcodec, acodec, filesize, tbr.
        Returns None if no formats are reported or format listing fails.

    Raises:
        DownloadError: If yt-dlp is not available.
    """
    _ensure_yt_dlp_ready()

    try:
        ydl_opts: Dict[str, Any] = {
            "quiet": False,
            "no_warnings": False,
            "socket_timeout": 30,
        }
        if no_playlist:
            ydl_opts["noplaylist"] = True
        if playlist_items:
            ydl_opts["playlist_items"] = playlist_items

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            debug(f"Fetching format list for: {url}")
            info = ydl.extract_info(url, download=False)

        formats = info.get("formats", []) if isinstance(info, dict) else []
        if not formats:
            log("No formats available", file=sys.stderr)
            return None

        # Keep only the fields callers need from each raw format entry.
        result_formats = [
            {
                "format_id": fmt.get("format_id", ""),
                "format": fmt.get("format", ""),
                "ext": fmt.get("ext", ""),
                "resolution": fmt.get("resolution", ""),
                "width": fmt.get("width"),
                "height": fmt.get("height"),
                "fps": fmt.get("fps"),
                "vcodec": fmt.get("vcodec", "none"),
                "acodec": fmt.get("acodec", "none"),
                "filesize": fmt.get("filesize"),
                "tbr": fmt.get("tbr"),  # Total bitrate
            }
            for fmt in formats
        ]
        debug(f"Found {len(result_formats)} available formats")
        return result_formats
    except Exception as e:
        log(f"✗ Error fetching formats: {e}", file=sys.stderr)
        return None


def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
    """Build the yt-dlp options dict for a download request.

    Creates ``opts.output_dir`` via ``ensure_directory`` as a side effect.
    """
    ensure_directory(opts.output_dir)

    outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
    base_options: Dict[str, Any] = {
        "outtmpl": outtmpl,
        "quiet": False,
        "no_warnings": False,
        "noprogress": False,
        "socket_timeout": 30,
        "retries": 10,
        "fragment_retries": 10,
        "http_chunk_size": 10_485_760,
        "restrictfilenames": True,
        "progress_hooks": [_progress_callback],
    }

    if opts.cookies_path and opts.cookies_path.is_file():
        base_options["cookiefile"] = str(opts.cookies_path)

    # Add no-playlist option if specified (for single video from playlist URLs)
    if opts.no_playlist:
        base_options["noplaylist"] = True

    # Configure based on mode
    if opts.mode == "audio":
        base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
        base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
    else:  # video
        base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
        base_options["format_sort"] = [
            "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
        ]

    # Add clip sections if provided
    if opts.clip_sections:
        base_options["download_sections"] = opts.clip_sections

    # Add playlist items selection if provided
    if opts.playlist_items:
        base_options["playlist_items"] = opts.playlist_items

    debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
    return base_options


def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
    """Iterate breadth-first over ``info`` and its nested playlist entries.

    Yields every dict that either has "requested_downloads" or has no
    (non-empty) "entries" of its own. Each dict object is visited once.
    """
    queue = deque([info])
    seen: set[int] = set()
    while queue:
        current = queue.popleft()
        obj_id = id(current)
        if obj_id in seen:
            continue
        seen.add(obj_id)

        entries = current.get("entries")
        if isinstance(entries, list):
            queue.extend(entry for entry in entries if isinstance(entry, dict))

        if current.get("requested_downloads") or not entries:
            yield current


def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
    """Yield candidate file paths for downloaded media, most specific first."""
    requested = entry.get("requested_downloads")
    if isinstance(requested, list):
        for item in requested:
            if not isinstance(item, dict):
                continue
            for key in _PATH_KEYS:
                value = item.get(key)
                if value:
                    yield Path(value)

    for key in _PATH_KEYS:
        value = entry.get(key)
        if value:
            yield Path(value)

    if entry.get("filename"):
        yield output_dir / entry["filename"]


def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
    """Find the first entry in yt-dlp metadata whose reported file exists.

    Raises:
        FileNotFoundError: If no candidate path points at an existing file.
    """
    for entry in _iter_download_entries(info):
        for candidate in _candidate_paths(entry, output_dir):
            if candidate.is_file():
                return entry, candidate
            if not candidate.is_absolute():
                resolved = output_dir / candidate
                if resolved.is_file():
                    return entry, resolved
    raise FileNotFoundError("yt-dlp did not report a downloaded media file")


def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
    """Extract a SHA256 hash from yt-dlp metadata, if one is present.

    Looks at ``info`` itself and then at each item of ``info["entries"]``;
    within each, the "hashes" sub-dict is checked before top-level keys.

    Returns:
        The lower-cased, stripped digest string, or None if none is found.
    """
    entries = info.get("entries")
    # "entries" may be missing, None or a non-list iterable; only a real list
    # is safe to walk here without consuming or crashing on it.
    payloads: List[Any] = [info]
    if isinstance(entries, list):
        payloads.extend(entries)

    for payload in payloads:
        if not isinstance(payload, dict):
            continue
        hashes = payload.get("hashes")
        sources = [hashes, payload] if isinstance(hashes, dict) else [payload]
        for source in sources:
            for key in _HASH_KEYS:
                value = source.get(key)
                if isinstance(value, str) and value.strip():
                    return value.strip().lower()
    return None


def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
    """Extract the actual download link from a LibGen redirect URL.

    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to actual
    mirror URLs. This follows the redirect chain to get the real file.

    Args:
        libgen_url: LibGen file.php URL

    Returns:
        Actual download URL, or None if the URL is not a LibGen file.php URL
        or extraction fails.
    """
    try:
        import requests

        # Check if this is a LibGen URL
        parsed = urlparse(libgen_url)
        if 'libgen' not in parsed.netloc.lower():
            return None
        if '/file.php' not in parsed.path.lower():
            return None

        # The session is closed on exit so its pooled connections are released.
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            })
            debug(f"Following LibGen redirect chain for: {libgen_url}")

            try:
                response = session.get(libgen_url, timeout=10, allow_redirects=True)
                final_url = response.url

                # Try to find an actual download link in the page
                try:
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.content, 'html.parser')
                    for link in soup.find_all('a'):
                        href = link.get('href')
                        if not href or not isinstance(href, str):
                            continue
                        # Look for direct file links or get.php redirects
                        lowered = href.lower()
                        if 'get.php' in lowered or lowered.endswith(_LIBGEN_FILE_SUFFIXES):
                            download_url = href if href.startswith('http') else urljoin(final_url, href)
                            debug(f"Found download link: {download_url}")
                            return download_url
                except ImportError:
                    pass  # BeautifulSoup not available

                # If we followed redirects successfully, return the final URL.
                # This handles cases where libgen redirects to a direct mirror.
                if final_url != libgen_url:
                    debug(f"LibGen resolved to mirror: {final_url}")
                    return final_url
            except requests.RequestException as e:
                log(f"Error following LibGen redirects: {e}", file=sys.stderr)
                # Try a HEAD request as fallback
                try:
                    response = session.head(libgen_url, allow_redirects=True, timeout=10)
                    if response.url != libgen_url:
                        debug(f"LibGen HEAD resolved to: {response.url}")
                        return response.url
                except requests.RequestException as head_exc:
                    debug(f"LibGen HEAD fallback failed: {head_exc}")

        return None
    except Exception as e:
        # Best-effort resolver: any failure (including a missing `requests`)
        # is reported and treated as "could not resolve".
        log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
        return None


def _filename_from_content_disposition(header: str) -> Optional[str]:
    """Extract a filename from a Content-Disposition header value.

    The RFC 5987 extended form (``filename*=charset'lang'value``) is preferred
    over the plain ``filename=`` form when both are present.

    Returns:
        The decoded filename, or None if the header carries none.
    """
    extended = _CD_FILENAME_EXT_RE.search(header)
    if extended:
        raw = extended.group(1).strip().strip('"')
        charset, first_quote, remainder = raw.partition("'")
        if first_quote and "'" in remainder:
            encoded = remainder.partition("'")[2]
            try:
                decoded = unquote(encoded, encoding=charset or "utf-8", errors="replace")
            except LookupError:
                # Unknown charset label: fall back to UTF-8 decoding.
                decoded = unquote(encoded)
        else:
            decoded = unquote(raw)
        if decoded:
            return decoded

    plain = _CD_FILENAME_RE.search(header)
    if plain:
        name = plain.group(1) or plain.group(2)
        if name:
            return unquote(name)
    return None


def _safe_filename(name: Optional[str]) -> str:
    """Reduce an untrusted filename to a single path component.

    Directory parts (with either separator) and NUL bytes are dropped so the
    result can never escape the directory it is joined to.

    Returns:
        The bare filename, or "" if nothing usable remains.
    """
    if not name:
        return ""
    base = name.replace("\\", "/").rsplit("/", 1)[-1].replace("\x00", "")
    if base.strip() in ("", ".", ".."):
        return ""
    return base


def _extract_tags(info: Dict[str, Any], error_prefix: str = "Error extracting tags") -> List[Any]:
    """Run ``extract_ytdlp_tags`` on ``info``, logging instead of raising.

    Returns:
        The extracted tags, or an empty list if the extractor is unavailable,
        fails, or returns nothing.
    """
    if not extract_ytdlp_tags:
        return []
    try:
        return extract_ytdlp_tags(info) or []
    except Exception as e:
        log(f"{error_prefix}: {e}", file=sys.stderr)
        return []


def _download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) without yt-dlp.

    The target filename is taken, in increasing priority, from the URL path,
    from a filename-like query parameter, and from the Content-Disposition
    header of a HEAD request. It is then reduced to a bare filename so that
    remote input cannot place the file outside ``output_dir``.

    Args:
        url: URL of the file to fetch.
        output_dir: Directory to save into (created if needed).
        debug_logger: Optional debug logger for troubleshooting records.

    Returns:
        DownloadMediaResult with path, minimal info dict, tags and hash.

    Raises:
        DownloadError: If the download or post-processing fails.
    """
    ensure_directory(output_dir)

    parsed_url = urlparse(url)
    url_path = parsed_url.path

    # Try to get filename from query parameters first (for LibGen and similar
    # services), e.g. ?filename=Book+Title.pdf or &download=filename.pdf.
    # parse_qs already percent-decodes values, so no further unquoting is done.
    filename = ""
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ('filename', 'download', 'file', 'name'):
            values = query_params.get(param_name)
            if values:
                filename = values[0]
                break

    # If not found in query params, extract from URL path
    if not filename.strip():
        filename = unquote(url_path.split("/")[-1]) if url_path else ""

    # Remove query strings from filename if any
    if "?" in filename:
        filename = filename.split("?")[0]

    # Try to get real filename from Content-Disposition header (HEAD request)
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            if content_disposition:
                header_name = _filename_from_content_disposition(content_disposition)
                if header_name:
                    filename = header_name
                    debug(f"Filename from Content-Disposition: {filename}")
    except Exception as e:
        log(f"Could not get filename from headers: {e}", file=sys.stderr)

    # The name comes from the remote side: strip any directory components
    # before joining it to output_dir.
    filename = _safe_filename(filename)

    # Fallback if we still don't have a good filename
    if not filename or "." not in filename:
        filename = _FALLBACK_FILENAME

    file_path = output_dir / filename
    progress_bar = ProgressBar()
    debug(f"Direct download: {filename}")

    try:
        start_time = time.time()
        downloaded_bytes = 0
        last_progress_time = start_time

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            """Record progress and emit a progress line at most every 0.5s."""
            nonlocal downloaded_bytes, last_progress_time
            downloaded_bytes = bytes_downloaded

            now = time.time()
            # Without a known total size there is nothing meaningful to show.
            if now - last_progress_time < 0.5 or not content_length or content_length <= 0:
                return

            elapsed = now - start_time
            percent = (bytes_downloaded / content_length) * 100
            speed = bytes_downloaded / elapsed if elapsed > 0 else 0
            eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0

            speed_str = progress_bar.format_bytes(speed) + "/s"
            minutes, seconds = divmod(int(eta_seconds), 60)
            hours, minutes = divmod(minutes, 60)
            eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

            progress_line = progress_bar.format_progress(
                percent_str=f"{percent:.1f}%",
                downloaded=bytes_downloaded,
                total=content_length,
                speed_str=speed_str,
                eta_str=eta_str,
            )
            debug(progress_line)
            last_progress_time = now

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)

        elapsed = time.time() - start_time
        avg_speed_str = progress_bar.format_bytes(downloaded_bytes / elapsed if elapsed > 0 else 0) + "/s"
        debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")

        # For direct file downloads, create minimal info dict without filename
        # as title. This prevents creating duplicate title: tags when filename
        # gets auto-generated; the title is added back below only if needed.
        stem, _, extension = filename.rpartition(".")
        info: Dict[str, Any] = {
            "id": stem,
            "ext": extension,
            "webpage_url": url,
        }

        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception as hash_exc:
            # Hashing is best-effort; the download itself succeeded.
            debug(f"Could not hash {file_path}: {hash_exc}")

        tags = _extract_tags(info)

        # Only use filename as a title tag if we couldn't extract any
        # meaningful title, so the filename is never a duplicate title: tag.
        if not any(isinstance(t, str) and t.startswith('title:') for t in tags):
            info['title'] = filename
            tags = _extract_tags(info, "Error extracting tags with filename")

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )

        return DownloadMediaResult(
            path=file_path,
            info=info,
            tags=tags,
            source_url=url,
            hash_value=hash_value,
        )

    except (httpx.HTTPError, httpx.RequestError) as exc:
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc


def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
    """Probe URL to extract metadata WITHOUT downloading.

    Args:
        url: URL to probe
        no_playlist: If True, ignore playlists and probe only the single video

    Returns:
        Dict with keys: extractor, title, entries (if playlist), duration,
        uploader, description, url. Returns None if the URL is not supported
        by yt-dlp or probing fails.
    """
    if not is_url_supported_by_ytdlp(url):
        return None

    _ensure_yt_dlp_ready()
    assert yt_dlp is not None

    try:
        # Extract info without downloading.
        # extract_flat='in_playlist' keeps per-entry metadata for playlists.
        ydl_opts: Dict[str, Any] = {
            "quiet": True,  # Suppress all output
            "no_warnings": True,
            "socket_timeout": 10,
            "retries": 3,
            "skip_download": True,  # Don't actually download
            "extract_flat": "in_playlist",
            "noprogress": True,  # No progress bars
        }
        if no_playlist:
            ydl_opts["noplaylist"] = True

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[arg-type]
            info = ydl.extract_info(url, download=False)

        if not isinstance(info, dict):
            return None

        # Extract relevant fields
        return {
            "extractor": info.get("extractor", ""),
            "title": info.get("title", ""),
            "entries": info.get("entries", []),  # Will be populated if playlist
            "duration": info.get("duration"),
            "uploader": info.get("uploader"),
            "description": info.get("description"),
            "url": url,
        }
    except Exception as exc:
        log(f"Probe failed for {url}: {exc}")
        return None


def download_media(
    opts: DownloadOptions,
    *,
    debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
    """Download media from URL using yt-dlp or direct HTTP download.

    LibGen URLs and URLs no yt-dlp extractor claims are fetched with a plain
    HTTP download; everything else goes through yt-dlp. When a LibGen
    file.php URL is resolved to a mirror, ``opts.url`` is updated in place to
    the resolved URL.

    Args:
        opts: DownloadOptions with url, mode, output_dir, etc.
        debug_logger: Optional debug logger for troubleshooting

    Returns:
        DownloadMediaResult with path, info, tags, hash

    Raises:
        DownloadError: If download fails
    """
    # Handle LibGen URLs specially:
    # file.php redirects to mirrors, get.php is direct from modern API
    url_lower = opts.url.lower()
    if 'libgen' in url_lower:
        if '/get.php' in url_lower:
            # Modern API get.php links are direct downloads from mirrors
            log("Detected LibGen get.php URL, downloading directly...")
            if debug_logger is not None:
                debug_logger.write_record("libgen-direct", {"url": opts.url})
            return _download_direct_file(opts.url, opts.output_dir, debug_logger)

        if '/file.php' in url_lower:
            # Old-style file.php redirects to mirrors, we need to resolve
            log("Detected LibGen file.php URL, resolving to actual mirror...")
            original_url = opts.url
            actual_url = _get_libgen_download_url(original_url)
            if actual_url and actual_url != original_url:
                log(f"Resolved LibGen URL to mirror: {actual_url}")
                opts.url = actual_url
                # After resolution this is typically a mirror or direct file
                # that yt-dlp will not support, so go direct.
                if debug_logger is not None:
                    debug_logger.write_record(
                        "libgen-resolved",
                        {"original": original_url, "resolved": actual_url},
                    )
                return _download_direct_file(opts.url, opts.output_dir, debug_logger)

            log("Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
            if debug_logger is not None:
                debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
            return _download_direct_file(opts.url, opts.output_dir, debug_logger)

    # Try yt-dlp first if URL is supported
    if not is_url_supported_by_ytdlp(opts.url):
        log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
        if debug_logger is not None:
            debug_logger.write_record("direct-file-attempt", {"url": opts.url})
        return _download_direct_file(opts.url, opts.output_dir, debug_logger)

    _ensure_yt_dlp_ready()

    ytdl_options = _build_ytdlp_options(opts)
    log(f"Starting yt-dlp download: {opts.url}")
    if debug_logger is not None:
        debug_logger.write_record("ytdlp-start", {"url": opts.url})

    assert yt_dlp is not None
    try:
        with yt_dlp.YoutubeDL(ytdl_options) as ydl:  # type: ignore[arg-type]
            info = ydl.extract_info(opts.url, download=True)
    except Exception as exc:
        log(f"yt-dlp failed: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "yt-dlp",
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError("yt-dlp download failed") from exc

    if not isinstance(info, dict):
        log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
        raise DownloadError("Unexpected yt-dlp response type")

    info_dict: Dict[str, Any] = info
    if debug_logger is not None:
        debug_logger.write_record(
            "ytdlp-info",
            {
                "keys": sorted(info_dict.keys()),
                "is_playlist": bool(info_dict.get("entries")),
            },
        )

    try:
        entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
    except FileNotFoundError as exc:
        log(f"Error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "resolve-path", "error": str(exc)},
            )
        raise DownloadError(str(exc)) from exc

    if debug_logger is not None:
        debug_logger.write_record(
            "resolved-media",
            {"path": str(media_path), "entry_keys": sorted(entry.keys())},
        )

    # Extract hash from metadata or compute it from the file
    hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
    if not hash_value:
        try:
            hash_value = sha256_file(media_path)
        except OSError as exc:
            if debug_logger is not None:
                debug_logger.write_record(
                    "hash-error",
                    {"path": str(media_path), "error": str(exc)},
                )

    # Extract tags using metadata.py
    tags = _extract_tags(entry)

    source_url = (
        entry.get("webpage_url")
        or entry.get("original_url")
        or entry.get("url")
    )

    log(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
    if debug_logger is not None:
        debug_logger.write_record(
            "downloaded",
            {
                "path": str(media_path),
                "tag_count": len(tags),
                "source_url": source_url,
                "sha256": hash_value,
            },
        )

    return DownloadMediaResult(
        path=media_path,
        info=entry,
        tags=tags,
        source_url=source_url,
        hash_value=hash_value,
    )


__all__ = [
    "download_media",
    "is_url_supported_by_ytdlp",
    "list_formats",
    "probe_url",
    "DownloadError",
    "DownloadOptions",
    "DownloadMediaResult",
]