"""Download media files using yt-dlp with support for direct file downloads. Lean, focused downloader without event infrastructure overhead. - yt-dlp integration for streaming sites - Direct file download fallback for PDFs, images, documents - Tag extraction via metadata.extract_ytdlp_tags() - Logging via helper.logger.log() """ from __future__ import annotations import glob # noqa: F401 import hashlib import json # noqa: F401 import random import re import string import subprocess import sys import time import traceback from pathlib import Path from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin, urlparse import httpx from SYS.logger import log, debug from SYS.utils import ensure_directory, sha256_file from API.HTTP import HTTPClient from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar try: import yt_dlp # type: ignore from yt_dlp.extractor import gen_extractors # type: ignore except Exception as exc: yt_dlp = None # type: ignore YTDLP_IMPORT_ERROR = exc else: YTDLP_IMPORT_ERROR = None try: from metadata import extract_ytdlp_tags except ImportError: extract_ytdlp_tags = None _EXTRACTOR_CACHE: List[Any] | None = None def _ensure_yt_dlp_ready() -> None: """Verify yt-dlp is available, raise if not.""" if yt_dlp is not None: return detail = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed") raise DownloadError(f"yt-dlp module not available: {detail}") def _progress_callback(status: Dict[str, Any]) -> None: """Simple progress callback using logger.""" event = status.get("status") if event == "downloading": percent = status.get("_percent_str", "?") speed = status.get("_speed_str", "?") eta = status.get("_eta_str", "?") sys.stdout.write(f"\r[download] {percent} at {speed} ETA {eta} ") sys.stdout.flush() elif event == "finished": sys.stdout.write("\r" + " " * 70 + "\r") sys.stdout.flush() debug(f"✓ Download finished: {status.get('filename')}") elif event in ("postprocessing", "processing"): debug(f"Post-processing: {status.get('postprocessor')}") def is_url_supported_by_ytdlp(url: str) -> bool: """Check if URL is supported by yt-dlp.""" if yt_dlp is None: return False global _EXTRACTOR_CACHE if _EXTRACTOR_CACHE is None: try: _EXTRACTOR_CACHE = [ie for ie in gen_extractors()] # type: ignore[arg-type] except Exception: _EXTRACTOR_CACHE = [] for extractor in _EXTRACTOR_CACHE: try: if not extractor.suitable(url): continue except Exception: continue name = getattr(extractor, "IE_NAME", "") if name.lower() == "generic": continue return True return False def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]: """Get list of available formats for a URL using yt-dlp.""" _ensure_yt_dlp_ready() try: ydl_opts = { "quiet": True, "no_warnings": True, "socket_timeout": 30, } if no_playlist: ydl_opts["noplaylist"] = True if playlist_items: ydl_opts["playlist_items"] = playlist_items with yt_dlp.YoutubeDL(ydl_opts) as ydl: debug(f"Fetching format list for: {url}") info = ydl.extract_info(url, download=False) formats = info.get("formats", []) if not formats: log("No formats available", file=sys.stderr) return None result_formats = [] for fmt in formats: result_formats.append({ "format_id": fmt.get("format_id", ""), "format": fmt.get("format", ""), "ext": fmt.get("ext", ""), "resolution": fmt.get("resolution", ""), "width": fmt.get("width"), "height": fmt.get("height"), "fps": fmt.get("fps"), "vcodec": fmt.get("vcodec", "none"), "acodec": fmt.get("acodec", "none"), 
"filesize": fmt.get("filesize"), "tbr": fmt.get("tbr"), }) debug(f"Found {len(result_formats)} available formats") return result_formats except Exception as e: log(f"✗ Error fetching formats: {e}", file=sys.stderr) return None def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]: """Download each section separately so merge-file can combine them. yt-dlp with multiple --download-sections args merges them into one file. We need separate files for merge-file, so download each section individually. Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from thinking sections are already downloaded. The title is extracted and stored in tags. Returns: (session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction """ sections_list = ytdl_options.get("download_sections", []) if not sections_list: return "", {} # Generate a unique hash-based ID for this download session # This ensures different videos/downloads don't have filename collisions session_id = hashlib.md5( (url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode() ).hexdigest()[:12] first_section_info = None title_from_first = None # Download each section separately with unique output template using session ID for section_idx, section in enumerate(sections_list, 1): # Build unique output template for this section using session-based filename # e.g., "{session_id}_{section_idx}.ext" - simple and unique per section base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s") output_dir_path = Path(base_outtmpl).parent # Use session_id + section index for temp filename # e.g., "/path/{session_id}_1.%(ext)s" filename_tmpl = f"{session_id}_{section_idx}" if base_outtmpl.endswith(".%(ext)s"): filename_tmpl += ".%(ext)s" # Use Path to handle separators correctly for the OS section_outtmpl = str(output_dir_path / filename_tmpl) # For the first section, extract metadata first (separate call) if section_idx == 1: metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"] if ytdl_options.get("cookiefile"): cookies_path = ytdl_options["cookiefile"].replace("\\", "/") metadata_cmd.extend(["--cookies", cookies_path]) if ytdl_options.get("noplaylist"): metadata_cmd.append("--no-playlist") metadata_cmd.append(url) try: meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True) if meta_result.returncode == 0 and meta_result.stdout: try: info_dict = json.loads(meta_result.stdout.strip()) first_section_info = info_dict title_from_first = info_dict.get('title') if not quiet: debug(f"Extracted title from metadata: {title_from_first}") except json.JSONDecodeError: if not quiet: debug("Could not parse JSON metadata") except Exception as e: if not quiet: debug(f"Error extracting metadata: {e}") # Build yt-dlp command for downloading this section cmd = ["yt-dlp"] # Add format if ytdl_options.get("format"): cmd.extend(["-f", ytdl_options["format"]]) # Add ONLY this section (not all sections) cmd.extend(["--download-sections", section]) # Add force-keyframes-at-cuts if specified if ytdl_options.get("force_keyframes_at_cuts"): cmd.append("--force-keyframes-at-cuts") # Add output template for this section cmd.extend(["-o", section_outtmpl]) # Add cookies file if present if ytdl_options.get("cookiefile"): # Convert backslashes to forward slashes for better compatibility cookies_path = ytdl_options["cookiefile"].replace("\\", "/") cmd.extend(["--cookies", 
def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
    """Build yt-dlp download options."""
    ensure_directory(opts.output_dir)

    # Build the output template. When downloading sections,
    # _download_with_sections_via_cli substitutes session-based filenames per section.
    outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())

    base_options: Dict[str, Any] = {
        "outtmpl": outtmpl,
        "quiet": True,
        "no_warnings": True,
        "noprogress": True,
        "socket_timeout": 30,
        "retries": 10,
        "fragment_retries": 10,
        "http_chunk_size": 10_485_760,
        "restrictfilenames": True,
        "progress_hooks": [] if opts.quiet else [_progress_callback],
    }

    if opts.cookies_path and opts.cookies_path.is_file():
        base_options["cookiefile"] = str(opts.cookies_path)
    else:
        # Fall back to browser cookies.
        base_options["cookiesfrombrowser"] = ("chrome",)

    # Honour no-playlist if specified (single video from a playlist URL).
    if opts.no_playlist:
        base_options["noplaylist"] = True

    # Configure based on mode.
    if opts.mode == "audio":
        base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
        base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
    else:  # video
        base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
        base_options["format_sort"] = [
            "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
        ]

    # Add clip sections if provided (yt-dlp will download only these sections).
    if opts.clip_sections:
        # Parse section ranges like "48-65,120-152,196-205" (seconds) and convert
        # them to yt-dlp's syntax: "*HH:MM:SS-HH:MM:SS".
        def sec_to_hhmmss(seconds: float) -> str:
            hours = int(seconds // 3600)
            minutes = int((seconds % 3600) // 60)
            secs = int(seconds % 60)
            return f"{hours:02d}:{minutes:02d}:{secs:02d}"

        sections = []
        for section_range in opts.clip_sections.split(','):
            try:
                start_str, end_str = section_range.strip().split('-')
                start_time = sec_to_hhmmss(float(start_str))
                end_time = sec_to_hhmmss(float(end_str))
                sections.append(f"*{start_time}-{end_time}")
            except (ValueError, AttributeError):
                pass

        if sections:
            # Pass each section as a separate list element; yt-dlp expects
            # multiple --download-sections arguments.
            base_options["download_sections"] = sections
            debug(f"Download sections configured: {', '.join(sections)}")
            # Note: not using --force-keyframes-at-cuts, to avoid re-encoding.
            # Cuts may be less precise, but downloads are faster.

    # Add playlist items selection if provided.
    if opts.playlist_items:
        base_options["playlist_items"] = opts.playlist_items

    if not opts.quiet:
        debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")

    return base_options
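
# Worked example of the clip-section conversion above (illustrative values):
#
#     opts.clip_sections = "48-65,120-152"
#         "48-65"   -> "*00:00:48-00:01:05"
#         "120-152" -> "*00:02:00-00:02:32"
#
# yielding download_sections == ["*00:00:48-00:01:05", "*00:02:00-00:02:32"],
# the list form that maps onto repeated --download-sections arguments.
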
def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
    """Iterate through download entries, handling playlists."""
    queue: List[Dict[str, Any]] = [info]
    seen: set[int] = set()
    while queue:
        current = queue.pop(0)
        obj_id = id(current)
        if obj_id in seen:
            continue
        seen.add(obj_id)
        entries = current.get("entries")
        if isinstance(entries, list):
            for entry in entries:
                if isinstance(entry, dict):
                    queue.append(entry)
        if current.get("requested_downloads") or not entries:
            yield current


def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
    """Get candidate file paths for downloaded media."""
    requested = entry.get("requested_downloads")
    if isinstance(requested, list):
        for item in requested:
            if isinstance(item, dict):
                for key in ("filepath", "_filename", "filename"):
                    value = item.get(key)
                    if value:
                        yield Path(value)
    for key in ("filepath", "_filename", "filename"):
        value = entry.get(key)
        if value:
            yield Path(value)
    if entry.get("filename"):
        yield output_dir / entry["filename"]


def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
    """Find the downloaded file in yt-dlp metadata."""
    for entry in _iter_download_entries(info):
        for candidate in _candidate_paths(entry, output_dir):
            if candidate.is_file():
                return entry, candidate
            if not candidate.is_absolute():
                resolved = output_dir / candidate
                if resolved.is_file():
                    return entry, resolved
    raise FileNotFoundError("yt-dlp did not report a downloaded media file")


def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
    """Extract a SHA256 hash from yt-dlp metadata, if present."""
    for payload in [info] + (info.get("entries") or []):
        if not isinstance(payload, dict):
            continue
        hashes = payload.get("hashes")
        if isinstance(hashes, dict):
            for key in ("sha256", "sha-256", "sha_256"):
                value = hashes.get(key)
                if isinstance(value, str) and value.strip():
                    return value.strip().lower()
        for key in ("sha256", "sha-256", "sha_256"):
            value = payload.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip().lower()
    return None
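
# Metadata shapes _extract_sha256 understands (illustrative, not exhaustive):
#
#     {"hashes": {"sha256": "AB12..."}}        -> "ab12..."
#     {"entries": [{"sha-256": "CD34..."}]}    -> "cd34..."
#     {"title": "no checksum anywhere"}        -> None
#
# Three spellings are probed because metadata sources do not agree on one key.
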
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
    """Extract the actual download link from a LibGen redirect URL.

    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to an actual
    mirror URL. This follows the redirect chain to get the real file.

    Args:
        libgen_url: LibGen file.php URL

    Returns:
        Actual download URL, or None if extraction fails
    """
    try:
        import requests

        # Check that this is a LibGen file.php URL.
        parsed = urlparse(libgen_url)
        if 'libgen' not in parsed.netloc.lower():
            return None
        if '/file.php' not in parsed.path.lower():
            return None

        # LibGen redirects to actual mirrors; follow redirects to get the final URL.
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        debug(f"Following LibGen redirect chain for: {libgen_url}")

        # First, get the page and look for a direct download link.
        try:
            response = session.get(libgen_url, timeout=10, allow_redirects=True)
            final_url = response.url

            # Try to find the actual download link in the page.
            try:
                try:
                    from lxml import html as lxml_html
                except ImportError:
                    lxml_html = None

                if lxml_html is not None:
                    doc = lxml_html.fromstring(response.content)
                    for a in doc.xpath("//a[@href]"):
                        href = str(a.get("href") or "").strip()
                        if not href:
                            continue
                        href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
                            debug(f"Found download link: {download_url}")
                            return download_url
                else:
                    # Regex fallback when lxml is unavailable.
                    for m in re.finditer(
                        r"href=[\"\']([^\"\']+)[\"\']",
                        response.text or "",
                        flags=re.IGNORECASE,
                    ):
                        href = str(m.group(1) or "").strip()
                        if not href or href.lower().startswith("javascript:"):
                            continue
                        href_lower = href.lower()
                        if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
                            download_url = href if href.startswith("http") else urljoin(final_url, href)
                            debug(f"Found download link: {download_url}")
                            return download_url
            except Exception:
                pass

            # If we followed redirects to a new URL, return the final one.
            # This handles cases where LibGen redirects straight to a download mirror.
            if final_url != libgen_url:
                debug(f"LibGen resolved to mirror: {final_url}")
                return final_url
        except requests.RequestException as e:
            log(f"Error following LibGen redirects: {e}", file=sys.stderr)

            # Try a HEAD request as a fallback.
            try:
                response = session.head(libgen_url, allow_redirects=True, timeout=10)
                if response.url != libgen_url:
                    debug(f"LibGen HEAD resolved to: {response.url}")
                    return response.url
            except Exception:
                pass

        return None
    except Exception as e:
        log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
        return None
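
# How the link heuristic above classifies anchors (hypothetical page content):
#
#     <a href="/get.php?md5=...">GET</a>      -> matched ("get.php" in href)
#     <a href="book.pdf">mirror 1</a>         -> matched (extension allowlist)
#     <a href="javascript:void(0)">share</a>  -> never matched
#
# Relative hits are resolved against the post-redirect URL via urljoin().
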
def _download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
    quiet: bool = False,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) without yt-dlp."""
    ensure_directory(output_dir)

    from urllib.parse import unquote, parse_qs

    # Extract the filename from the URL.
    parsed_url = urlparse(url)
    url_path = parsed_url.path

    # Try query parameters first (for LibGen and similar services),
    # e.g. ?filename=Book+Title.pdf or &download=filename.pdf.
    filename = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ('filename', 'download', 'file', 'name'):
            if param_name in query_params and query_params[param_name]:
                filename = unquote(query_params[param_name][0])
                break

    # If not found in query params, extract from the URL path.
    if not filename or not filename.strip():
        filename = unquote(url_path.split("/")[-1]) if url_path else ""
        # Strip any query string left in the filename.
        if "?" in filename:
            filename = filename.split("?")[0]

    # Try to get the real filename from the Content-Disposition header (HEAD request).
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            if content_disposition:
                # Format: attachment; filename="filename.pdf" or filename=filename.pdf
                match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
                if match:
                    extracted_name = match.group(1) or match.group(2)
                    if extracted_name:
                        filename = unquote(extracted_name)
                        if not quiet:
                            debug(f"Filename from Content-Disposition: {filename}")
    except Exception as e:
        if not quiet:
            log(f"Could not get filename from headers: {e}", file=sys.stderr)

    # Fall back if we still don't have a plausible filename.
    if not filename or "." not in filename:
        filename = "downloaded_file.bin"

    file_path = output_dir / filename
    progress_bar = ProgressBar()

    if not quiet:
        debug(f"Direct download: {filename}")

    try:
        start_time = time.time()
        downloaded_bytes = [0]
        total_bytes = [0]
        last_progress_time = [start_time]

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            downloaded_bytes[0] = bytes_downloaded
            total_bytes[0] = content_length
            now = time.time()
            if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0:
                elapsed = now - start_time
                percent = (bytes_downloaded / content_length) * 100 if content_length > 0 else 0
                speed = bytes_downloaded / elapsed if elapsed > 0 else 0
                eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0
                speed_str = progress_bar.format_bytes(speed) + "/s"
                minutes, seconds = divmod(int(eta_seconds), 60)
                hours, minutes = divmod(minutes, 60)
                eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
                progress_line = progress_bar.format_progress(
                    percent_str=f"{percent:.1f}%",
                    downloaded=bytes_downloaded,
                    total=content_length,
                    speed_str=speed_str,
                    eta_str=eta_str,
                )
                if not quiet:
                    debug(progress_line)
                last_progress_time[0] = now

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)

        elapsed = time.time() - start_time
        avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
        if not quiet:
            debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")

        # For direct file downloads, build a minimal info dict without the
        # filename as title. This prevents a duplicate title: tag when the
        # filename is auto-generated; the title is added back below only if
        # no meaningful tags could be extracted.
        info = {
            "id": filename.rsplit(".", 1)[0],
            "ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
            "webpage_url": url,
        }

        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception:
            pass

        tags = []
        if extract_ytdlp_tags:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as e:
                log(f"Error extracting tags: {e}", file=sys.stderr)

        # Only use the filename as a title tag if no meaningful tags were extracted.
        if not any(t.startswith('title:') for t in tags):
            info['title'] = filename
            tags = []
            if extract_ytdlp_tags:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as e:
                    log(f"Error extracting tags with filename: {e}", file=sys.stderr)

        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )

        return DownloadMediaResult(
            path=file_path,
            info=info,
            tag=tags,
            source_url=url,
            hash_value=hash_value,
        )
    except (httpx.HTTPError, httpx.RequestError) as exc:
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc
in filename else "bin", "webpage_url": url, } hash_value = None try: hash_value = sha256_file(file_path) except Exception: pass tags = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(info) except Exception as e: log(f"Error extracting tags: {e}", file=sys.stderr) # Only use filename as a title tag if we couldn't extract any meaningful tags # This prevents duplicate title: tags when the filename could be mistaken for metadata if not any(t.startswith('title:') for t in tags): # Re-extract tags with filename as title only if needed info['title'] = filename tags = [] if extract_ytdlp_tags: try: tags = extract_ytdlp_tags(info) except Exception as e: log(f"Error extracting tags with filename: {e}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "direct-file-downloaded", {"url": url, "path": str(file_path), "hash": hash_value}, ) return DownloadMediaResult( path=file_path, info=info, tag=tags, source_url=url, hash_value=hash_value, ) except (httpx.HTTPError, httpx.RequestError) as exc: log(f"Download error: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", {"phase": "direct-file", "url": url, "error": str(exc)}, ) raise DownloadError(f"Failed to download {url}: {exc}") from exc except Exception as exc: log(f"Error downloading file: {exc}", file=sys.stderr) if debug_logger is not None: debug_logger.write_record( "exception", { "phase": "direct-file", "url": url, "error": str(exc), "traceback": traceback.format_exc(), }, ) raise DownloadError(f"Error downloading file: {exc}") from exc def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]: """Probe URL to extract metadata WITHOUT downloading. Args: url: URL to probe no_playlist: If True, ignore playlists and probe only the single video timeout_seconds: Max seconds to wait for probe (default 15s) Returns: Dict with keys: extractor, title, entries (if playlist), duration, etc. Returns None if not supported by yt-dlp or on timeout. """ if not is_url_supported_by_ytdlp(url): return None # Wrap probe in timeout to prevent hanging on large playlists import threading from typing import cast result_container: List[Optional[Any]] = [None, None] # [result, error] def _do_probe() -> None: try: _ensure_yt_dlp_ready() assert yt_dlp is not None # Extract info without downloading # Use extract_flat='in_playlist' to get full metadata for playlist items ydl_opts = { "quiet": True, # Suppress all output "no_warnings": True, "socket_timeout": 10, "retries": 2, # Reduce retries for faster timeout "skip_download": True, # Don't actually download "extract_flat": "in_playlist", # Get playlist with metadata for each entry "noprogress": True, # No progress bars } # Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed. 
__all__ = [
    "is_url_supported_by_ytdlp",
    "list_formats",
    "probe_url",
    "DownloadError",
    "DownloadOptions",
    "DownloadMediaResult",
]
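
# Minimal manual smoke test (a sketch; run this module directly with a URL as
# the first argument). Only the probe path is exercised; nothing is downloaded.
if __name__ == "__main__":
    _url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/file.pdf"
    if is_url_supported_by_ytdlp(_url):
        print(probe_url(_url) or "supported by yt-dlp, but the probe failed or timed out")
    else:
        print("no yt-dlp extractor matched; the direct-file fallback would be used")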