dfdsf

2025-12-11 19:04:02 -08:00
parent 6863c6c7ea
commit 16d8a763cd
103 changed files with 4759 additions and 9156 deletions
--- a/SYS/download.py
+++ b/SYS/download.py
@@ -0,0 +1,767 @@
+"""Download media files using yt-dlp with support for direct file downloads.
+
+Lean, focused downloader without event infrastructure overhead.
+- yt-dlp integration for streaming sites
+- Direct file download fallback for PDFs, images, documents
+- Tag extraction via metadata.extract_ytdlp_tags()
+- Logging via SYS.logger.log() and SYS.logger.debug()
+"""
+from __future__ import annotations
+
+import glob  # noqa: F401
+import hashlib
+import json  # noqa: F401
+import random
+import re
+import string
+import subprocess
+import sys
+import time
+import traceback
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional
+from urllib.parse import urljoin, urlparse
+
+import httpx
+
+from SYS.logger import log, debug
+from SYS.utils import ensure_directory, sha256_file
+from API.HTTP import HTTPClient
+from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
+
+# yt-dlp is optional: keep this module importable without it and remember why
+# the import failed so _ensure_yt_dlp_ready() can include the reason.
+try:
+    import yt_dlp  # type: ignore
+    from yt_dlp.extractor import gen_extractors  # type: ignore
+except Exception as exc:
+    yt_dlp = None  # type: ignore
+    YTDLP_IMPORT_ERROR = exc
+else:
+    YTDLP_IMPORT_ERROR = None
+
+# Tag extraction is optional too; callers check for None before using it.
+try:
+    from metadata import extract_ytdlp_tags
+except ImportError:
+    extract_ytdlp_tags = None
+
+# Filled on the first is_url_supported_by_ytdlp() call; None means "not built yet".
+_EXTRACTOR_CACHE: List[Any] | None = None
+
+
+def _ensure_yt_dlp_ready() -> None:
+    """Raise DownloadError unless the yt-dlp module was imported.
+
+    The message carries the recorded import error when there is one.
+    """
+    if yt_dlp is None:
+        reason = str(YTDLP_IMPORT_ERROR or "yt-dlp is not installed")
+        raise DownloadError(f"yt-dlp module not available: {reason}")
+
+
+def _progress_callback(status: Dict[str, Any]) -> None:
+    """Progress hook: show a one-line download status and log phase changes.
+
+    Args:
+        status: Status dict passed to the hook. Reads "status" plus
+            "_percent_str", "_speed_str", "_eta_str", "filename" and
+            "postprocessor" depending on the phase.
+    """
+    phase = status.get("status")
+
+    if phase == "downloading":
+        pct = status.get("_percent_str", "?")
+        rate = status.get("_speed_str", "?")
+        remaining = status.get("_eta_str", "?")
+        # Carriage return rewrites the same terminal line on every update.
+        sys.stdout.write(f"\r[download] {pct} at {rate} ETA {remaining}   ")
+        sys.stdout.flush()
+        return
+
+    if phase == "finished":
+        # Blank out the progress line before logging completion.
+        sys.stdout.write("\r" + " " * 70 + "\r")
+        sys.stdout.flush()
+        debug(f"✓ Download finished: {status.get('filename')}")
+        return
+
+    if phase in ("postprocessing", "processing"):
+        debug(f"Post-processing: {status.get('postprocessor')}")
+
+
+def is_url_supported_by_ytdlp(url: str) -> bool:
+    """Return True when a non-generic yt-dlp extractor accepts ``url``.
+
+    The extractor list is built once and cached in ``_EXTRACTOR_CACHE``.
+    Extractors whose ``suitable()`` check raises are treated as not matching,
+    and the "generic" extractor is ignored.
+    """
+    global _EXTRACTOR_CACHE
+    if yt_dlp is None:
+        return False
+
+    if _EXTRACTOR_CACHE is None:
+        try:
+            _EXTRACTOR_CACHE = list(gen_extractors())  # type: ignore[arg-type]
+        except Exception:
+            _EXTRACTOR_CACHE = []
+
+    def _accepts(ie: Any) -> bool:
+        try:
+            return bool(ie.suitable(url))
+        except Exception:
+            return False
+
+    return any(
+        _accepts(ie) and getattr(ie, "IE_NAME", "").lower() != "generic"
+        for ie in _EXTRACTOR_CACHE
+    )
+
+
+def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
+    """Return a summary of the formats yt-dlp reports for ``url``.
+
+    Args:
+        url: URL to inspect.
+        no_playlist: Set yt-dlp's "noplaylist" option.
+        playlist_items: Value for yt-dlp's "playlist_items" option, if any.
+
+    Returns:
+        One dict per format (format_id, format, ext, resolution, width,
+        height, fps, vcodec, acodec, filesize, tbr), or None when no formats
+        are reported or the lookup fails.
+
+    Raises:
+        DownloadError: yt-dlp is not available.
+    """
+    _ensure_yt_dlp_ready()
+
+    # (key, default) for every field copied out of a yt-dlp format dict.
+    field_defaults = (
+        ("format_id", ""),
+        ("format", ""),
+        ("ext", ""),
+        ("resolution", ""),
+        ("width", None),
+        ("height", None),
+        ("fps", None),
+        ("vcodec", "none"),
+        ("acodec", "none"),
+        ("filesize", None),
+        ("tbr", None),
+    )
+
+    try:
+        ydl_opts = {"quiet": True, "no_warnings": True, "socket_timeout": 30}
+        if no_playlist:
+            ydl_opts["noplaylist"] = True
+        if playlist_items:
+            ydl_opts["playlist_items"] = playlist_items
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            debug(f"Fetching format list for: {url}")
+            info = ydl.extract_info(url, download=False)
+
+            formats = info.get("formats", [])
+            if not formats:
+                log("No formats available", file=sys.stderr)
+                return None
+
+            summaries = [
+                {key: fmt.get(key, default) for key, default in field_defaults}
+                for fmt in formats
+            ]
+            debug(f"Found {len(summaries)} available formats")
+            return summaries
+
+    except Exception as e:
+        log(f"✗ Error fetching formats: {e}", file=sys.stderr)
+        return None
+
+
+def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]:
+    """Download each clip section with its own yt-dlp CLI invocation.
+
+    yt-dlp with multiple --download-sections args merges them into one file.
+    merge-file needs one file per section, so each section is downloaded
+    individually. Section files are named ``{session_id}_{index}`` (not
+    title-based) so yt-dlp does not think a section is already downloaded.
+
+    Args:
+        url: Media URL handed to yt-dlp.
+        ytdl_options: Option dict. Keys read here: "download_sections",
+            "outtmpl", "format", "force_keyframes_at_cuts", "cookiefile",
+            "noplaylist".
+        sections: Not read by this function; the section list is taken from
+            ``ytdl_options["download_sections"]``.
+        quiet: Suppress debug output when True.
+
+    Returns:
+        (session_id, info_dict) - session_id is the filename prefix of the
+        section files; info_dict is the parsed ``--dump-json`` metadata, or {}
+        when it could not be obtained. ("", {}) when no sections are configured.
+
+    Raises:
+        DownloadError: yt-dlp could not be run or exited with a non-zero code.
+    """
+    sections_list = ytdl_options.get("download_sections", [])
+    if not sections_list:
+        return "", {}
+
+    # Unique per-session prefix so different downloads never collide on disk.
+    session_id = hashlib.md5(
+        (url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()
+    ).hexdigest()[:12]
+
+    # Loop-invariant pieces: output directory, extension placeholder and the
+    # cookie / playlist arguments shared by every yt-dlp invocation.
+    base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
+    output_dir_path = Path(base_outtmpl).parent
+    ext_suffix = ".%(ext)s" if base_outtmpl.endswith(".%(ext)s") else ""
+
+    shared_args: List[str] = []
+    if ytdl_options.get("cookiefile"):
+        # Forward slashes for better cross-platform compatibility.
+        shared_args.extend(["--cookies", ytdl_options["cookiefile"].replace("\\", "/")])
+    if ytdl_options.get("noplaylist"):
+        shared_args.append("--no-playlist")
+
+    # Metadata is fetched once, before the first section download (best effort).
+    first_section_info = None
+    metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download", *shared_args, url]
+    try:
+        meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True)
+        if meta_result.returncode == 0 and meta_result.stdout:
+            try:
+                info_dict = json.loads(meta_result.stdout.strip())
+                first_section_info = info_dict
+                title_from_first = info_dict.get('title')
+                if not quiet:
+                    debug(f"Extracted title from metadata: {title_from_first}")
+            except json.JSONDecodeError:
+                if not quiet:
+                    debug("Could not parse JSON metadata")
+    except Exception as e:
+        if not quiet:
+            debug(f"Error extracting metadata: {e}")
+
+    for section_idx, section in enumerate(sections_list, 1):
+        # e.g. "/path/{session_id}_1.%(ext)s"; Path handles OS separators.
+        section_outtmpl = str(output_dir_path / f"{session_id}_{section_idx}{ext_suffix}")
+
+        cmd = ["yt-dlp"]
+        if ytdl_options.get("format"):
+            cmd.extend(["-f", ytdl_options["format"]])
+        # ONLY this section, not all of them.
+        cmd.extend(["--download-sections", section])
+        if ytdl_options.get("force_keyframes_at_cuts"):
+            cmd.append("--force-keyframes-at-cuts")
+        cmd.extend(["-o", section_outtmpl])
+        cmd.extend(shared_args)
+        cmd.append(url)
+
+        if not quiet:
+            debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
+            debug(f"Command: {' '.join(cmd)}")
+
+        # Output is not captured so yt-dlp's own progress stays visible.
+        try:
+            result = subprocess.run(cmd)
+        except Exception as exc:
+            raise DownloadError(f"yt-dlp subprocess error for section {section_idx}: {exc}") from exc
+
+        # Checked outside the try so this error is not re-wrapped by the
+        # handler above.
+        if result.returncode != 0:
+            raise DownloadError(f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}")
+
+    return session_id, first_section_info or {}
+
+
+def _seconds_to_hhmmss(seconds: float) -> str:
+    """Format a number of seconds as HH:MM:SS (fractional seconds are truncated)."""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+
+def _parse_clip_sections(clip_sections: str) -> List[str]:
+    """Turn "48-65,120-152" (seconds) into yt-dlp "*HH:MM:SS-HH:MM:SS" specs.
+
+    Ranges that cannot be parsed are skipped and reported via debug().
+    """
+    sections: List[str] = []
+    for section_range in clip_sections.split(','):
+        try:
+            start_str, end_str = section_range.strip().split('-')
+            start_time = _seconds_to_hhmmss(float(start_str))
+            end_time = _seconds_to_hhmmss(float(end_str))
+        except (ValueError, OverflowError):
+            # Wrong number of "-" parts, non-numeric or non-finite bounds.
+            debug(f"Ignoring invalid clip section: {section_range!r}")
+            continue
+        sections.append(f"*{start_time}-{end_time}")
+    return sections
+
+
+def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
+    """Build the yt-dlp option dict for a download.
+
+    Args:
+        opts: Download settings. Reads output_dir, quiet, cookies_path,
+            no_playlist, mode, ytdl_format, clip_sections and playlist_items.
+
+    Returns:
+        Option dict. "download_sections" is present only when at least one
+        clip range in ``opts.clip_sections`` parsed successfully.
+    """
+    ensure_directory(opts.output_dir)
+
+    # Title-based output template. For section downloads,
+    # _download_with_sections_via_cli keeps only the directory of this
+    # template and substitutes its own session-based filename.
+    outtmpl = str((opts.output_dir / "%(title)s.%(ext)s").resolve())
+
+    base_options: Dict[str, Any] = {
+        "outtmpl": outtmpl,
+        "quiet": True,
+        "no_warnings": True,
+        "noprogress": True,
+        "socket_timeout": 30,
+        "retries": 10,
+        "fragment_retries": 10,
+        "http_chunk_size": 10_485_760,
+        "restrictfilenames": True,
+        "progress_hooks": [] if opts.quiet else [_progress_callback],
+    }
+
+    if opts.cookies_path and opts.cookies_path.is_file():
+        base_options["cookiefile"] = str(opts.cookies_path)
+    else:
+        # Check global cookies file lazily to avoid import cycles
+        from hydrus_health_check import get_cookies_file_path  # local import
+
+        global_cookies = get_cookies_file_path()
+        if global_cookies:
+            base_options["cookiefile"] = global_cookies
+        else:
+            # Fallback to browser cookies
+            base_options["cookiesfrombrowser"] = ("chrome",)
+
+    # Add no-playlist option if specified (for single video from playlist url)
+    if opts.no_playlist:
+        base_options["noplaylist"] = True
+
+    # Configure based on mode
+    if opts.mode == "audio":
+        base_options["format"] = opts.ytdl_format or "251/140/bestaudio"
+        base_options["postprocessors"] = [{"key": "FFmpegExtractAudio"}]
+    else:  # video
+        base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
+        base_options["format_sort"] = [
+            "res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
+        ]
+
+    # Add clip sections if provided (yt-dlp will download only these sections)
+    if opts.clip_sections:
+        sections = _parse_clip_sections(opts.clip_sections)
+        if sections:
+            # One list element per section (yt-dlp expects multiple --download-sections args)
+            base_options["download_sections"] = sections
+            debug(f"Download sections configured: {', '.join(sections)}")
+            # Note: Not using --force-keyframes-at-cuts to avoid re-encoding
+            # This may result in less precise cuts but faster downloads
+
+    # Add playlist items selection if provided
+    if opts.playlist_items:
+        base_options["playlist_items"] = opts.playlist_items
+
+    if not opts.quiet:
+        debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
+    return base_options
+
+
+def _iter_download_entries(info: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
+    """Yield the downloadable entries of a yt-dlp info dict, breadth-first.
+
+    A dict is yielded when it has a truthy "requested_downloads" value or no
+    (or empty) "entries"; dict items of a list-valued "entries" are queued
+    and visited in order. Each dict object is visited at most once.
+
+    Args:
+        info: Root info dict (single item or playlist).
+    """
+    from collections import deque  # local import; O(1) pops from the left
+
+    pending = deque([info])
+    seen: set[int] = set()
+    while pending:
+        current = pending.popleft()
+        obj_id = id(current)
+        if obj_id in seen:
+            continue
+        seen.add(obj_id)
+
+        entries = current.get("entries")
+        if isinstance(entries, list):
+            pending.extend(entry for entry in entries if isinstance(entry, dict))
+        if current.get("requested_downloads") or not entries:
+            yield current
+
+
+def _candidate_paths(entry: Dict[str, Any], output_dir: Path) -> Iterator[Path]:
+    """Yield possible on-disk paths for an entry, most specific first.
+
+    Order: path fields of each dict in "requested_downloads", then the
+    entry's own path fields, then ``output_dir / entry["filename"]``.
+    """
+    path_keys = ("filepath", "_filename", "filename")
+
+    requested = entry.get("requested_downloads")
+    sources = (
+        [item for item in requested if isinstance(item, dict)]
+        if isinstance(requested, list)
+        else []
+    )
+    sources.append(entry)
+
+    for source in sources:
+        for key in path_keys:
+            value = source.get(key)
+            if value:
+                yield Path(value)
+
+    fallback_name = entry.get("filename")
+    if fallback_name:
+        yield output_dir / fallback_name
+
+
+def _resolve_entry_and_path(info: Dict[str, Any], output_dir: Path) -> tuple[Dict[str, Any], Path]:
+    """Return the first (entry, path) pair whose candidate path is an existing file.
+
+    Each candidate is tried as given; relative candidates are also tried
+    under ``output_dir``.
+
+    Raises:
+        FileNotFoundError: No candidate path points at an existing file.
+    """
+    for entry in _iter_download_entries(info):
+        for candidate in _candidate_paths(entry, output_dir):
+            attempts = [candidate]
+            if not candidate.is_absolute():
+                attempts.append(output_dir / candidate)
+            for attempt in attempts:
+                if attempt.is_file():
+                    return entry, attempt
+    raise FileNotFoundError("yt-dlp did not report a downloaded media file")
+
+
+def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
+    """Return the first SHA256 value found in ``info`` or its entries.
+
+    For ``info`` and then each dict in a list/tuple "entries", the "hashes"
+    sub-dict is checked before the payload itself, using the keys "sha256",
+    "sha-256" and "sha_256".
+
+    Returns:
+        The stripped, lower-cased hash string, or None when none is present.
+    """
+    hash_keys = ("sha256", "sha-256", "sha_256")
+
+    # "entries" may be missing, None or a non-list iterable; only concrete
+    # sequences are scanned so the lookup never raises or consumes a generator.
+    payloads: List[Any] = [info]
+    entries = info.get("entries")
+    if isinstance(entries, (list, tuple)):
+        payloads.extend(entries)
+
+    for payload in payloads:
+        if not isinstance(payload, dict):
+            continue
+        hashes = payload.get("hashes")
+        containers = [hashes, payload] if isinstance(hashes, dict) else [payload]
+        for container in containers:
+            for key in hash_keys:
+                value = container.get(key)
+                if isinstance(value, str) and value.strip():
+                    return value.strip().lower()
+    return None
+
+
+def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
+    """Extract the actual download link from a LibGen redirect URL.
+
+    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to the
+    actual mirror URL. The page is fetched (following redirects) and scanned
+    for a get.php / direct file link; if none is found, the final redirected
+    URL is returned when it differs from the input.
+
+    Args:
+        libgen_url: LibGen file.php URL.
+
+    Returns:
+        Actual download URL, or None if the URL is not a LibGen file.php URL
+        or extraction fails.
+    """
+    try:
+        # Cheap URL checks first: non-LibGen URLs never import requests or
+        # touch the network.
+        parsed = urlparse(libgen_url)
+        if 'libgen' not in parsed.netloc.lower():
+            return None
+        if '/file.php' not in parsed.path.lower():
+            return None
+
+        import requests
+
+        # The context manager closes the session's connections on every exit path.
+        with requests.Session() as session:
+            session.headers.update({
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+            })
+
+            debug(f"Following LibGen redirect chain for: {libgen_url}")
+
+            # First, get the page and look for a direct download link
+            try:
+                response = session.get(libgen_url, timeout=10, allow_redirects=True)
+                final_url = response.url
+
+                try:
+                    from bs4 import BeautifulSoup
+                    soup = BeautifulSoup(response.content, 'html.parser')
+
+                    for link in soup.find_all('a'):
+                        href = link.get('href')
+                        if href and isinstance(href, str):
+                            # Direct file links or get.php redirects
+                            if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
+                                download_url = href if href.startswith('http') else urljoin(final_url, href)
+                                debug(f"Found download link: {download_url}")
+                                return download_url
+                except ImportError:
+                    pass  # BeautifulSoup not available
+
+                # Redirects were followed: the final URL may itself be the
+                # direct download mirror.
+                if final_url != libgen_url:
+                    debug(f"LibGen resolved to mirror: {final_url}")
+                    return final_url
+
+            except requests.RequestException as e:
+                log(f"Error following LibGen redirects: {e}", file=sys.stderr)
+                # Try a HEAD request as fallback
+                try:
+                    response = session.head(libgen_url, allow_redirects=True, timeout=10)
+                    if response.url != libgen_url:
+                        debug(f"LibGen HEAD resolved to: {response.url}")
+                        return response.url
+                except requests.RequestException as head_exc:
+                    # Best effort only; anything else reaches the outer handler.
+                    debug(f"LibGen HEAD fallback failed: {head_exc}")
+
+        return None
+
+    except Exception as e:
+        log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
+        return None
+
+
+def _sanitize_download_filename(name: str) -> str:
+    """Reduce an untrusted filename to a bare basename ("" if nothing usable remains)."""
+    base = name.replace("\x00", "").replace("\\", "/").rsplit("/", 1)[-1].strip()
+    return "" if base in ("", ".", "..") else base
+
+
+def _download_direct_file(
+    url: str,
+    output_dir: Path,
+    debug_logger: Optional[DebugLogger] = None,
+    quiet: bool = False,
+) -> DownloadMediaResult:
+    """Download a direct file (PDF, image, document, etc.) without yt-dlp.
+
+    The filename is taken, in increasing priority, from the URL path, a
+    filename-like query parameter, and the Content-Disposition header of a
+    HEAD request. It is then reduced to a bare basename so a remote-supplied
+    name cannot place the file outside ``output_dir``.
+
+    Args:
+        url: URL of the file to download.
+        output_dir: Directory the file is written into (created if needed).
+        debug_logger: Optional logger that receives a record for the
+            download or for any failure.
+        quiet: Suppress debug/progress output when True.
+
+    Returns:
+        DownloadMediaResult with the file path, a minimal info dict, tags,
+        the source URL and the file's SHA256 (None if hashing failed).
+
+    Raises:
+        DownloadError: The download or any later step failed.
+    """
+    ensure_directory(output_dir)
+
+    from urllib.parse import parse_qs, unquote
+
+    parsed_url = urlparse(url)
+    url_path = parsed_url.path
+
+    # Try to get filename from query parameters first (for LibGen and similar services)
+    # e.g., ?filename=Book+Title.pdf or &download=filename.pdf
+    filename = None
+    if parsed_url.query:
+        query_params = parse_qs(parsed_url.query)
+        for param_name in ('filename', 'download', 'file', 'name'):
+            if param_name in query_params and query_params[param_name]:
+                filename = unquote(query_params[param_name][0])
+                break
+
+    # If not found in query params, extract from URL path
+    if not filename or not filename.strip():
+        filename = url_path.split("/")[-1] if url_path else ""
+        filename = unquote(filename)
+
+    # Remove query strings from filename if any
+    if "?" in filename:
+        filename = filename.split("?")[0]
+
+    # Try to get real filename from Content-Disposition header (HEAD request)
+    try:
+        with HTTPClient(timeout=10.0) as client:
+            response = client._request("HEAD", url, follow_redirects=True)
+            content_disposition = response.headers.get("content-disposition", "")
+            if content_disposition:
+                # Format: attachment; filename="filename.pdf", filename=filename.pdf
+                # or the extended form filename*=charset'lang'percent-encoded
+                match = re.search(r'filename(\*)?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
+                if match:
+                    extracted_name = match.group(2) or match.group(3)
+                    if extracted_name and match.group(1):
+                        # Drop the charset'lang' prefix of the extended form
+                        # so it does not end up in the saved filename.
+                        parts = extracted_name.split("'", 2)
+                        if len(parts) == 3:
+                            extracted_name = parts[2]
+                    if extracted_name:
+                        filename = unquote(extracted_name)
+                        if not quiet:
+                            debug(f"Filename from Content-Disposition: {filename}")
+    except Exception as e:
+        if not quiet:
+            log(f"Could not get filename from headers: {e}", file=sys.stderr)
+
+    # The name comes from the URL or the remote server: keep only its final
+    # path component so "../" or absolute paths cannot escape output_dir.
+    filename = _sanitize_download_filename(filename or "")
+
+    # Fallback if we still don't have a good filename
+    if not filename or "." not in filename:
+        filename = "downloaded_file.bin"
+
+    file_path = output_dir / filename
+    progress_bar = ProgressBar()
+
+    if not quiet:
+        debug(f"Direct download: {filename}")
+
+    try:
+        start_time = time.time()
+        # Single-element lists let the nested callback update these values.
+        downloaded_bytes = [0]
+        total_bytes = [0]
+        last_progress_time = [start_time]
+
+        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
+            downloaded_bytes[0] = bytes_downloaded
+            total_bytes[0] = content_length
+
+            now = time.time()
+            # Throttle output to one line per 0.5s; skip when the size is unknown (0).
+            if now - last_progress_time[0] >= 0.5 and total_bytes[0] > 0:
+                elapsed = now - start_time
+                percent = (bytes_downloaded / content_length) * 100
+                speed = bytes_downloaded / elapsed if elapsed > 0 else 0
+                eta_seconds = (content_length - bytes_downloaded) / speed if speed > 0 else 0
+
+                speed_str = progress_bar.format_bytes(speed) + "/s"
+                minutes, seconds = divmod(int(eta_seconds), 60)
+                hours, minutes = divmod(minutes, 60)
+                eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+                progress_line = progress_bar.format_progress(
+                    percent_str=f"{percent:.1f}%",
+                    downloaded=bytes_downloaded,
+                    total=content_length,
+                    speed_str=speed_str,
+                    eta_str=eta_str,
+                )
+                if not quiet:
+                    debug(progress_line)
+                last_progress_time[0] = now
+
+        with HTTPClient(timeout=30.0) as client:
+            client.download(url, str(file_path), progress_callback=progress_callback)
+
+        elapsed = time.time() - start_time
+        avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
+        if not quiet:
+            debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
+
+        # Minimal info dict without the filename as title: this prevents
+        # duplicate title: tags. The title is added back below only if no
+        # title tag could be extracted.
+        info = {
+            "id": filename.rsplit(".", 1)[0],
+            "ext": filename.rsplit(".", 1)[1] if "." in filename else "bin",
+            "webpage_url": url,
+        }
+
+        hash_value = None
+        try:
+            hash_value = sha256_file(file_path)
+        except Exception:
+            # Hashing is best effort; the result simply carries no hash.
+            pass
+
+        tags = []
+        if extract_ytdlp_tags:
+            try:
+                tags = extract_ytdlp_tags(info)
+            except Exception as e:
+                log(f"Error extracting tags: {e}", file=sys.stderr)
+
+        # Only use the filename as title when no title: tag was extracted
+        if not any(t.startswith('title:') for t in tags):
+            info['title'] = filename
+            tags = []
+            if extract_ytdlp_tags:
+                try:
+                    tags = extract_ytdlp_tags(info)
+                except Exception as e:
+                    log(f"Error extracting tags with filename: {e}", file=sys.stderr)
+
+        if debug_logger is not None:
+            debug_logger.write_record(
+                "direct-file-downloaded",
+                {"url": url, "path": str(file_path), "hash": hash_value},
+            )
+
+        return DownloadMediaResult(
+            path=file_path,
+            info=info,
+            tags=tags,
+            source_url=url,
+            hash_value=hash_value,
+        )
+
+    except (httpx.HTTPError, httpx.RequestError) as exc:
+        log(f"Download error: {exc}", file=sys.stderr)
+        if debug_logger is not None:
+            debug_logger.write_record(
+                "exception",
+                {"phase": "direct-file", "url": url, "error": str(exc)},
+            )
+        raise DownloadError(f"Failed to download {url}: {exc}") from exc
+    except Exception as exc:
+        log(f"Error downloading file: {exc}", file=sys.stderr)
+        if debug_logger is not None:
+            debug_logger.write_record(
+                "exception",
+                {
+                    "phase": "direct-file",
+                    "url": url,
+                    "error": str(exc),
+                    "traceback": traceback.format_exc(),
+                },
+            )
+        raise DownloadError(f"Error downloading file: {exc}") from exc
+
+
+def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
+    """Probe URL to extract metadata WITHOUT downloading.
+
+    The probe runs in a worker thread so the caller waits at most
+    ``timeout_seconds``.
+
+    Args:
+        url: URL to probe
+        no_playlist: If True, ignore playlists and probe only the single video
+        timeout_seconds: Max seconds to wait for probe (default 15s)
+
+    Returns:
+        Dict with keys: extractor, title, entries, duration, uploader,
+        description, url.
+        Returns None if not supported by yt-dlp, on probe error or on timeout.
+    """
+    if not is_url_supported_by_ytdlp(url):
+        return None
+
+    import threading
+    from typing import cast
+
+    result_container: List[Optional[Any]] = [None, None]  # [result, error]
+
+    def _do_probe() -> None:
+        try:
+            _ensure_yt_dlp_ready()
+
+            assert yt_dlp is not None
+            ydl_opts = {
+                "quiet": True,  # Suppress all output
+                "no_warnings": True,
+                "socket_timeout": 10,
+                "retries": 2,  # Reduce retries for faster timeout
+                "skip_download": True,  # Don't actually download
+                "extract_flat": "in_playlist",  # Get playlist with metadata for each entry
+                "noprogress": True,  # No progress bars
+            }
+
+            # Add cookies if available (lazy import to avoid circular dependency)
+            from hydrus_health_check import get_cookies_file_path  # local import
+
+            global_cookies = get_cookies_file_path()
+            if global_cookies:
+                ydl_opts["cookiefile"] = global_cookies
+
+            if no_playlist:
+                ydl_opts["noplaylist"] = True
+
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:  # type: ignore[arg-type]
+                info = ydl.extract_info(url, download=False)
+
+            if not isinstance(info, dict):
+                result_container[0] = None
+                return
+
+            # Extract relevant fields
+            result_container[0] = {
+                "extractor": info.get("extractor", ""),
+                "title": info.get("title", ""),
+                "entries": info.get("entries", []),  # Will be populated if playlist
+                "duration": info.get("duration"),
+                "uploader": info.get("uploader"),
+                "description": info.get("description"),
+                "url": url,
+            }
+        except Exception as exc:
+            log(f"Probe error for {url}: {exc}")
+            result_container[1] = exc
+
+    # Daemon thread: a probe that outlives the timeout must not keep the
+    # interpreter alive at exit.
+    thread = threading.Thread(target=_do_probe, daemon=True)
+    thread.start()
+    thread.join(timeout=timeout_seconds)
+
+    if thread.is_alive():
+        # Probe timed out - return None to fall back to direct download
+        debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
+        return None
+
+    if result_container[1] is not None:
+        # Probe error - return None to proceed anyway
+        return None
+
+    return cast(Optional[Dict[str, Any]], result_container[0])
+
+
+# Names exported by a star-import of this module: the three public helpers
+# defined above plus the model types imported from `models`.
+__all__ = [
+    "is_url_supported_by_ytdlp",
+    "list_formats",
+    "probe_url",
+    "DownloadError",
+    "DownloadOptions",
+    "DownloadMediaResult",
+]
+