commit c019c00aed (parent 226de9316a)
2025-12-29 17:05:03 -08:00
104 changed files with 19669 additions and 12954 deletions


@@ -6,6 +6,7 @@ Lean, focused downloader without event infrastructure overhead.
- Tag extraction via metadata.extract_ytdlp_tags()
- Logging via helper.logger.log()
"""
+from __future__ import annotations
import glob # noqa: F401
@@ -97,7 +98,9 @@ def is_url_supported_by_ytdlp(url: str) -> bool:
return False
-def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
+def list_formats(
+    url: str, no_playlist: bool = False, playlist_items: Optional[str] = None
+) -> Optional[List[Dict[str, Any]]]:
"""Get list of available formats for a URL using yt-dlp."""
_ensure_yt_dlp_ready()
@@ -125,19 +128,21 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
result_formats = []
for fmt in formats:
-        result_formats.append({
-            "format_id": fmt.get("format_id", ""),
-            "format": fmt.get("format", ""),
-            "ext": fmt.get("ext", ""),
-            "resolution": fmt.get("resolution", ""),
-            "width": fmt.get("width"),
-            "height": fmt.get("height"),
-            "fps": fmt.get("fps"),
-            "vcodec": fmt.get("vcodec", "none"),
-            "acodec": fmt.get("acodec", "none"),
-            "filesize": fmt.get("filesize"),
-            "tbr": fmt.get("tbr"),
-        })
+        result_formats.append(
+            {
+                "format_id": fmt.get("format_id", ""),
+                "format": fmt.get("format", ""),
+                "ext": fmt.get("ext", ""),
+                "resolution": fmt.get("resolution", ""),
+                "width": fmt.get("width"),
+                "height": fmt.get("height"),
+                "fps": fmt.get("fps"),
+                "vcodec": fmt.get("vcodec", "none"),
+                "acodec": fmt.get("acodec", "none"),
+                "filesize": fmt.get("filesize"),
+                "tbr": fmt.get("tbr"),
+            }
+        )
debug(f"Found {len(result_formats)} available formats")
return result_formats
@@ -147,48 +152,50 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
return None
-def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]:
+def _download_with_sections_via_cli(
+    url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False
+) -> tuple[Optional[str], Dict[str, Any]]:
"""Download each section separately so merge-file can combine them.
yt-dlp with multiple --download-sections args merges them into one file.
We need separate files for merge-file, so download each section individually.
Uses hash-based filenames for sections (not title-based) to prevent yt-dlp from
thinking sections are already downloaded. The title is extracted and stored in tags.
Returns:
(session_id, first_section_info_dict) - session_id for finding files, info dict for metadata extraction
"""
sections_list = ytdl_options.get("download_sections", [])
if not sections_list:
return "", {}
# Generate a unique hash-based ID for this download session
# This ensures different videos/downloads don't have filename collisions
session_id = hashlib.md5(
-        (url + str(time.time()) + ''.join(random.choices(string.ascii_letters, k=10))).encode()
+        (url + str(time.time()) + "".join(random.choices(string.ascii_letters, k=10))).encode()
).hexdigest()[:12]
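    # Illustrative (hypothetical values): a session_id of "3f9a1c2b7d4e" yields
    # per-section temp files like "3f9a1c2b7d4e_1.mp4" once yt-dlp fills %(ext)s.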
first_section_info = None
title_from_first = None
# Download each section separately with unique output template using session ID
for section_idx, section in enumerate(sections_list, 1):
# Build unique output template for this section using session-based filename
# e.g., "{session_id}_{section_idx}.ext" - simple and unique per section
base_outtmpl = ytdl_options.get("outtmpl", "%(title)s.%(ext)s")
output_dir_path = Path(base_outtmpl).parent
# Use session_id + section index for temp filename
# e.g., "/path/{session_id}_1.%(ext)s"
filename_tmpl = f"{session_id}_{section_idx}"
if base_outtmpl.endswith(".%(ext)s"):
filename_tmpl += ".%(ext)s"
# Use Path to handle separators correctly for the OS
section_outtmpl = str(output_dir_path / filename_tmpl)
# For the first section, extract metadata first (separate call)
if section_idx == 1:
metadata_cmd = ["yt-dlp", "--dump-json", "--skip-download"]
@@ -198,14 +205,14 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
if ytdl_options.get("noplaylist"):
metadata_cmd.append("--no-playlist")
metadata_cmd.append(url)
try:
meta_result = subprocess.run(metadata_cmd, capture_output=True, text=True)
if meta_result.returncode == 0 and meta_result.stdout:
try:
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
-                        title_from_first = info_dict.get('title')
+                        title_from_first = info_dict.get("title")
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
@@ -214,50 +221,54 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
except Exception as e:
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
# Add format
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
# Add ONLY this section (not all sections)
cmd.extend(["--download-sections", section])
# Add force-keyframes-at-cuts if specified
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.append("--force-keyframes-at-cuts")
# Add output template for this section
cmd.extend(["-o", section_outtmpl])
# Add cookies file if present
if ytdl_options.get("cookiefile"):
# Convert backslashes to forward slashes for better compatibility
cookies_path = ytdl_options["cookiefile"].replace("\\", "/")
cmd.extend(["--cookies", cookies_path])
# Add no-playlist if specified
if ytdl_options.get("noplaylist"):
cmd.append("--no-playlist")
# Add the URL
cmd.append(url)
if not quiet:
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
result = subprocess.run(cmd)
if result.returncode != 0:
raise DownloadError(f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}")
raise DownloadError(
f"yt-dlp subprocess failed for section {section_idx} with code {result.returncode}"
)
except Exception as exc:
raise DownloadError(f"yt-dlp subprocess error for section {section_idx}: {exc}") from exc
raise DownloadError(
f"yt-dlp subprocess error for section {section_idx}: {exc}"
) from exc
return session_id, first_section_info or {}
@@ -299,33 +310,39 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
else: # video
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = [
"res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"
"res:4320",
"res:2880",
"res:2160",
"res:1440",
"res:1080",
"res:720",
"res",
]
# Add clip sections if provided (yt-dlp will download only these sections)
if opts.clip_sections:
        # Parse section ranges like "48-65,120-152,196-205" (seconds)
# and convert to yt-dlp format: "*HH:MM:SS-HH:MM:SS,*HH:MM:SS-HH:MM:SS"
sections = []
-        for section_range in opts.clip_sections.split(','):
+        for section_range in opts.clip_sections.split(","):
try:
-                start_str, end_str = section_range.strip().split('-')
+                start_str, end_str = section_range.strip().split("-")
start_sec = float(start_str)
end_sec = float(end_str)
# Convert seconds to HH:MM:SS format
def sec_to_hhmmss(seconds):
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
start_time = sec_to_hhmmss(start_sec)
end_time = sec_to_hhmmss(end_sec)
sections.append(f"*{start_time}-{end_time}")
except (ValueError, AttributeError):
pass
if sections:
# Pass each section as a separate element in the list (yt-dlp expects multiple --download-sections args)
base_options["download_sections"] = sections
@@ -412,41 +429,41 @@ def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
    LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to the
    actual mirror URL. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
Returns:
Actual download URL or None if extraction fails
"""
try:
import requests
from urllib.parse import urlparse
# Check if this is a LibGen URL
parsed = urlparse(libgen_url)
-        if 'libgen' not in parsed.netloc.lower():
+        if "libgen" not in parsed.netloc.lower():
return None
-        if '/file.php' not in parsed.path.lower():
+        if "/file.php" not in parsed.path.lower():
return None
# LibGen redirects to actual mirrors, follow redirects to get final URL
session = requests.Session()
-        session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        })
+        session.headers.update(
+            {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
+        )
debug(f"Following LibGen redirect chain for: {libgen_url}")
# First, get the page and look for direct download link
try:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
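            # With allow_redirects=True, response.url is the final URL in the
            # redirect chain (a mirror page or the file itself).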
# Try to find actual download link in the page
try:
try:
@@ -462,8 +479,12 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
if "get.php" in href_lower or href_lower.endswith(
(".pdf", ".epub", ".djvu", ".mobi")
):
download_url = (
href if href.startswith("http") else urljoin(final_url, href)
)
debug(f"Found download link: {download_url}")
return download_url
else:
@@ -477,19 +498,23 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
if not href or href.lower().startswith("javascript:"):
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
if "get.php" in href_lower or href_lower.endswith(
(".pdf", ".epub", ".djvu", ".mobi")
):
download_url = (
href if href.startswith("http") else urljoin(final_url, href)
)
debug(f"Found download link: {download_url}")
return download_url
except Exception:
pass
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
return final_url
except requests.RequestException as e:
log(f"Error following LibGen redirects: {e}", file=sys.stderr)
        # Try a HEAD request as fallback
@@ -500,9 +525,9 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
return response.url
except:
pass
return None
except Exception as e:
log(f"Error resolving LibGen URL: {e}", file=sys.stderr)
return None
@@ -521,7 +546,7 @@ def _download_direct_file(
from urllib.parse import unquote, urlparse, parse_qs
import re
def _sanitize_filename(name: str) -> str:
# Windows-safe filename sanitization.
# Keep it simple: strip path parts, drop invalid chars, collapse whitespace.
@@ -563,27 +588,27 @@ def _download_direct_file(
# Extract filename from URL
parsed_url = urlparse(url)
url_path = parsed_url.path
# Try to get filename from query parameters first (for LibGen and similar services)
# e.g., ?filename=Book+Title.pdf or &download=filename.pdf
filename = None
if parsed_url.query:
query_params = parse_qs(parsed_url.query)
-        for param_name in ('filename', 'download', 'file', 'name'):
+        for param_name in ("filename", "download", "file", "name"):
if param_name in query_params and query_params[param_name]:
filename = query_params[param_name][0]
filename = unquote(filename)
break
# If not found in query params, extract from URL path
if not filename or not filename.strip():
filename = url_path.split("/")[-1] if url_path else ""
filename = unquote(filename)
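        # e.g. a path of "/files/Some%20Book.pdf" yields "Some Book.pdf" (illustrative)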
# Remove query strings from filename if any
if "?" in filename:
filename = filename.split("?")[0]
# Try to get real filename from Content-Disposition header (HEAD request)
content_type = ""
try:
@@ -627,7 +652,12 @@ def _download_direct_file(
with HTTPClient(timeout=10.0) as client:
with client._request_stream("GET", url, follow_redirects=True) as resp:
resp.raise_for_status()
-                ct = str(resp.headers.get("content-type", "") or "").split(";", 1)[0].strip().lower()
+                ct = (
+                    str(resp.headers.get("content-type", "") or "")
+                    .split(";", 1)[0]
+                    .strip()
+                    .lower()
+                )
if ct.startswith("text/html"):
raise DownloadError("URL appears to be an HTML page, not a direct file")
except DownloadError:
@@ -635,7 +665,7 @@ def _download_direct_file(
except Exception:
# If we can't probe, keep going; later logic may still infer a safe extension.
pass
# Apply suggested filename (from provider title) if given.
suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
if suggested:
@@ -683,7 +713,9 @@ def _download_direct_file(
# Final guardrail: if filename is empty, refuse rather than inventing `download.bin`.
if not filename or not str(filename).strip():
raise DownloadError("Could not determine filename for URL (no Content-Disposition and no path filename)")
raise DownloadError(
"Could not determine filename for URL (no Content-Disposition and no path filename)"
)
file_path = _unique_path(output_dir / filename)
@@ -719,12 +751,18 @@ def _download_direct_file(
if transfer_started[0]:
return
try:
-            total_val: Optional[int] = int(content_length) if isinstance(content_length, int) and content_length > 0 else None
+            total_val: Optional[int] = (
+                int(content_length)
+                if isinstance(content_length, int) and content_length > 0
+                else None
+            )
except Exception:
total_val = None
try:
if hasattr(pipeline_progress, "begin_transfer"):
-                pipeline_progress.begin_transfer(label=str(filename or "download"), total=total_val)
+                pipeline_progress.begin_transfer(
+                    label=str(filename or "download"), total=total_val
+                )
transfer_started[0] = True
except Exception:
return
@@ -737,7 +775,11 @@ def _download_direct_file(
try:
if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
_maybe_begin_transfer(content_length)
-            total_val: Optional[int] = int(content_length) if isinstance(content_length, int) and content_length > 0 else None
+            total_val: Optional[int] = (
+                int(content_length)
+                if isinstance(content_length, int) and content_length > 0
+                else None
+            )
pipeline_progress.update_transfer(
label=str(filename or "download"),
completed=int(bytes_downloaded) if bytes_downloaded is not None else None,
@@ -790,14 +832,21 @@ def _download_direct_file(
pass
try:
-        if pipeline_progress is not None and transfer_started[0] and hasattr(pipeline_progress, "finish_transfer"):
+        if (
+            pipeline_progress is not None
+            and transfer_started[0]
+            and hasattr(pipeline_progress, "finish_transfer")
+        ):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
try:
if progress_bar is not None:
-            avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
+            avg_speed_str = (
+                progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0)
+                + "/s"
+            )
else:
avg_speed_str = f"{(downloaded_bytes[0] / elapsed if elapsed > 0 else 0):.1f} B/s"
except Exception:
@@ -835,9 +884,9 @@ def _download_direct_file(
# Only use filename as a title tag if we couldn't extract any meaningful tags
# This prevents duplicate title: tags when the filename could be mistaken for metadata
-    if not any(t.startswith('title:') for t in tags):
+    if not any(t.startswith("title:") for t in tags):
# Re-extract tags with filename as title only if needed
-        info['title'] = filename
+        info["title"] = filename
tags = []
if extract_ytdlp_tags:
try:
@@ -866,7 +915,11 @@ def _download_direct_file(
except Exception:
pass
try:
-        if pipeline_progress is not None and transfer_started[0] and hasattr(pipeline_progress, "finish_transfer"):
+        if (
+            pipeline_progress is not None
+            and transfer_started[0]
+            and hasattr(pipeline_progress, "finish_transfer")
+        ):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
@@ -884,7 +937,11 @@ def _download_direct_file(
except Exception:
pass
try:
-        if pipeline_progress is not None and transfer_started[0] and hasattr(pipeline_progress, "finish_transfer"):
+        if (
+            pipeline_progress is not None
+            and transfer_started[0]
+            and hasattr(pipeline_progress, "finish_transfer")
+        ):
pipeline_progress.finish_transfer(label=str(filename or "download"))
except Exception:
pass
@@ -902,31 +959,33 @@ def _download_direct_file(
raise DownloadError(f"Error downloading file: {exc}") from exc
-def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
+def probe_url(
+    url: str, no_playlist: bool = False, timeout_seconds: int = 15
+) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
result_container: List[Optional[Any]] = [None, None] # [result, error]
def _do_probe() -> None:
try:
_ensure_yt_dlp_ready()
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
@@ -939,20 +998,20 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed.
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
@@ -966,20 +1025,20 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
thread = threading.Thread(target=_do_probe, daemon=False)
thread.start()
thread.join(timeout=timeout_seconds)
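    # Note: join() with a timeout does not stop the worker; if the probe hangs,
    # the non-daemon thread keeps running until extract_info returns.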
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
@@ -991,4 +1050,3 @@ __all__ = [
"DownloadOptions",
"DownloadMediaResult",
]