This commit is contained in:
nose
2025-12-11 12:47:30 -08:00
parent 6b05dc5552
commit 65d12411a2
92 changed files with 17447 additions and 14308 deletions


@@ -28,7 +28,6 @@ from helper.logger import log, debug
from .utils import ensure_directory, sha256_file
from .http_client import HTTPClient
from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLogger, ProgressBar
from hydrus_health_check import get_cookies_file_path
try:
import yt_dlp # type: ignore
@@ -145,7 +144,7 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
return None
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str]) -> tuple[Optional[str], Dict[str, Any]]:
def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sections: List[str], quiet: bool = False) -> tuple[Optional[str], Dict[str, Any]]:
"""Download each section separately so merge-file can combine them.
yt-dlp with multiple --download-sections args merges them into one file.
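The docstring above describes the workaround: rather than passing several --download-sections flags to one yt-dlp invocation (which merges all clips into a single output file), the helper shells out once per section. A minimal sketch of that idea, with a hypothetical per-section output template and none of the format/cookie handling the real command builder adds:

```python
import subprocess
from typing import List

def download_sections_one_by_one(url: str, sections: List[str], out_dir: str) -> None:
    """Run yt-dlp once per section so every clip lands in its own file."""
    for idx, section in enumerate(sections, start=1):
        cmd = [
            "yt-dlp",
            "--download-sections", section,            # e.g. "*00:01:10-00:01:35"
            "--force-keyframes-at-cuts",               # clean cuts at section boundaries
            "-o", f"{out_dir}/section_{idx}.%(ext)s",  # hypothetical per-section template
            url,
        ]
        subprocess.run(cmd, check=True)
```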
@@ -204,11 +203,14 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
info_dict = json.loads(meta_result.stdout.strip())
first_section_info = info_dict
title_from_first = info_dict.get('title')
debug(f"Extracted title from metadata: {title_from_first}")
if not quiet:
debug(f"Extracted title from metadata: {title_from_first}")
except json.JSONDecodeError:
debug("Could not parse JSON metadata")
if not quiet:
debug("Could not parse JSON metadata")
except Exception as e:
debug(f"Error extracting metadata: {e}")
if not quiet:
debug(f"Error extracting metadata: {e}")
# Build yt-dlp command for downloading this section
cmd = ["yt-dlp"]
@@ -240,8 +242,9 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
# Add the URL
cmd.append(url)
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
if not quiet:
debug(f"Running yt-dlp for section {section_idx}/{len(sections_list)}: {section}")
debug(f"Command: {' '.join(cmd)}")
# Run the subprocess - don't capture output so progress is shown
try:
@@ -273,13 +276,15 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
"fragment_retries": 10,
"http_chunk_size": 10_485_760,
"restrictfilenames": True,
"progress_hooks": [_progress_callback],
"progress_hooks": [] if opts.quiet else [_progress_callback],
}
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
# Check global cookies file
# Check global cookies file lazily to avoid import cycles
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
@@ -287,7 +292,7 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
# Fallback to browser cookies
base_options["cookiesfrombrowser"] = ("chrome",)
# Add no-playlist option if specified (for single video from playlist URLs)
# Add no-playlist option if specified (for a single video from a playlist url)
if opts.no_playlist:
base_options["noplaylist"] = True
@@ -336,7 +341,8 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
if opts.playlist_items:
base_options["playlist_items"] = opts.playlist_items
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
if not opts.quiet:
debug(f"yt-dlp: mode={opts.mode}, format={base_options.get('format')}")
return base_options
@@ -411,8 +417,8 @@ def _extract_sha256(info: Dict[str, Any]) -> Optional[str]:
def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
"""Extract the actual download link from LibGen redirect URL.
LibGen URLs like https://libgen.gl/file.php?id=123456 redirect to
actual mirror URLs. This follows the redirect chain to get the real file.
LibGen urls like https://libgen.gl/file.php?id=123456 redirect to
actual mirror urls. This follows the redirect chain to get the real file.
Args:
libgen_url: LibGen file.php URL
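As a rough illustration of the redirect-following the docstring describes, assuming the mirror is reachable through ordinary HTTP redirects (the real helper in this module may use the project's HTTPClient wrapper and extra mirror-page parsing):

```python
import requests

def resolve_redirect_target(url: str, timeout: float = 30.0) -> str:
    """Follow redirects and return the final URL without downloading the body.

    Illustrative sketch only; not the module's _get_libgen_download_url.
    """
    resp = requests.get(url, allow_redirects=True, stream=True, timeout=timeout)
    try:
        return str(resp.url)  # URL after the redirect chain has been followed
    finally:
        resp.close()  # body was never consumed thanks to stream=True
```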
@@ -491,6 +497,7 @@ def _download_direct_file(
url: str,
output_dir: Path,
debug_logger: Optional[DebugLogger] = None,
quiet: bool = False,
) -> DownloadMediaResult:
"""Download a direct file (PDF, image, document, etc.) without yt-dlp."""
ensure_directory(output_dir)
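The new quiet parameter threads through the direct-download path in the hunks below; a hypothetical call site, assuming only the signature shown here:

```python
from pathlib import Path

# Hypothetical usage: fetch a PDF without yt-dlp, suppressing progress output.
result = _download_direct_file(
    "https://example.org/paper.pdf",   # placeholder URL
    Path("./downloads"),
    quiet=True,
)
print(result.path, result.hash_value)
```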
@@ -535,9 +542,11 @@ def _download_direct_file(
extracted_name = match.group(1) or match.group(2)
if extracted_name:
filename = unquote(extracted_name)
debug(f"Filename from Content-Disposition: {filename}")
if not quiet:
debug(f"Filename from Content-Disposition: {filename}")
except Exception as e:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
if not quiet:
log(f"Could not get filename from headers: {e}", file=sys.stderr)
# Fallback if we still don't have a good filename
if not filename or "." not in filename:
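The header parsing above follows the usual Content-Disposition conventions: a plain filename= parameter or an RFC 5987 filename*= parameter with percent-encoding. A self-contained sketch of the same idea (the exact regex and validation in this function may differ):

```python
import re
from typing import Optional
from urllib.parse import unquote

def filename_from_content_disposition(header: str) -> Optional[str]:
    """Extract a filename from a Content-Disposition header value.

    Handles both  filename="report.pdf"  and  filename*=UTF-8''report%20v2.pdf.
    """
    match = re.search(
        r"filename\*=(?:UTF-8'')?([^;]+)|filename=\"?([^\";]+)\"?",
        header,
        re.IGNORECASE,
    )
    if not match:
        return None
    raw = match.group(1) or match.group(2)
    return unquote(raw.strip()) if raw else None
```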
@@ -546,7 +555,8 @@ def _download_direct_file(
file_path = output_dir / filename
progress_bar = ProgressBar()
debug(f"Direct download: {filename}")
if not quiet:
debug(f"Direct download: {filename}")
try:
start_time = time.time()
@@ -577,7 +587,8 @@ def _download_direct_file(
speed_str=speed_str,
eta_str=eta_str,
)
debug(progress_line)
if not quiet:
debug(progress_line)
last_progress_time[0] = now
with HTTPClient(timeout=30.0) as client:
@@ -585,7 +596,8 @@ def _download_direct_file(
elapsed = time.time() - start_time
avg_speed_str = progress_bar.format_bytes(downloaded_bytes[0] / elapsed if elapsed > 0 else 0) + "/s"
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
if not quiet:
debug(f"✓ Downloaded in {elapsed:.1f}s at {avg_speed_str}")
# For direct file downloads, build a minimal info dict that does not use the filename as the title
# This prevents duplicate title: tags when a filename is auto-generated later
@@ -658,375 +670,98 @@ def _download_direct_file(
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False) -> Optional[Dict[str, Any]]:
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
url: URL to probe
no_playlist: If True, ignore playlists and probe only the single video
timeout_seconds: Max seconds to wait for probe (default 15s)
Returns:
Dict with keys: extractor, title, entries (if playlist), duration, etc.
Returns None if not supported by yt-dlp.
Returns None if not supported by yt-dlp or on timeout.
"""
if not is_url_supported_by_ytdlp(url):
return None
_ensure_yt_dlp_ready()
# Wrap probe in timeout to prevent hanging on large playlists
import threading
from typing import cast
assert yt_dlp is not None
try:
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 3,
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
# Add cookies if available
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
return None
# Extract relevant fields
return {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe failed for {url}: {exc}")
return None
def download_media(
opts: DownloadOptions,
*,
debug_logger: Optional[DebugLogger] = None,
) -> DownloadMediaResult:
"""Download media from URL using yt-dlp or direct HTTP download.
result_container: List[Optional[Any]] = [None, None] # [result, error]
Args:
opts: DownloadOptions with url, mode, output_dir, etc.
debug_logger: Optional debug logger for troubleshooting
Returns:
DownloadMediaResult with path, info, tags, hash
Raises:
DownloadError: If download fails
"""
# Handle LibGen URLs specially
# file.php redirects to mirrors, get.php is direct from modern API
if 'libgen' in opts.url.lower():
if '/get.php' in opts.url.lower():
# Modern API get.php links are direct downloads from mirrors (not file redirects)
log(f"Detected LibGen get.php URL, downloading directly...")
if debug_logger is not None:
debug_logger.write_record("libgen-direct", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
elif '/file.php' in opts.url.lower():
# Old-style file.php redirects to mirrors, we need to resolve
log(f"Detected LibGen file.php URL, resolving to actual mirror...")
actual_url = _get_libgen_download_url(opts.url)
if actual_url and actual_url != opts.url:
log(f"Resolved LibGen URL to mirror: {actual_url}")
opts.url = actual_url
# After resolution, this will typically be an onion link or direct file
# Skip yt-dlp for this (it won't support onion/mirrors), go direct
if debug_logger is not None:
debug_logger.write_record("libgen-resolved", {"original": opts.url, "resolved": actual_url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
else:
log(f"Could not resolve LibGen URL, trying direct download anyway", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record("libgen-resolve-failed", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
# Handle GoFile shares with a dedicated resolver before yt-dlp/direct fallbacks
try:
netloc = urlparse(opts.url).netloc.lower()
except Exception:
netloc = ""
if "gofile.io" in netloc:
msg = "GoFile links are currently unsupported"
debug(msg)
if debug_logger is not None:
debug_logger.write_record("gofile-unsupported", {"url": opts.url})
raise DownloadError(msg)
# Determine if yt-dlp should be used
ytdlp_supported = is_url_supported_by_ytdlp(opts.url)
if ytdlp_supported:
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist)
if probe_result is None:
log(f"URL supported by yt-dlp but no media detected, falling back to direct download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-skip-no-media", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
else:
log(f"URL not supported by yt-dlp, trying direct download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("direct-file-attempt", {"url": opts.url})
return _download_direct_file(opts.url, opts.output_dir, debug_logger)
_ensure_yt_dlp_ready()
ytdl_options = _build_ytdlp_options(opts)
debug(f"Starting yt-dlp download: {opts.url}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-start", {"url": opts.url})
assert yt_dlp is not None
try:
# Debug: show what options we're using
if ytdl_options.get("download_sections"):
debug(f"[yt-dlp] download_sections: {ytdl_options['download_sections']}")
debug(f"[yt-dlp] force_keyframes_at_cuts: {ytdl_options.get('force_keyframes_at_cuts', False)}")
# Use subprocess when download_sections are present (Python API doesn't support them properly)
session_id = None
first_section_info = {}
if ytdl_options.get("download_sections"):
session_id, first_section_info = _download_with_sections_via_cli(opts.url, ytdl_options, ytdl_options.get("download_sections", []))
info = None
else:
with yt_dlp.YoutubeDL(ytdl_options) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(opts.url, download=True)
except Exception as exc:
log(f"yt-dlp failed: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{
"phase": "yt-dlp",
"error": str(exc),
"traceback": traceback.format_exc(),
},
)
raise DownloadError("yt-dlp download failed") from exc
# If we used subprocess, we need to find the file manually
if info is None:
# Find files created/modified during this download (after we started)
# Look for files matching the expected output template pattern
def _do_probe() -> None:
try:
import glob
import time
import re
_ensure_yt_dlp_ready()
# Get the expected filename pattern from outtmpl
# For sections: "C:\path\{session_id}.section_1_of_3.ext", etc.
# For non-sections: "C:\path\title.ext"
# Wait a moment to ensure files are fully written
time.sleep(0.5)
# List all files in output_dir, sorted by modification time
files = sorted(opts.output_dir.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
if not files:
raise FileNotFoundError(f"No files found in {opts.output_dir}")
# If we downloaded sections, look for files with the session_id pattern
if opts.clip_sections and session_id:
# Pattern: "{session_id}_1.ext", "{session_id}_2.ext", etc.
section_pattern = re.compile(rf'^{re.escape(session_id)}_(\d+)\.')
matching_files = [f for f in files if section_pattern.search(f.name)]
if matching_files:
# Sort by section number to ensure correct order
def extract_section_num(path: Path) -> int:
match = section_pattern.search(path.name)
return int(match.group(1)) if match else 999
matching_files.sort(key=extract_section_num)
debug(f"Found {len(matching_files)} section file(s) matching pattern")
# Now rename section files to use hash-based names
# This ensures a unique filename for each section's content
renamed_files = []
for idx, section_file in enumerate(matching_files, 1):
try:
# Calculate hash for the file
file_hash = sha256_file(section_file)
ext = section_file.suffix
new_name = f"{file_hash}{ext}"
new_path = opts.output_dir / new_name
if new_path.exists() and new_path != section_file:
# If file with same hash exists, use it and delete the temp one
debug(f"File with hash {file_hash} already exists, using existing file.")
try:
section_file.unlink()
except OSError:
pass
renamed_files.append(new_path)
else:
section_file.rename(new_path)
debug(f"Renamed section file: {section_file.name}{new_name}")
renamed_files.append(new_path)
except Exception as e:
debug(f"Failed to process section file {section_file.name}: {e}")
renamed_files.append(section_file)
media_path = renamed_files[0]
media_paths = renamed_files
debug(f"✓ Downloaded {len(media_paths)} section file(s) (session: {session_id})")
else:
# Fallback to most recent file if pattern not found
media_path = files[0]
media_paths = None
debug(f"✓ Downloaded section file (pattern not found): {media_path.name}")
else:
# No sections, just take the most recent file
media_path = files[0]
media_paths = None
debug(f"✓ Downloaded: {media_path.name}")
if debug_logger is not None:
debug_logger.write_record("ytdlp-file-found", {"path": str(media_path)})
except Exception as exc:
log(f"Error finding downloaded file: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "find-file", "error": str(exc)},
)
raise DownloadError(str(exc)) from exc
# Create result with minimal data extracted from filename
file_hash = sha256_file(media_path)
# For section downloads, create tags with the title and build proper info dict
tags = []
title = ''
if first_section_info:
title = first_section_info.get('title', '')
if title:
tags.append(f'title:{title}')
debug(f"Added title tag for section download: {title}")
# Build info dict - always use extracted title if available, not hash
if first_section_info:
info_dict = first_section_info
else:
info_dict = {
"id": media_path.stem,
"title": title or media_path.stem,
"ext": media_path.suffix.lstrip(".")
assert yt_dlp is not None
# Extract info without downloading
# Use extract_flat='in_playlist' to get full metadata for playlist items
ydl_opts = {
"quiet": True, # Suppress all output
"no_warnings": True,
"socket_timeout": 10,
"retries": 2, # Reduce retries for faster timeout
"skip_download": True, # Don't actually download
"extract_flat": "in_playlist", # Get playlist with metadata for each entry
"noprogress": True, # No progress bars
}
return DownloadMediaResult(
path=media_path,
info=info_dict,
tags=tags,
source_url=opts.url,
hash_value=file_hash,
paths=media_paths, # Include all section files if present
)
# Add cookies if available (lazy import to avoid circular dependency)
from hydrus_health_check import get_cookies_file_path # local import
if not isinstance(info, dict):
log(f"Unexpected yt-dlp response: {type(info)}", file=sys.stderr)
raise DownloadError("Unexpected yt-dlp response type")
info_dict: Dict[str, Any] = info
if debug_logger is not None:
debug_logger.write_record(
"ytdlp-info",
{
"keys": sorted(info_dict.keys()),
"is_playlist": bool(info_dict.get("entries")),
},
)
try:
entry, media_path = _resolve_entry_and_path(info_dict, opts.output_dir)
except FileNotFoundError as exc:
log(f"Error: {exc}", file=sys.stderr)
if debug_logger is not None:
debug_logger.write_record(
"exception",
{"phase": "resolve-path", "error": str(exc)},
)
raise DownloadError(str(exc)) from exc
if debug_logger is not None:
debug_logger.write_record(
"resolved-media",
{"path": str(media_path), "entry_keys": sorted(entry.keys())},
)
# Extract hash from metadata or compute
hash_value = _extract_sha256(entry) or _extract_sha256(info_dict)
if not hash_value:
try:
hash_value = sha256_file(media_path)
except OSError as exc:
if debug_logger is not None:
debug_logger.write_record(
"hash-error",
{"path": str(media_path), "error": str(exc)},
)
# Extract tags using metadata.py
tags = []
if extract_ytdlp_tags:
try:
tags = extract_ytdlp_tags(entry)
except Exception as e:
log(f"Error extracting tags: {e}", file=sys.stderr)
source_url = (
entry.get("webpage_url")
or entry.get("original_url")
or entry.get("url")
)
debug(f"✓ Downloaded: {media_path.name} ({len(tags)} tags)")
if debug_logger is not None:
debug_logger.write_record(
"downloaded",
{
"path": str(media_path),
"tag_count": len(tags),
"source_url": source_url,
"sha256": hash_value,
},
)
return DownloadMediaResult(
path=media_path,
info=entry,
tags=tags,
source_url=source_url,
hash_value=hash_value,
)
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified
if no_playlist:
ydl_opts["noplaylist"] = True
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
if not isinstance(info, dict):
result_container[0] = None
return
# Extract relevant fields
result_container[0] = {
"extractor": info.get("extractor", ""),
"title": info.get("title", ""),
"entries": info.get("entries", []), # Will be populated if playlist
"duration": info.get("duration"),
"uploader": info.get("uploader"),
"description": info.get("description"),
"url": url,
}
except Exception as exc:
log(f"Probe error for {url}: {exc}")
result_container[1] = exc
thread = threading.Thread(target=_do_probe, daemon=False)
thread.start()
thread.join(timeout=timeout_seconds)
if thread.is_alive():
# Probe timed out - return None to fall back to direct download
debug(f"Probe timeout for {url} (>={timeout_seconds}s), proceeding with download")
return None
if result_container[1] is not None:
# Probe error - return None to proceed anyway
return None
return cast(Optional[Dict[str, Any]], result_container[0])
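Stripped of the yt-dlp specifics, the timeout wrapper added above is the standard run-in-a-worker-thread pattern: start the blocking call in a thread, join with a timeout, and treat an overrun as a soft failure. A generic sketch (names are illustrative; this version marks the worker as a daemon, whereas the probe above keeps a non-daemon thread):

```python
import threading
from typing import Any, Callable, List, Optional

def run_with_timeout(fn: Callable[[], Any], timeout_seconds: float) -> Optional[Any]:
    """Run fn() in a worker thread; return its result, or None on timeout.

    The worker is not killed when the timeout fires -- it keeps running in the
    background -- so this only suits calls whose work is safe to abandon.
    """
    box: List[Any] = [None]

    def worker() -> None:
        box[0] = fn()

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()
    thread.join(timeout=timeout_seconds)
    if thread.is_alive():
        return None  # timed out; caller falls back to its default behaviour
    return box[0]
```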
__all__ = [
"download_media",
"is_url_supported_by_ytdlp",
"list_formats",
"probe_url",
"DownloadError",
"DownloadOptions",
"DownloadMediaResult",
]