This commit is contained in:
2026-01-05 07:51:19 -08:00
parent 8545367e28
commit 1f765cffda
32 changed files with 3447 additions and 3250 deletions

View File

@@ -10,10 +10,24 @@ Provides synchronous and asynchronous HTTP operations with:
import httpx
import asyncio
from typing import Optional, Dict, Any, Callable, BinaryIO
import sys
import time
import traceback
import re
from typing import Optional, Dict, Any, Callable, BinaryIO, List, Iterable, Set
from pathlib import Path
from urllib.parse import unquote, urlparse, parse_qs
import logging
from SYS.logger import debug, log
from SYS.models import DebugLogger, DownloadError, DownloadMediaResult, ProgressBar
from SYS.utils import ensure_directory, sha256_file
try: # Optional; used for metadata extraction when available
from SYS.metadata import extract_ytdlp_tags
except Exception: # pragma: no cover - optional dependency
extract_ytdlp_tags = None # type: ignore[assignment]
logger = logging.getLogger(__name__)
# Default configuration
@@ -366,6 +380,359 @@ class HTTPClient:
return self._client.stream(method, url, **kwargs)
def download_direct_file(
    url: str,
    output_dir: Path,
    debug_logger: Optional[DebugLogger] = None,
    quiet: bool = False,
    suggested_filename: Optional[str] = None,
    pipeline_progress: Optional[Any] = None,
) -> DownloadMediaResult:
    """Download a direct file (PDF, image, document, etc.) with guardrails and metadata hooks.

    Resolves a target filename (query params -> URL path -> Content-Disposition
    header -> ``suggested_filename`` override -> content-type based extension),
    refuses to download what looks like an HTML page, streams the body to a
    collision-free path under ``output_dir``, and extracts yt-dlp style tags
    when the optional metadata module is available.

    Args:
        url: Direct URL to the file to download.
        output_dir: Directory the file is written into (created if missing).
        debug_logger: Optional structured logger; receives a
            "direct-file-downloaded" record on success and "exception" on failure.
        quiet: Suppress debug/progress output when True.
        suggested_filename: Caller-preferred name; sanitized, and given the
            detected extension if it has none of its own.
        pipeline_progress: Optional pipeline progress sink; used instead of the
            local ProgressBar when it exposes ``update_transfer`` and a UI.

    Returns:
        DownloadMediaResult with the written path, an info dict (id/ext/url,
        plus title fallback), extracted tags, source URL, and sha256 hash.

    Raises:
        DownloadError: when no filename can be determined, the URL resolves to
            an HTML page, or the HTTP transfer fails.
    """
    ensure_directory(output_dir)

    def _sanitize_filename(name: str) -> str:
        # Windows-safe filename sanitization: strip path components, replace
        # control and reserved characters with spaces, collapse whitespace,
        # and drop trailing dots/spaces (invalid on Windows).
        text = str(name or "").strip()
        if not text:
            return ""
        text = text.replace("/", "\\")
        text = text.split("\\")[-1]
        invalid = set('<>:"/\\|?*')
        cleaned_chars: List[str] = []
        for ch in text:
            o = ord(ch)
            if o < 32 or ch in invalid:
                cleaned_chars.append(" ")
                continue
            cleaned_chars.append(ch)
        cleaned = " ".join("".join(cleaned_chars).split()).strip()
        cleaned = cleaned.rstrip(" .")
        return cleaned

    def _unique_path(path: Path) -> Path:
        # Return a non-existing path by appending " (1)", " (2)", ... before
        # the suffix; after 10k collisions fall back to a timestamp suffix.
        if not path.exists():
            return path
        stem = path.stem
        suffix = path.suffix
        parent = path.parent
        for i in range(1, 10_000):
            candidate = parent / f"{stem} ({i}){suffix}"
            if not candidate.exists():
                return candidate
        return parent / f"{stem} ({int(time.time())}){suffix}"

    # --- Filename resolution, pass 1: URL query params, then URL path. ---
    parsed_url = urlparse(url)
    url_path = parsed_url.path
    filename: Optional[str] = None
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        for param_name in ("filename", "download", "file", "name"):
            if param_name in query_params and query_params[param_name]:
                filename = query_params[param_name][0]
                filename = unquote(filename)
                break
    if not filename or not filename.strip():
        filename = url_path.split("/")[-1] if url_path else ""
        filename = unquote(filename)
        if "?" in filename:
            filename = filename.split("?")[0]

    # --- Pass 2: best-effort HEAD request for Content-Disposition/Content-Type.
    # Failures here are logged but non-fatal; we still try to download.
    content_type = ""
    try:
        with HTTPClient(timeout=10.0) as client:
            response = client._request("HEAD", url, follow_redirects=True)
            content_disposition = response.headers.get("content-disposition", "")
            try:
                content_type = str(response.headers.get("content-type", "") or "").strip().lower()
            except Exception:
                content_type = ""
            if content_disposition:
                # Handles both filename= and RFC 5987 filename*= forms,
                # quoted or bare.
                match = re.search(r'filename\*?=(?:"([^"]*)"|([^;\s]*))', content_disposition)
                if match:
                    extracted_name = match.group(1) or match.group(2)
                    if extracted_name:
                        filename = unquote(extracted_name)
                        if not quiet:
                            debug(f"Filename from Content-Disposition: {filename}")
    except Exception as exc:
        if not quiet:
            log(f"Could not get filename from headers: {exc}", file=sys.stderr)

    # --- HTML-page guard: if the HEAD said text/html, or the URL looks like a
    # dynamic page endpoint, probe with a streamed GET and reject HTML bodies.
    try:
        page_like_exts = {".php", ".asp", ".aspx", ".jsp", ".cgi"}
        ext = ""
        try:
            ext = Path(str(filename or "")).suffix.lower()
        except Exception:
            ext = ""
        ct0 = (content_type or "").split(";", 1)[0].strip().lower()
        must_probe = bool(ct0.startswith("text/html") or ext in page_like_exts)
        if must_probe:
            with HTTPClient(timeout=10.0) as client:
                with client._request_stream("GET", url, follow_redirects=True) as resp:
                    resp.raise_for_status()
                    ct = (
                        str(resp.headers.get("content-type", "") or "")
                        .split(";", 1)[0]
                        .strip()
                        .lower()
                    )
                    if ct.startswith("text/html"):
                        raise DownloadError("URL appears to be an HTML page, not a direct file")
    except DownloadError:
        raise
    except Exception:
        # Probe is best-effort; network hiccups must not abort the download.
        pass

    # --- Caller-suggested filename wins; inherit the detected extension when
    # the suggestion has none.
    suggested = _sanitize_filename(suggested_filename) if suggested_filename else ""
    if suggested:
        suggested_path = Path(suggested)
        if suggested_path.suffix:
            filename = suggested
        else:
            detected_ext = ""
            try:
                detected_ext = Path(str(filename)).suffix
            except Exception:
                detected_ext = ""
            filename = suggested + detected_ext if detected_ext else suggested

    # --- No extension at all: derive one from the Content-Type, or reject HTML.
    try:
        has_ext = bool(filename and Path(str(filename)).suffix)
    except Exception:
        has_ext = False
    if filename and (not has_ext):
        ct = (content_type or "").split(";", 1)[0].strip().lower()
        ext_by_ct = {
            "application/pdf": ".pdf",
            "application/epub+zip": ".epub",
            "application/x-mobipocket-ebook": ".mobi",
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/webp": ".webp",
            "image/gif": ".gif",
            "text/plain": ".txt",
            "application/zip": ".zip",
        }
        if ct in ext_by_ct:
            # Append the content-type derived extension to the detected name.
            filename = f"{filename}{ext_by_ct[ct]}"
        elif ct.startswith("text/html"):
            raise DownloadError("URL appears to be an HTML page, not a direct file")
    if not filename or not str(filename).strip():
        raise DownloadError(
            "Could not determine filename for URL (no Content-Disposition and no path filename)"
        )
    file_path = _unique_path(output_dir / str(filename))

    # --- Progress routing: prefer the pipeline transfer UI when available,
    # otherwise fall back to a local stderr ProgressBar (unless quiet).
    use_pipeline_transfer = False
    try:
        if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
            ui = None
            if hasattr(pipeline_progress, "ui_and_pipe_index"):
                ui, _ = pipeline_progress.ui_and_pipe_index()  # type: ignore[attr-defined]
            use_pipeline_transfer = ui is not None
    except Exception:
        use_pipeline_transfer = False
    progress_bar: Optional[ProgressBar] = None
    if (not quiet) and (not use_pipeline_transfer):
        progress_bar = ProgressBar()
    # Single-element list so nested closures can mutate the flag.
    transfer_started = [False]
    if not quiet:
        debug(f"Direct download: {url}")
    try:
        start_time = time.time()
        downloaded_bytes = [0]
        transfer_started[0] = False

        def _maybe_begin_transfer(content_length: int) -> None:
            # Lazily open the pipeline transfer once the first progress
            # callback fires (so we know the content length, if any).
            if pipeline_progress is None or transfer_started[0]:
                return
            try:
                total_val: Optional[int] = (
                    int(content_length)
                    if isinstance(content_length, int) and content_length > 0
                    else None
                )
            except Exception:
                total_val = None
            try:
                if hasattr(pipeline_progress, "begin_transfer"):
                    pipeline_progress.begin_transfer(
                        label=str(filename or "download"),
                        total=total_val,
                    )
                    transfer_started[0] = True
            except Exception:
                return

        def progress_callback(bytes_downloaded: int, content_length: int) -> None:
            # Fan progress out to the pipeline sink and/or local bar; both
            # paths swallow errors so progress UI never kills a download.
            downloaded_bytes[0] = int(bytes_downloaded or 0)
            try:
                if pipeline_progress is not None and hasattr(pipeline_progress, "update_transfer"):
                    _maybe_begin_transfer(content_length)
                    total_val: Optional[int] = (
                        int(content_length)
                        if isinstance(content_length, int) and content_length > 0
                        else None
                    )
                    pipeline_progress.update_transfer(
                        label=str(filename or "download"),
                        completed=int(bytes_downloaded or 0),
                        total=total_val,
                    )
            except Exception:
                pass
            if progress_bar is not None:
                progress_bar.update(
                    downloaded=int(bytes_downloaded or 0),
                    total=int(content_length) if content_length and content_length > 0 else None,
                    label=str(filename or "download"),
                    file=sys.stderr,
                )

        with HTTPClient(timeout=30.0) as client:
            client.download(url, str(file_path), progress_callback=progress_callback)
        elapsed = time.time() - start_time
        try:
            if progress_bar is not None:
                progress_bar.finish()
        except Exception:
            pass
        try:
            if pipeline_progress is not None and transfer_started[0] and hasattr(
                pipeline_progress, "finish_transfer"
            ):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
        except Exception:
            pass
        if not quiet:
            debug(f"✓ Downloaded in {elapsed:.1f}s")

        # --- Metadata: build a yt-dlp style info dict and extract tags.
        ext_out = ""
        try:
            ext_out = Path(str(filename)).suffix.lstrip(".")
        except Exception:
            ext_out = ""
        info: Dict[str, Any] = {
            "id": str(filename).rsplit(".", 1)[0] if "." in str(filename) else str(filename),
            "ext": ext_out,
            "webpage_url": url,
        }
        hash_value = None
        try:
            hash_value = sha256_file(file_path)
        except Exception:
            pass
        tags: List[str] = []
        if extract_ytdlp_tags:
            try:
                tags = extract_ytdlp_tags(info)
            except Exception as exc:
                log(f"Error extracting tags: {exc}", file=sys.stderr)
        if not any(str(t).startswith("title:") for t in tags):
            # No title tag came back; retry with the filename as title.
            info["title"] = str(filename)
            tags = []
            if extract_ytdlp_tags:
                try:
                    tags = extract_ytdlp_tags(info)
                except Exception as exc:
                    log(f"Error extracting tags with filename: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "direct-file-downloaded",
                {"url": url, "path": str(file_path), "hash": hash_value},
            )
        return DownloadMediaResult(
            path=file_path,
            info=info,
            tag=tags,
            source_url=url,
            hash_value=hash_value,
        )
    except (httpx.HTTPError, httpx.RequestError) as exc:
        # Transport-level failure: tidy up progress UI, record, and re-raise
        # as the project's DownloadError with the cause chained.
        try:
            if progress_bar is not None:
                progress_bar.finish()
        except Exception:
            pass
        try:
            if pipeline_progress is not None and transfer_started[0] and hasattr(
                pipeline_progress, "finish_transfer"
            ):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
        except Exception:
            pass
        log(f"Download error: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {"phase": "direct-file", "url": url, "error": str(exc)},
            )
        raise DownloadError(f"Failed to download {url}: {exc}") from exc
    except Exception as exc:
        # Any other failure: same cleanup, plus a traceback in the debug log.
        try:
            if progress_bar is not None:
                progress_bar.finish()
        except Exception:
            pass
        try:
            if pipeline_progress is not None and transfer_started[0] and hasattr(
                pipeline_progress, "finish_transfer"
            ):
                pipeline_progress.finish_transfer(label=str(filename or "download"))
        except Exception:
            pass
        log(f"Error downloading file: {exc}", file=sys.stderr)
        if debug_logger is not None:
            debug_logger.write_record(
                "exception",
                {
                    "phase": "direct-file",
                    "url": url,
                    "error": str(exc),
                    "traceback": traceback.format_exc(),
                },
            )
        raise DownloadError(f"Error downloading file: {exc}") from exc
# Back-compat alias: callers that imported the old underscore-prefixed name
# still resolve to the public implementation.
_download_direct_file = download_direct_file
class AsyncHTTPClient:
"""Unified async HTTP client with asyncio support."""