nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions


@@ -14,10 +14,11 @@ import httpx
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import urlsplit, quote, urljoin
from urllib.parse import urlsplit, quote, urljoin, unquote
from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from . import _shared as sh
@@ -31,54 +32,6 @@ get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
@@ -115,6 +68,10 @@ USER_AGENT = (
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
ARCHIVE_TIMEOUT = 30.0
# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383
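# Illustrative arithmetic only (hypothetical dimensions): how this cap maps to a
# downscale factor for an oversized full-page capture.
#   w, h = 1280, 45_000
#   scale = min(WEBP_MAX_DIM / w, WEBP_MAX_DIM / h)   # ~0.364
#   new_size = (int(w * scale), int(h * scale))       # roughly (466, 16383)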
# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
@@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str:
return slug[:100]
def _tags_from_url(url: str) -> List[str]:
"""Derive simple tags from a URL.
- site:<domain> (strips leading www.)
- title:<slug> derived from the last path segment, with extension removed
and separators (-, _, %) normalized to spaces.
"""
u = str(url or "").strip()
if not u:
return []
parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
except Exception:
parsed = None
host = ""
if host:
# Drop credentials and port if present.
if "@" in host:
host = host.rsplit("@", 1)[-1]
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]
path = ""
if parsed is not None:
try:
path = str(getattr(parsed, "path", "") or "")
except Exception:
path = ""
last = ""
if path:
try:
last = path.rsplit("/", 1)[-1]
except Exception:
last = ""
try:
last = unquote(last or "")
except Exception:
last = last or ""
if last and "." in last:
# Drop a single trailing extension (e.g. .html, .php).
last = last.rsplit(".", 1)[0]
for sep in ("_", "-", "%"):
if last and sep in last:
last = last.replace(sep, " ")
title = " ".join(str(last or "").split()).strip().lower()
tags: List[str] = []
if host:
tags.append(f"site:{host}")
if title:
tags.append(f"title:{title}")
return tags
def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return ""
def _normalise_format(fmt: Optional[str]) -> str:
"""Normalize output format to valid values."""
if not fmt:
@@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(
src_png: Path,
dst_webp: Path,
*,
quality: int = 90,
method: int = 6,
max_dim: int = WEBP_MAX_DIM,
downscale_if_oversize: bool = True,
) -> bool:
"""Convert a PNG screenshot to WebP via Pillow.
Playwright does not currently support emitting WebP directly.
"""
if not src_png or not Path(src_png).is_file():
raise ScreenshotError(f"Source image not found: {src_png}")
dst_webp = Path(dst_webp)
try:
dst_webp.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
try:
from PIL import Image
except Exception as exc:
raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc
# Write atomically to avoid partial files if conversion is interrupted.
tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp"))
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
if im.mode == "P":
im = im.convert("RGBA")
# WebP enforces a hard max dimension per side (16383px).
# When full-page captures are very tall, downscale proportionally to fit.
try:
w, h = im.size
except Exception:
w, h = 0, 0
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
except Exception:
scale = 1.0
if scale > 0.0 and scale < 1.0:
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
debug(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
im.save(tmp_path, **save_kwargs)
tmp_path.replace(dst_webp)
return bool(did_downscale)
finally:
try:
tmp_path.unlink(missing_ok=True)
except Exception:
pass
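# Minimal usage sketch (paths are hypothetical):
#   did_downscale = _convert_to_webp(Path("page.png"), Path("page.webp"), quality=90)
# A True return means the source exceeded WEBP_MAX_DIM on a side and was resized to
# fit; the caller below uses that to decide whether to keep the original PNG.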
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]:
return sels
def _selectors_for_url(url: str) -> List[str]:
"""Return selectors to try for a URL.
For now, keep behavior minimal: return only known SITE_SELECTORS.
(The cmdlet already falls back to full-page capture when no selectors match.)
"""
return _matched_site_selectors(url)
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
@@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
_step("loading launching browser")
progress.step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
progress.step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
progress.step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
progress.step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
_step("loading stabilized")
progress.step("loading stabilized")
_step("capturing preparing")
progress.step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
_step("capturing saved")
progress.step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
progress.step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
progress.step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
progress.step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
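# Worked example (flag names as above): a webp capture with archiving enabled but
# no matched target selectors gives total_steps = 9 + 0 + 1 + 1 = 11.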
_begin_live_steps(total_steps)
_step("loading starting")
progress.begin_steps(total_steps)
progress.step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
options.output_format = "png"
_capture(options, capture_path, warnings)
_capture(options, capture_path, warnings, progress)
if requested_format == "webp":
_step("capturing converting to webp")
progress.step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
did_downscale = _convert_to_webp(capture_path, destination)
if did_downscale:
warnings.append(
f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}"
)
else:
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as exc:
warnings.append(f"webp conversion failed; keeping png: {exc}")
destination = capture_path
@@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_step("capturing archiving")
progress.step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
progress.step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
@@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 1
progress = PipelineProgress(pipeline_context)
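# Note: this cmdlet relies only on the PipelineProgress methods exercised in this
# diff -- begin_steps(), step(), ensure_local_ui(), on_emit(), and close_local_ui().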
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
@@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
local_progress_ui = None
try:
existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
existing_ui = None
try:
if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
local_progress_ui.start()
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(local_progress_ui)
except Exception:
pass
try:
local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
except Exception:
pass
except Exception:
local_progress_ui = None
# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
@@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
ensure_directory(screenshot_dir)
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
try:
progress.ensure_local_ui(
label="screen-shot",
total_items=len(url_to_process),
items_preview=[u for u, _ in url_to_process],
)
except Exception:
pass
# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
@@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
screenshot_result = _capture_screenshot(options)
screenshot_result = _capture_screenshot(options, progress)
# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
@@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = datetime.now().date().isoformat()
upstream_title = _clean_title(_extract_item_title(origin_item))
display_title = upstream_title or url
url_title = _title_from_url(url)
display_title = upstream_title or url_title or url
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)
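# Illustrative only: for the hypothetical url "https://www.example.com/posts/My_First-Post.html"
# captured on 2025-12-22, merged_tags would resemble ["type:screenshot", "date:2025-12-22",
# <filtered upstream tags>, "site:example.com", "title:my first post"].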
pipe_obj = create_pipe_object_result(
@@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted.append(pipe_obj)
# If we created a local progress UI, advance it per completed item.
if local_progress_ui is not None:
try:
local_progress_ui.on_emit(0, pipe_obj)
except Exception:
pass
progress.on_emit(pipe_obj)
except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
traceback.print_exc(file=sys.stderr)
exit_code = 1
try:
if local_progress_ui is not None:
try:
local_progress_ui.finish_pipe(0, force_complete=True)
except Exception:
pass
finally:
if local_progress_ui is not None:
try:
local_progress_ui.stop()
except Exception:
pass
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(None)
except Exception:
pass
progress.close_local_ui(force_complete=True)
if not all_emitted:
log(f"No screenshots were successfully captured", file=sys.stderr)