dfdsf

2025-12-21 05:10:09 -08:00
parent 8ca5783970
commit 11a13edb84
15 changed files with 1712 additions and 213 deletions
--- a/cmdlet/screen_shot.py
+++ b/cmdlet/screen_shot.py
@@ -6,7 +6,6 @@ Playwright, marking them as temporary artifacts for cleanup.

 from __future__ import annotations

-import contextlib
 import hashlib
 import sys
 import time
@@ -32,6 +31,22 @@ get_field = sh.get_field
 parse_cmdlet_args = sh.parse_cmdlet_args
 import pipeline as pipeline_context

+
+def _set_live_step(text: str) -> None:
+    """Best-effort update of the active subtask text on the pipeline Live progress UI (if one is set)."""
+    try:
+        ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
+    except Exception:
+        ui = None
+    if ui is None:
+        return
+    try:
+        setter = getattr(ui, "set_active_subtask_text", None)
+        if callable(setter):
+            setter(str(text or "").strip())
+    except Exception:
+        pass
+
 # ============================================================================
 # CMDLET Metadata Declaration
 # ============================================================================
@@ -65,7 +80,7 @@ USER_AGENT = (
    "Chrome/120.0.0.0 Safari/537.36"
 )

-DEFAULT_VIEWPORT: dict[str, int] = {"width": 1280, "height": 1200}
+DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
 ARCHIVE_TIMEOUT = 30.0

 # Configurable selectors for specific websites
@@ -114,7 +129,7 @@ class ScreenshotOptions:
    output_path: Optional[Path] = None
    full_page: bool = True
    headless: bool = True
-    wait_after_load: float = 2.0
+    wait_after_load: float = 6.0
    wait_for_article: bool = False
    replace_video_posters: bool = True
    tag: Sequence[str] = ()
@@ -156,13 +171,13 @@ def _slugify_url(url: str) -> str:
 def _normalise_format(fmt: Optional[str]) -> str:
    """Normalize output format to valid values."""
    if not fmt:
-        return "png"
+        return "webp"
    value = fmt.strip().lower()
    if value in {"jpg", "jpeg"}:
        return "jpeg"
-    if value in {"png", "pdf"}:
+    if value in {"png", "pdf", "webp"}:
        return value
-    return "png"
+    return "webp"


 def _format_suffix(fmt: str) -> str:
@@ -172,6 +187,15 @@ def _format_suffix(fmt: str) -> str:
    return f".{fmt}"


+def _convert_to_webp(source_path: Path, dest_path: Path) -> None:
+    """Convert an image file to WebP using Pillow."""
+    from PIL import Image
+
+    with Image.open(source_path) as img:
+        # quality=100 favours fidelity over file size; method=6 is the slowest, best-compressing setting.
+        img.save(dest_path, format="WEBP", quality=100, method=6)
+
+
 def _selectors_for_url(url: str) -> List[str]:
    """Return a list of likely content selectors for known platforms."""
    u = url.lower()
@@ -184,6 +208,19 @@ def _selectors_for_url(url: str) -> List[str]:
    return sels or ["article"]


+def _matched_site_selectors(url: str) -> List[str]:
+    """Return the SITE_SELECTORS entries for every domain found in the URL; empty if none match.
+
+    Unlike `_selectors_for_url()`, this does not return a generic fallback.
+    """
+    u = str(url or "").lower()
+    sels: List[str] = []
+    for domain, selectors in SITE_SELECTORS.items():
+        if domain in u:
+            sels.extend(selectors)
+    return sels
+
+
 def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
    """Best-effort page tweaks for popular platforms before capture."""
    u = url.lower()
@@ -322,6 +359,10 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
    """Capture screenshot using Playwright."""
    debug(f"[_capture] Starting capture for {options.url} -> {destination}")
    try:
+        # Two-phase Live progress:
+        #   1) load + stabilize (ends right after the wait_after_load sleep)
+        #   2) capture + save (and any post-processing)
+        _set_live_step("screen-shot: loading")
        tool = options.playwright_tool or PlaywrightTool({})

        # Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -329,7 +370,18 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
            current_browser = getattr(tool.defaults, "browser", "").lower() if getattr(tool, "defaults", None) is not None else ""
            if current_browser != "chromium":
                debug(f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet")
-                tool = PlaywrightTool({"tool": {"playwright": {"browser": "chromium"}}})
+                base_cfg = {}
+                try:
+                    base_cfg = dict(getattr(tool, "_config", {}) or {})
+                except Exception:
+                    base_cfg = {}
+                tool_block = dict(base_cfg.get("tool") or {}) if isinstance(base_cfg, dict) else {}
+                pw_block = dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
+                pw_block["browser"] = "chromium"
+                tool_block["playwright"] = pw_block
+                if isinstance(base_cfg, dict):
+                    base_cfg["tool"] = tool_block
+                tool = PlaywrightTool(base_cfg)
        except Exception:
            tool = PlaywrightTool({"tool": {"playwright": {"browser": "chromium"}}})

@@ -366,6 +418,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                if options.wait_after_load > 0:
                    debug(f"Waiting {options.wait_after_load}s for page stabilization...")
                    time.sleep(min(10.0, max(0.0, options.wait_after_load)))
+
+                # Phase 2 (capture + save) begins here, right after the stabilization wait.
+                _set_live_step("screen-shot: capturing")
                if options.replace_video_posters:
                    debug("Replacing video elements with posters...")
                    page.evaluate(
@@ -384,6 +439,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                # Attempt platform-specific target capture if requested (and not PDF)
                element_captured = False
                if options.prefer_platform_target and format_name != "pdf":
+                    debug(f"[_capture] Target capture enabled")
                    debug("Attempting platform-specific content capture...")
                    try:
                        _platform_preprocess(options.url, page, warnings)
@@ -393,7 +449,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
                    selectors = list(options.target_selectors or [])
                    if not selectors:
                        selectors = _selectors_for_url(options.url)
-                    
+
                    debug(f"[_capture] Trying selectors: {selectors}")
                    for sel in selectors:
                        try:
@@ -459,14 +515,36 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
 def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
    """Capture a screenshot for the given options."""
    debug(f"[_capture_screenshot] Preparing capture for {options.url}")
+    requested_format = _normalise_format(options.output_format)
    destination = _prepare_output_path(options)
    warnings: List[str] = []
-    _capture(options, destination, warnings)
+
+    # Playwright screenshots do not natively support WebP output.
+    # Capture as PNG, then convert via Pillow.
+    capture_path = destination
+    if requested_format == "webp":
+        capture_path = unique_path(destination.with_suffix(".png"))
+        debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
+        options.output_format = "png"
+    _capture(options, capture_path, warnings)
+
+    if requested_format == "webp":
+        debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
+        try:
+            _convert_to_webp(capture_path, destination)
+            try:
+                capture_path.unlink(missing_ok=True)
+            except Exception:
+                pass
+        except Exception as exc:
+            warnings.append(f"webp conversion failed; keeping png: {exc}")
+            destination = capture_path

    # Build URL list from captured url and any archives
    url: List[str] = [options.url] if options.url else []
    archive_url: List[str] = []
    if options.archive and options.url:
+        _set_live_step("screen-shot: archiving")
        debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
        archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
        archive_url.extend(archives)
@@ -538,7 +616,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        except Exception:
            pass
    if not format_value:
-        format_value = "png"
+        format_value = "webp"
    storage_value = parsed.get("storage")
    selector_arg = parsed.get("selector")
    selectors = [selector_arg] if selector_arg else []
@@ -549,27 +627,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    positional_url = [str(url_arg)] if url_arg else []

    # ========================================================================
-    # INPUT PROCESSING - Extract url from pipeline or command arguments
+    # INPUT PROCESSING - Extract url from command args or pipeline
    # ========================================================================
-    
-    piped_results = normalize_result_input(result)
-    url_to_process: List[Tuple[str, Any]] = []
-    
-    # Extract url from piped results
-    if piped_results:
-        for item in piped_results:
-            url = (
-                get_field(item, 'path')
-                or get_field(item, 'url')
-                or get_field(item, 'target')
-            )

-            if url:
-                url_to_process.append((str(url), item))
-    
-    # Use positional arguments if no pipeline input
-    if not url_to_process and positional_url:
+    # If the user provided an explicit URL argument, prefer it.
+    url_to_process: List[Tuple[str, Any]] = []
+    if positional_url:
        url_to_process = [(u, None) for u in positional_url]
+    else:
+        piped_results = normalize_result_input(result)
+
+        # Extract url from piped results
+        if piped_results:
+            for item in piped_results:
+                url = (
+                    get_field(item, 'path')
+                    or get_field(item, 'url')
+                    or get_field(item, 'target')
+                )
+
+                if url:
+                    url_to_process.append((str(url), item))
    
    if not url_to_process:
        log(f"No url to process for screen-shot cmdlet", file=sys.stderr)
@@ -577,6 +655,32 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:

    debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")

+    # If the caller isn't running the shared pipeline Live progress UI (e.g. direct
+    # cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
+    # still shows step-level progress.
+    local_progress_ui = None
+    try:
+        existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
+    except Exception:
+        existing_ui = None
+    try:
+        if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
+            from models import PipelineLiveProgress
+
+            local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
+            local_progress_ui.start()
+            try:
+                if hasattr(pipeline_context, "set_live_progress"):
+                    pipeline_context.set_live_progress(local_progress_ui)
+            except Exception:
+                pass
+            try:
+                local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
+            except Exception:
+                pass
+    except Exception:
+        local_progress_ui = None
+
    # ========================================================================
    # OUTPUT DIRECTORY RESOLUTION - Priority chain
    # ========================================================================
@@ -621,7 +725,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    
    format_name = _normalise_format(format_value)
    filtered_selectors = [str(s).strip() for s in selectors if str(s).strip()]
-    target_selectors = filtered_selectors if filtered_selectors else None
+    manual_target_selectors = filtered_selectors if filtered_selectors else None
    
    all_emitted = []
    exit_code = 0
@@ -664,6 +768,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            continue
        
        try:
+            _set_live_step("screen-shot: starting")
            # Create screenshot with provided options
            # Force the Playwright engine to Chromium for the screen-shot cmdlet
            # (this ensures consistent rendering and supports PDF output requirements).
@@ -672,23 +777,49 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
                tool_block = dict(config.get("tool") or {})
                pw_block = dict(tool_block.get("playwright") or {})
                pw_block["browser"] = "chromium"
+                # Use Playwright-native UA/headers (matches bundled Chromium version).
+                pw_block["user_agent"] = "native"
+                pw_block["viewport_width"] = int(DEFAULT_VIEWPORT.get("width", 1920))
+                pw_block["viewport_height"] = int(DEFAULT_VIEWPORT.get("height", 1080))
                tool_block["playwright"] = pw_block
                pw_local_cfg = dict(config)
                pw_local_cfg["tool"] = tool_block
            else:
-                pw_local_cfg = {"tool": {"playwright": {"browser": "chromium"}}}
+                pw_local_cfg = {
+                    "tool": {
+                        "playwright": {
+                            "browser": "chromium",
+                            "user_agent": "native",
+                            "viewport_width": int(DEFAULT_VIEWPORT.get("width", 1920)),
+                            "viewport_height": int(DEFAULT_VIEWPORT.get("height", 1080)),
+                        }
+                    }
+                }

            options = ScreenshotOptions(
                url=url,
                output_dir=screenshot_dir,
                output_format=format_name,
                archive=archive_enabled,
-                target_selectors=target_selectors,
+                target_selectors=None,
                prefer_platform_target=False,
                wait_for_article=False,
                full_page=True,
                playwright_tool=PlaywrightTool(pw_local_cfg),
            )
+
+            # Auto element capture for known sites (x.com/twitter/etc.).
+            # - If the user provided --selector, treat that as an explicit target.
+            # - Otherwise, if SITE_SELECTORS matches the URL, auto-capture the post/content element.
+            auto_selectors = _matched_site_selectors(url)
+            if manual_target_selectors:
+                options.prefer_platform_target = True
+                options.target_selectors = manual_target_selectors
+                debug(f"[screen_shot] Using explicit selector(s): {manual_target_selectors}")
+            elif auto_selectors:
+                options.prefer_platform_target = True
+                options.target_selectors = auto_selectors
+                debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
            
            screenshot_result = _capture_screenshot(options)
            
@@ -748,6 +879,13 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            # Emit the result so downstream cmdlet (like add-file) can use it
            pipeline_context.emit(pipe_obj)
            all_emitted.append(pipe_obj)
+
+            # If we created a local progress UI, advance it per completed item.
+            if local_progress_ui is not None:
+                try:
+                    local_progress_ui.on_emit(0, pipe_obj)
+                except Exception:
+                    pass
            
        except ScreenshotError as exc:
            log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -758,13 +896,31 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            traceback.print_exc(file=sys.stderr)
            exit_code = 1
    
+    try:
+        if local_progress_ui is not None:
+            try:
+                local_progress_ui.finish_pipe(0, force_complete=True)
+            except Exception:
+                pass
+    finally:
+        if local_progress_ui is not None:
+            try:
+                local_progress_ui.stop()
+            except Exception:
+                pass
+            try:
+                if hasattr(pipeline_context, "set_live_progress"):
+                    pipeline_context.set_live_progress(None)
+            except Exception:
+                pass
+
    if not all_emitted:
        log(f"No screenshots were successfully captured", file=sys.stderr)
        return 1
-    
+
    # Log completion message (keep this as normal output)
    log(f"✓ Successfully captured {len(all_emitted)} screenshot(s)")
-    
+
    return exit_code
 CMDLET = Cmdlet(
    name="screen-shot",
@@ -773,7 +929,7 @@ CMDLET = Cmdlet(
    alias=["screenshot", "ss"],
    arg=[
        SharedArgs.URL,
-        CmdletArg(name="format", type="string", description="Output format: png, jpeg, or pdf"),
+        CmdletArg(name="format", type="string", description="Output format: webp, png, jpeg, or pdf"),
        CmdletArg(name="selector", type="string", description="CSS selector for element capture"),

    ],