df

2025-12-29 17:05:03 -08:00
parent 226de9316a
commit c019c00aed
104 changed files with 19669 additions and 12954 deletions
--- a/cmdlet/screen_shot.py
+++ b/cmdlet/screen_shot.py
@@ -37,7 +37,6 @@ import pipeline as pipeline_context
 # ============================================================================


-
 # ============================================================================
 # Playwright & Screenshot Dependencies
 # ============================================================================
@@ -104,7 +103,6 @@ SITE_SELECTORS: Dict[str, List[str]] = {
 }


-
 class ScreenshotError(RuntimeError):
    """Raised when screenshot capture or upload fails."""

@@ -146,6 +144,7 @@ class ScreenshotResult:
 # Helper Functions
 # ============================================================================

+
 def _slugify_url(url: str) -> str:
    """Convert URL to filesystem-safe slug."""
    parsed = urlsplit(url)
@@ -172,7 +171,11 @@ def _tags_from_url(url: str) -> List[str]:
    parsed = None
    try:
        parsed = urlsplit(u)
-        host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
+        host = (
+            str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "")
+            .strip()
+            .lower()
+        )
    except Exception:
        parsed = None
        host = ""
@@ -300,7 +303,12 @@ def _convert_to_webp(
            except Exception:
                w, h = 0, 0

-            if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
+            if (
+                downscale_if_oversize
+                and isinstance(max_dim, int)
+                and max_dim > 0
+                and (w > max_dim or h > max_dim)
+            ):
                scale = 1.0
                try:
                    scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
@@ -320,7 +328,9 @@ def _convert_to_webp(
                        im = im.resize((new_w, new_h), resample=resample)
                        did_downscale = True
                    except Exception as exc:
-                        debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
+                        debug(
+                            f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}"
+                        )

            im.save(tmp_path, **save_kwargs)

@@ -332,6 +342,7 @@ def _convert_to_webp(
        except Exception:
            pass

+
 def _matched_site_selectors(url: str) -> List[str]:
    """Return SITE_SELECTORS for a matched domain; empty if no match.

@@ -355,7 +366,9 @@ def _selectors_for_url(url: str) -> List[str]:
    return _matched_site_selectors(url)


-def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
+def _platform_preprocess(
+    url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000
+) -> None:
    """Best-effort page tweaks for popular platforms before capture."""
    try:
        u = str(url or "").lower()
@@ -373,14 +386,16 @@ def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: i
            return clicks

        # Dismiss common cookie / consent prompts.
-        _try_click_buttons([
-            "Accept all",
-            "Accept",
-            "I agree",
-            "Agree",
-            "Allow all",
-            "OK",
-        ])
+        _try_click_buttons(
+            [
+                "Accept all",
+                "Accept",
+                "I agree",
+                "Agree",
+                "Allow all",
+                "OK",
+            ]
+        )

        # Some sites need small nudges (best-effort).
        if "reddit.com" in u:
@@ -490,7 +505,9 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
    return unique_path(path)


-def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
+def _capture(
+    options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress
+) -> None:
    """Capture screenshot using Playwright."""
    debug(f"[_capture] Starting capture for {options.url} -> {destination}")
    try:
@@ -499,16 +516,24 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],

        # Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
        try:
-            current_browser = getattr(tool.defaults, "browser", "").lower() if getattr(tool, "defaults", None) is not None else ""
+            current_browser = (
+                getattr(tool.defaults, "browser", "").lower()
+                if getattr(tool, "defaults", None) is not None
+                else ""
+            )
            if current_browser != "chromium":
-                debug(f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet")
+                debug(
+                    f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet"
+                )
                base_cfg = {}
                try:
                    base_cfg = dict(getattr(tool, "_config", {}) or {})
                except Exception:
                    base_cfg = {}
                tool_block = dict(base_cfg.get("tool") or {}) if isinstance(base_cfg, dict) else {}
-                pw_block = dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
+                pw_block = (
+                    dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
+                )
                pw_block["browser"] = "chromium"
                tool_block["playwright"] = pw_block
                if isinstance(base_cfg, dict):
@@ -523,7 +548,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
        format_name = _normalise_format(options.output_format)
        headless = options.headless or format_name == "pdf"
        debug(f"[_capture] Format: {format_name}, Headless: {headless}")
-        
+
        if format_name == "pdf" and not options.headless:
            warnings.append("pdf output requires headless Chromium; overriding headless mode")

@@ -539,7 +564,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
                    warnings.append("navigation timeout; capturing current page state")
                    debug("Navigation timeout; proceeding with current state")
                    progress.step("loading navigation timeout")
-                
+
                # Skip article lookup by default (wait_for_article defaults to False)
                if options.wait_for_article:
                    try:
@@ -549,7 +574,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
                    except PlaywrightTimeoutError:
                        warnings.append("<article> selector not found; capturing fallback")
                        debug("Article element not found; using fallback")
-                
+
                if options.wait_after_load > 0:
                    debug(f"Waiting {options.wait_after_load}s for page stabilization...")
                    time.sleep(min(10.0, max(0.0, options.wait_after_load)))
@@ -591,7 +616,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
                    for sel in selectors:
                        try:
                            debug(f"Trying selector: {sel}")
-                            el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
+                            el = page.wait_for_selector(
+                                sel, timeout=max(0, int(options.selector_timeout_ms))
+                            )
                        except PlaywrightTimeoutError:
                            debug(f"Selector not found: {sel}")
                            continue
@@ -604,7 +631,10 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
                                    pass
                                progress.step("capturing output")
                                debug(f"Capturing element to {destination}...")
-                                el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
+                                el.screenshot(
+                                    path=str(destination),
+                                    type=("jpeg" if format_name == "jpeg" else None),
+                                )
                                element_captured = True
                                debug("Element captured successfully")
                                break
@@ -645,8 +675,13 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
        except Exception as exc:
            debug(f"[_capture] Exception launching browser/page: {exc}")
            msg = str(exc).lower()
-            if any(k in msg for k in ["executable", "not found", "no such file", "cannot find", "install"]):
-                raise ScreenshotError("Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium") from exc
+            if any(
+                k in msg
+                for k in ["executable", "not found", "no such file", "cannot find", "install"]
+            ):
+                raise ScreenshotError(
+                    "Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium"
+                ) from exc
            raise
    except ScreenshotError:
        # Re-raise ScreenshotError raised intentionally (do not wrap)
@@ -666,7 +701,9 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
    will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
    will_convert = requested_format == "webp"
    will_archive = bool(options.archive and options.url)
-    total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
+    total_steps = (
+        9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
+    )
    progress.begin_steps(total_steps)
    progress.step("loading starting")

@@ -726,19 +763,20 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
 # Main Cmdlet Function
 # ============================================================================

+
 def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Take screenshots of url in the pipeline.
-    
+
    Accepts:
    - Single result object (dict or PipeObject) with 'path' field
    - List of result objects to screenshot each
    - Direct URL as string
-    
+
    Emits PipeObject-formatted results for each screenshot with:
    - action: 'cmdlet:screen-shot'
    - is_temp: True (screenshots are temporary artifacts)
    - parent_id: hash of the original file/URL
-    
+
    Screenshots are created using Playwright and marked as temporary
    so they can be cleaned up later with the cleanup cmdlet.
    """
@@ -761,9 +799,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    # ========================================================================
    # ARGUMENT PARSING
    # ========================================================================
-    
+
    parsed = parse_cmdlet_args(args, CMDLET)
-    
+
    format_value = parsed.get("format")
    if not format_value:
        # Default format can be set via config.conf tool block:
@@ -782,7 +820,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    selector_arg = parsed.get("selector")
    selectors = [selector_arg] if selector_arg else []
    archive_enabled = parsed.get("archive", False)
-    
+
    # Positional URL argument (if provided)
    url_arg = parsed.get("url")
    positional_url = [str(url_arg)] if url_arg else []
@@ -801,15 +839,11 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        # Extract url from piped results
        if piped_results:
            for item in piped_results:
-                url = (
-                    get_field(item, 'path')
-                    or get_field(item, 'url')
-                    or get_field(item, 'target')
-                )
+                url = get_field(item, "path") or get_field(item, "url") or get_field(item, "target")

                if url:
                    url_to_process.append((str(url), item))
-    
+
    if not url_to_process:
        log(f"No url to process for screen-shot cmdlet", file=sys.stderr)
        return 1
@@ -819,9 +853,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    # ========================================================================
    # OUTPUT DIRECTORY RESOLUTION - Priority chain
    # ========================================================================
-    
+
    screenshot_dir: Optional[Path] = None
-    
+
    # Primary: Use --storage if provided (highest priority)
    if storage_value:
        try:
@@ -830,7 +864,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        except ValueError as e:
            log(str(e), file=sys.stderr)
            return 1
-    
+
    # Secondary: Use config-based resolver ONLY if --storage not provided
    if screenshot_dir is None and resolve_output_dir is not None:
        try:
@@ -838,7 +872,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            debug(f"[screen_shot] Using config resolver: {screenshot_dir}")
        except Exception:
            pass
-    
+
    # Tertiary: Use config outfile ONLY if neither --storage nor resolver worked
    if screenshot_dir is None and config and config.get("outfile"):
        try:
@@ -846,12 +880,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            debug(f"[screen_shot] Using config outfile: {screenshot_dir}")
        except Exception:
            pass
-    
+
    # Default: User's Videos directory
    if screenshot_dir is None:
        screenshot_dir = Path.home() / "Videos"
        debug(f"[screen_shot] Using default directory: {screenshot_dir}")
-    
+
    ensure_directory(screenshot_dir)

    # If the caller isn't running the shared pipeline Live progress UI (e.g. direct
@@ -869,21 +903,21 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    # ========================================================================
    # PREPARE SCREENSHOT OPTIONS
    # ========================================================================
-    
+
    format_name = _normalise_format(format_value)
    filtered_selectors = [str(s).strip() for s in selectors if str(s).strip()]
    manual_target_selectors = filtered_selectors if filtered_selectors else None
-    
+
    all_emitted = []
    exit_code = 0
    # ========================================================================
    # PROCESS url AND CAPTURE SCREENSHOTS
    # ========================================================================
-    
+
    def _extract_item_tags(item: Any) -> List[str]:
        if item is None:
            return []
-        raw = get_field(item, 'tag')
+        raw = get_field(item, "tag")
        if isinstance(raw, list):
            return [str(t) for t in raw if t is not None and str(t).strip()]
        if isinstance(raw, str) and raw.strip():
@@ -913,7 +947,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        if not url.lower().startswith(("http://", "https://", "file://")):
            log(f"[screen_shot] Skipping non-URL input: {url}", file=sys.stderr)
            continue
-        
+
        try:
            # Create screenshot with provided options
            # Force the Playwright engine to Chromium for the screen-shot cmdlet
@@ -966,28 +1000,32 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
                options.prefer_platform_target = True
                options.target_selectors = auto_selectors
                debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
-            
+
            screenshot_result = _capture_screenshot(options, progress)
-            
+
            # Log results and warnings
            debug(f"Screenshot captured to {screenshot_result.path}")
            if screenshot_result.archive_url:
                debug(f"Archives: {', '.join(screenshot_result.archive_url)}")
            for warning in screenshot_result.warnings:
                debug(f"Warning: {warning}")
-            
+
            # Compute hash of screenshot file
            screenshot_hash = None
            try:
-                with open(screenshot_result.path, 'rb') as f:
+                with open(screenshot_result.path, "rb") as f:
                    screenshot_hash = hashlib.sha256(f.read()).hexdigest()
            except Exception:
                pass
-            
+
            # Create PipeObject result - marked as TEMP since derivative artifact
            capture_date = ""
            try:
-                capture_date = datetime.fromtimestamp(screenshot_result.path.stat().st_mtime).date().isoformat()
+                capture_date = (
+                    datetime.fromtimestamp(screenshot_result.path.stat().st_mtime)
+                    .date()
+                    .isoformat()
+                )
            except Exception:
                capture_date = datetime.now().date().isoformat()

@@ -997,7 +1035,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:

            upstream_tags = _extract_item_tags(origin_item)
            filtered_upstream_tags = [
-                t for t in upstream_tags
+                t
+                for t in upstream_tags
                if not str(t).strip().lower().startswith(("type:", "date:"))
            ]

@@ -1007,40 +1046,41 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            )

            pipe_obj = create_pipe_object_result(
-                source='screenshot',
-                store='PATH',
+                source="screenshot",
+                store="PATH",
                identifier=Path(screenshot_result.path).stem,
                file_path=str(screenshot_result.path),
-                cmdlet_name='screen-shot',
+                cmdlet_name="screen-shot",
                title=display_title,
                hash_value=screenshot_hash,
                is_temp=True,
                parent_hash=hashlib.sha256(url.encode()).hexdigest(),
                tag=merged_tags,
                extra={
-                    'source_url': url,
-                    'archive_url': screenshot_result.archive_url,
-                    'url': screenshot_result.url,
-                    'target': str(screenshot_result.path), # Explicit target for add-file
-                }
+                    "source_url": url,
+                    "archive_url": screenshot_result.archive_url,
+                    "url": screenshot_result.url,
+                    "target": str(screenshot_result.path),  # Explicit target for add-file
+                },
            )
-            
+
            # Emit the result so downstream cmdlet (like add-file) can use it
            pipeline_context.emit(pipe_obj)
            all_emitted.append(pipe_obj)

            # If we created a local progress UI, advance it per completed item.
            progress.on_emit(pipe_obj)
-            
+
        except ScreenshotError as exc:
            log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
            exit_code = 1
        except Exception as exc:
            log(f"Unexpected error taking screenshot of {url}: {exc}", file=sys.stderr)
            import traceback
+
            traceback.print_exc(file=sys.stderr)
            exit_code = 1
-    
+
    progress.close_local_ui(force_complete=True)

    if not all_emitted:
@@ -1051,6 +1091,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    log(f"✓ Successfully captured {len(all_emitted)} screenshot(s)")

    return exit_code
+
+
 CMDLET = Cmdlet(
    name="screen-shot",
    summary="Capture a website screenshot",
@@ -1058,16 +1100,17 @@ CMDLET = Cmdlet(
    alias=["screenshot", "ss"],
    arg=[
        SharedArgs.URL,
-        CmdletArg(name="format", type="string", description="Output format: webp, png, jpeg, or pdf"),
+        CmdletArg(
+            name="format", type="string", description="Output format: webp, png, jpeg, or pdf"
+        ),
        CmdletArg(name="selector", type="string", description="CSS selector for element capture"),
-        SharedArgs.PATH
-
+        SharedArgs.PATH,
    ],
    detail=[
        "Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
        "PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
        "Screenshots are temporary artifacts stored in the configured `temp` directory.",
-    ]
+    ],
 )

 CMDLET.exec = _run