added mhtml support and fixed some bugs in the process

2026-04-22 21:19:55 -07:00
parent 90787bd0a2
commit 67c272db4b
9 changed files with 564 additions and 66 deletions
@@ -73,6 +73,61 @@ DEFAULT_VIEWPORT: dict[str,
                       }
 ARCHIVE_TIMEOUT = 30.0

+# Hostname fragments of well-known advertising, analytics, and tracker services.
+# `_url_matches_adblock` tests each entry against the lowercased request hostname.
+ADBLOCK_HOST_PATTERNS: tuple[str, ...] = (
+    "doubleclick.net",
+    "googlesyndication.com",
+    "googleadservices.com",
+    "google-analytics.com",
+    "googletagmanager.com",
+    "googletagservices.com",
+    # Ends with a dot so the suffix after "google." is left open.
+    "adservice.google.",
+    "adsystem.com",
+    "adnxs.com",
+    "taboola.com",
+    "outbrain.com",
+    "criteo.com",
+    "casalemedia.com",
+    "rubiconproject.com",
+    "pubmatic.com",
+    "scorecardresearch.com",
+    "quantserve.com",
+    "zedo.com",
+    "moatads.com",
+    "amazon-adsystem.com",
+    "media.net",
+)
+
+# Substrings looked for anywhere in the full lowercased request URL (path and
+# query included) by `_url_matches_adblock`, as a fallback to the host list.
+ADBLOCK_URL_PATTERNS: tuple[str, ...] = (
+    "/ads/",
+    "?ads=",
+    "&ads=",
+    "advertisement",
+    "googlesyndication",
+    "doubleclick",
+    "adservice",
+    "adserver",
+    "prebid",
+    "taboola",
+    "outbrain",
+    "amazon-adsystem",
+)
+
+# CSS selectors for page elements that `_remove_ad_elements` deletes before capture.
+#
+# The "ad-" / "ads-" prefixes are anchored to the start of the attribute, to the
+# start of a class token (leading space), or to a "-" / "_" separator. A bare
+# substring test such as [id*='ad-'] also matches ordinary names like
+# "read-more", "upload-form", "thread-list", or "downloads-table", which would
+# strip legitimate content from the capture.
+ADBLOCK_CSS_SELECTORS: tuple[str, ...] = (
+    "[id^='ad-']",
+    "[id*='-ad-']",
+    "[id*='_ad-']",
+    "[id^='ads-']",
+    "[id*='-ads-']",
+    "[id*='_ads-']",
+    "[class^='ad-']",
+    "[class*=' ad-']",
+    "[class^='ads-']",
+    "[class*=' ads-']",
+    "[class*='-ads-']",
+    "[class*='_ads-']",
+    "[class*='advert']",
+    "[id*='sponsor']",
+    "[class*='sponsor']",
+    "iframe[src*='doubleclick.net']",
+    "iframe[src*='googlesyndication.com']",
+    "iframe[src*='taboola.com']",
+    "iframe[src*='outbrain.com']",
+)
+
 # WebP has a hard maximum dimension per side.
 # Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
 WEBP_MAX_DIM = 16_383
@@ -136,6 +191,7 @@ class ScreenshotOptions:
    interactive_pick: bool = False
    interactive_pick_timeout_s: float = 120.0
    quality: int = 8
+    adblock: bool = True
    playwright_tool: Optional[PlaywrightTool] = None


@@ -255,11 +311,14 @@ def _normalize_format(fmt: Optional[str]) -> str:
    if not fmt:
        return "webp"
    value = fmt.strip().lower()
+    if value in {"mht", "mhtml"}:
+        return "mhtml"
    if value in {"jpg",
                 "jpeg"}:
        return "jpeg"
    if value in {"png",
                 "pdf",
+                 "mhtml",
                 "webp"}:
        return value
    return "webp"
@@ -281,6 +340,10 @@ def _normalize_capture_mode(value: Optional[str]) -> str:
    return ""


+def _format_supports_target_selection(fmt: Optional[str]) -> bool:
+    """Return False for formats that always capture the whole page (pdf, mhtml)."""
+    whole_page_formats = ("pdf", "mhtml")
+    normalized = _normalize_format(fmt)
+    return normalized not in whole_page_formats
+
+
 def _normalize_quality(value: Any) -> int:
    try:
        quality = int(str(value).strip())
@@ -289,6 +352,92 @@ def _normalize_quality(value: Any) -> int:
    return max(1, min(10, quality))


+def _normalize_bool(value: Any, *, default: bool = False) -> bool:
+    """Interpret a loosely-typed flag value as a boolean.
+
+    Real ``bool`` values pass through unchanged. Any other non-``None`` value
+    is stringified, stripped, and lowercased, then compared against the
+    recognised on/off words. ``None``, blank text, and unrecognised text all
+    yield ``bool(default)``.
+    """
+    if isinstance(value, bool):
+        return value
+    if value is not None:
+        token = str(value).strip().lower()
+        if token in ("1", "true", "yes", "on", "enable", "enabled"):
+            return True
+        if token in ("0", "false", "no", "off", "disable", "disabled"):
+            return False
+    # None, empty, or unrecognised input: defer to the caller's default.
+    return bool(default)
+
+
+def _url_matches_adblock(url: str) -> bool:
+    """Return True when *url* looks like an ad or tracker request.
+
+    The URL is lowercased and checked in two stages:
+
+    1. Its hostname is compared against ``ADBLOCK_HOST_PATTERNS`` on whole
+       DNS-label boundaries, so ``media.net`` matches ``static.media.net`` but
+       not ``socialmedia.net``.
+    2. Otherwise the full URL is searched for any ``ADBLOCK_URL_PATTERNS``
+       substring.
+
+    Empty or blank input never matches.
+    """
+    lowered = str(url or "").strip().lower()
+    if not lowered:
+        return False
+    try:
+        host = str(urlsplit(lowered).hostname or "").strip().lower()
+    except ValueError:
+        # urlsplit rejects some malformed netlocs (e.g. unbalanced IPv6
+        # brackets); fall back to URL-substring matching only.
+        host = ""
+    if host:
+        # Surround both host and pattern with dots so a pattern can only match
+        # a run of complete labels, never the tail of a longer label.
+        dotted_host = f".{host.strip('.')}."
+        if any(
+            f".{pattern.strip('.')}." in dotted_host
+            for pattern in ADBLOCK_HOST_PATTERNS
+        ):
+            return True
+    return any(pattern in lowered for pattern in ADBLOCK_URL_PATTERNS)
+
+
+def _install_adblock(page: Any) -> Optional[Dict[str, int]]:
+    """Register a request-routing hook on *page* that aborts ad/tracker requests.
+
+    Returns a mutable ``{"blocked": <count>}`` dict that the hook increments
+    for each request it aborts, or ``None`` if the hook could not be installed
+    (any exception raised while registering is swallowed).
+    """
+    try:
+        state: Dict[str, int] = {"blocked": 0}
+
+        def _route(route: Any) -> None:
+            # Abort matching non-document requests; let everything else through.
+            try:
+                request = route.request
+                url = str(getattr(request, "url", "") or "")
+                resource_type = str(getattr(request, "resource_type", "") or "").strip().lower()
+                # Requests whose resource type is "document" are never blocked.
+                if resource_type != "document" and _url_matches_adblock(url):
+                    state["blocked"] = int(state.get("blocked", 0)) + 1
+                    route.abort("blockedbyclient")
+                    return
+            except Exception:
+                # Fail open: any error while inspecting or aborting falls
+                # through to continue_() below.
+                # NOTE(review): if abort() itself raised, continue_() is then
+                # called on the same route and is not guarded — confirm this
+                # is acceptable.
+                pass
+            route.continue_()
+
+        # "**/*" routes every request made by the page through _route.
+        page.route("**/*", _route)
+        return state
+    except Exception:
+        return None
+
+
+def _remove_ad_elements(page: Any) -> int:
+    """Delete elements matching ``ADBLOCK_CSS_SELECTORS`` from the live page.
+
+    The selectors are handed to the in-page function as its argument rather
+    than being interpolated into the script text, so no Python-to-JavaScript
+    string quoting is involved.
+
+    Returns the number of elements removed as reported by the in-page script,
+    or ``0`` if evaluation fails for any reason (removal is best-effort).
+    """
+    script = """
+        (selectors) => {
+            const seen = new Set();
+            let removed = 0;
+            for (const selector of selectors) {
+                let nodes = [];
+                try {
+                    nodes = Array.from(document.querySelectorAll(selector));
+                } catch (e) {
+                    // Skip selectors the browser rejects as invalid.
+                    continue;
+                }
+                for (const node of nodes) {
+                    if (!(node instanceof Element)) continue;
+                    // A node may match several selectors; count it once.
+                    if (seen.has(node)) continue;
+                    seen.add(node);
+                    try {
+                        node.remove();
+                        removed += 1;
+                    } catch (e) {}
+                }
+            }
+            return removed;
+        }
+    """
+    try:
+        removed = page.evaluate(script, list(ADBLOCK_CSS_SELECTORS))
+        return int(removed or 0)
+    except Exception:
+        # Best-effort cleanup: a failure here must not abort the capture.
+        return 0
+
+
 def _jpeg_quality_from_level(level: int) -> int:
    normalized = _normalize_quality(level)
    if normalized >= 10:
@@ -577,6 +726,9 @@ def _prepare_capture_page(
    progress: PipelineProgress,
 ) -> str:
    navigation_status = "loaded"
+    adblock_state: Optional[Dict[str, int]] = None
+    if options.adblock:
+        adblock_state = _install_adblock(page)
    progress.step("loading navigating")
    try:
        tool.goto(page, options.url)
@@ -611,6 +763,14 @@ def _prepare_capture_page(
                });
            """
        )
+    removed_ads = 0
+    if options.adblock:
+        removed_ads = _remove_ad_elements(page)
+        blocked_count = int((adblock_state or {}).get("blocked", 0))
+        if blocked_count or removed_ads:
+            warnings.append(
+                f"adblock filtered {blocked_count} request(s) and removed {removed_ads} page element(s)"
+            )
    return navigation_status


@@ -1034,6 +1194,32 @@ def _capture_selector_screenshot(
    page.screenshot(**screenshot_kwargs)


+def _capture_mhtml(page: Any, destination: Path) -> None:
+    """Write an MHTML snapshot of *page* to *destination*.
+
+    Opens a CDP session through the page's browser context, issues
+    ``Page.captureSnapshot`` with ``format: mhtml``, and writes the returned
+    ``data`` string to *destination* as UTF-8 text.
+
+    Raises:
+        ScreenshotError: if the page's context lacks ``new_cdp_session``, if
+            the snapshot response contains no data, or if any other step
+            fails (the original exception is chained as the cause).
+    """
+    session = None
+    try:
+        context = getattr(page, "context", None)
+        if context is None or not hasattr(context, "new_cdp_session"):
+            raise ScreenshotError("MHTML output requires Chromium CDP session support")
+
+        session = context.new_cdp_session(page)
+        session.send("Page.enable")
+        snapshot = session.send("Page.captureSnapshot", {"format": "mhtml"})
+        data = snapshot.get("data") if isinstance(snapshot, dict) else None
+        if not data:
+            raise ScreenshotError("Chromium did not return any MHTML snapshot data")
+        # newline="" disables newline translation, so the snapshot's own line
+        # endings are written exactly as received.
+        destination.write_text(str(data), encoding="utf-8", newline="")
+    except ScreenshotError:
+        # Already a domain error with a specific message; do not re-wrap it.
+        raise
+    except Exception as exc:
+        raise ScreenshotError(f"Could not capture MHTML snapshot: {exc}") from exc
+    finally:
+        # Always release the CDP session; a failed detach is ignored.
+        if session is not None:
+            try:
+                session.detach()
+            except Exception:
+                pass
+
+
+
 def _convert_to_webp(
    src_png: Path,
    dst_webp: Path,
@@ -1364,7 +1550,7 @@ def _capture(
        format_name = _normalize_format(options.output_format)
        capture_headless = bool(options.headless)
        picker_headless = capture_headless
-        if options.interactive_pick and format_name != "pdf":
+        if options.interactive_pick and _format_supports_target_selection(format_name):
            picker_headless = False
            capture_headless = True
        elif format_name == "pdf":
@@ -1405,10 +1591,19 @@ def _capture(
            warnings.append(
                "pdf output requires headless Chromium; overriding headless mode"
            )
+        if not _format_supports_target_selection(format_name):
+            if options.interactive_pick:
+                warnings.append(
+                    f"{format_name} output captures the full page; interactive element picking is ignored"
+                )
+            if options.prefer_platform_target:
+                warnings.append(
+                    f"{format_name} output captures the full page; selector targeting is ignored"
+                )

        try:
            element_captured = False
-            if options.interactive_pick and format_name != "pdf":
+            if options.interactive_pick and _format_supports_target_selection(format_name):
                selected_selector = ""
                with tool.open_page(
                    headless=picker_headless,
@@ -1463,7 +1658,7 @@ def _capture(
                        progress,
                    )
                    # Attempt platform-specific target capture if requested (and not PDF)
-                    if options.prefer_platform_target and format_name != "pdf":
+                    if options.prefer_platform_target and _format_supports_target_selection(format_name):
                        progress.step("capturing locating target")
                        try:
                            _platform_preprocess(options.url, page, warnings)
@@ -1501,6 +1696,10 @@ def _capture(
                            page.emulate_media(media="print")
                            progress.step("capturing output")
                            page.pdf(path=str(destination), print_background=True)
+                        elif format_name == "mhtml":
+                            capture_mode = "mhtml"
+                            progress.step("capturing output")
+                            _capture_mhtml(page, destination)
                        else:
                            screenshot_kwargs: Dict[str, Any] = {
                                "path": str(destination)
@@ -1579,10 +1778,10 @@ def _capture_screenshot(
    capture_mode = ""
    capture_target = ""

-    will_target = bool(options.prefer_platform_target or options.interactive_pick) and requested_format != "pdf"
+    will_target = bool(options.prefer_platform_target or options.interactive_pick) and _format_supports_target_selection(requested_format)
    will_convert = requested_format == "webp"
    will_archive = bool(options.archive and options.url)
-    interactive_extra_steps = 5 if (options.interactive_pick and requested_format != "pdf") else 0
+    interactive_extra_steps = 5 if (options.interactive_pick and _format_supports_target_selection(requested_format)) else 0
    total_steps = (
        9 + (1 if will_target else 0) + interactive_extra_steps +
        (1 if will_convert else 0) + (1 if will_archive else 0)
@@ -1685,6 +1884,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    format_value = parsed.get("format")
    capture_mode_value = _normalize_capture_mode(parsed.get("capture_mode"))
    raw_quality_value = parsed.get("quality")
+    adblock_value = parsed.get("adblock")
    quality_value: Optional[int] = None
    if not format_value:
        try:
@@ -1709,6 +1909,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            quality_value = None
    if quality_value is None:
        quality_value = _normalize_quality(None)
+    adblock_enabled = _normalize_bool(adblock_value, default=True)

    storage_value = parsed.get("storage")
    selector_arg = parsed.get("selector")
@@ -1774,7 +1975,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
                ("archive", archive_enabled),
                ("format", format_name),
                ("quality", quality_value),
-                ("capture_mode", capture_mode_value or ("interactive" if interactive_default and format_name != "pdf" else "auto")),
+                ("adblock", adblock_enabled),
+                ("capture_mode", capture_mode_value or ("interactive" if interactive_default and _format_supports_target_selection(format_name) else "auto")),
                ("output_dir", screenshot_dir),
                ("output_dir_source", screenshot_dir_source),
            ],
@@ -1848,6 +2050,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
                full_page=True,
                interactive_pick=False,
                quality=quality_value,
+                adblock=adblock_enabled,
                playwright_tool=shared_playwright_tool,
            )

@@ -1860,7 +2063,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
                options.target_selectors = None
            elif capture_mode_value == "interactive":
                options.interactive_pick = True
-            elif interactive_default and format_name != "pdf":
+            elif interactive_default and _format_supports_target_selection(format_name):
                options.interactive_pick = True
            elif auto_selectors:
                options.prefer_platform_target = True
@@ -1957,29 +2160,43 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
 CMDLET = Cmdlet(
    name="screen-shot",
    summary="Capture a website screenshot",
-    usage="screen-shot <url> [options] [-query \"format:full quality:10\"]",
+    usage="screen-shot <url> [options] [-query \"format:webp quality:10 mode:full\"]",
    alias=["screenshot",
           "ss"],
    arg=[
        SharedArgs.URL,
-        CmdletArg(
-            name="format",
+        sh.QueryArg(
+            "format",
+            key="format",
            type="string",
-            description="Output format: webp, png, jpeg, or pdf"
+            choices=["webp", "png", "jpeg", "jpg", "pdf", "mhtml", "mht"],
+            query_only=True,
+            description="Output format via -query, e.g. format:webp, format:pdf, or format:mhtml"
        ),
        sh.QueryArg(
            "capture_mode",
-            key="format",
+            key="mode",
            aliases=["capture", "mode"],
+            choices=["full", "interactive"],
            query_only=True,
-            description="Capture mode via -query, e.g. format:full or format:interactive"
+            description="Capture mode via -query, e.g. mode:full or mode:interactive"
        ),
        sh.QueryArg(
            "quality",
            key="quality",
+            choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
            query_only=True,
            description="Screenshot quality via -query, 1-10. 10 uses highest quality and lossless webp."
        ),
+        sh.QueryArg(
+            "adblock",
+            key="adblock",
+            aliases=["ads", "blockads"],
+            choices=["true", "false", "on", "off", "yes", "no", "1", "0"],
+            handler=lambda value: _normalize_bool(value, default=True),
+            query_only=True,
+            description="Ad and tracker blocking via -query. Defaults to true; use adblock:false to disable."
+        ),
        CmdletArg(
            name="selector",
            type="string",
@@ -1991,9 +2208,13 @@ CMDLET = Cmdlet(
    detail=[
        "Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
        "PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
+        "MHTML output uses Chromium page snapshots to save the full page as a single archival file.",
+        "Basic ad and tracker blocking is enabled by default during capture so MHTML archives are less likely to embed ad content.",
        "Screenshots are temporary artifacts stored in the configured `temp` directory.",
        "Interactive single-URL runs open a headful browser picker by default so you can hover and click the element to capture.",
-        "Use -query \"format:full\" to bypass the picker and capture the full page directly.",
+        "Use -query \"mode:full\" to bypass the picker and capture the full page directly.",
+        "Use -query \"format:webp\", \"format:pdf\", or \"format:mhtml\" to choose the output format.",
+        "Use -query \"adblock:false\" if a site breaks and you need the raw unfiltered page.",
        "Use -query \"quality:1\" through \"quality:10\" to control jpeg/webp compression. quality:10 uses lossless webp.",
    ],
 )