added mhtml support and fixed some bugs in the process

This commit is contained in:
2026-04-22 21:19:55 -07:00
parent 90787bd0a2
commit 67c272db4b
9 changed files with 564 additions and 66 deletions
+235 -14
View File
@@ -73,6 +73,61 @@ DEFAULT_VIEWPORT: dict[str,
}
ARCHIVE_TIMEOUT = 30.0
# Host fragments of ad and tracker networks. `_url_matches_adblock` tests each
# entry against the hostname parsed from a request URL; a hit makes that URL
# count as ad traffic.
# NOTE(review): "adservice.google." ends with a dot, presumably so that every
# country-specific Google TLD matches — confirm.
ADBLOCK_HOST_PATTERNS: tuple[str, ...] = (
"doubleclick.net",
"googlesyndication.com",
"googleadservices.com",
"google-analytics.com",
"googletagmanager.com",
"googletagservices.com",
"adservice.google.",
"adsystem.com",
"adnxs.com",
"taboola.com",
"outbrain.com",
"criteo.com",
"casalemedia.com",
"rubiconproject.com",
"pubmatic.com",
"scorecardresearch.com",
"quantserve.com",
"zedo.com",
"moatads.com",
"amazon-adsystem.com",
"media.net",
)
# URL fragments that mark a request as ad traffic whatever its host.
# `_url_matches_adblock` searches for each entry anywhere in the lowercased
# request URL (host, path and query alike) once the host patterns have been tried.
ADBLOCK_URL_PATTERNS: tuple[str, ...] = (
"/ads/",
"?ads=",
"&ads=",
"advertisement",
"googlesyndication",
"doubleclick",
"adservice",
"adserver",
"prebid",
"taboola",
"outbrain",
"amazon-adsystem",
)
# CSS selectors for elements that `_remove_ad_elements` deletes from the page.
#
# The "ad-" / "ads-" entries are anchored to the start of the attribute value or
# to a delimiter (space, "-" or "_"). A bare substring test such as
# [id*='ad-'] also hits ordinary ids like "head-nav", "thread-42",
# "download-link" or "uploads-list" and would delete real page content.
ADBLOCK_CSS_SELECTORS: tuple[str, ...] = (
    # ids are a single token, so anchor at the start or at a "-" / "_" delimiter.
    "[id^='ad-']",
    "[id*='-ad-']",
    "[id*='_ad-']",
    "[id^='ads-']",
    "[id*='-ads-']",
    "[id*='_ads-']",
    # class attributes may hold several space-separated tokens.
    "[class^='ad-']",
    "[class*=' ad-']",
    "[class^='ads-']",
    "[class*=' ads-']",
    "[class*='-ads-']",
    "[class*='_ads-']",
    "[class*='advert']",
    "[id*='sponsor']",
    "[class*='sponsor']",
    # Embedded frames served by ad networks.
    "iframe[src*='doubleclick.net']",
    "iframe[src*='googlesyndication.com']",
    "iframe[src*='taboola.com']",
    "iframe[src*='outbrain.com']",
)
# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383
@@ -136,6 +191,7 @@ class ScreenshotOptions:
interactive_pick: bool = False
interactive_pick_timeout_s: float = 120.0
quality: int = 8
adblock: bool = True
playwright_tool: Optional[PlaywrightTool] = None
@@ -255,11 +311,14 @@ def _normalize_format(fmt: Optional[str]) -> str:
if not fmt:
return "webp"
value = fmt.strip().lower()
if value in {"mht", "mhtml"}:
return "mhtml"
if value in {"jpg",
"jpeg"}:
return "jpeg"
if value in {"png",
"pdf",
"mhtml",
"webp"}:
return value
return "webp"
@@ -281,6 +340,10 @@ def _normalize_capture_mode(value: Optional[str]) -> str:
return ""
def _format_supports_target_selection(fmt: Optional[str]) -> bool:
    """Tell whether *fmt* allows capturing a chosen element rather than the page.

    The format is normalised through ``_normalize_format`` first; the answer is
    ``False`` for ``pdf`` and ``mhtml`` and ``True`` for every other result.
    """
    normalized = _normalize_format(fmt)
    if normalized == "pdf" or normalized == "mhtml":
        return False
    return True
def _normalize_quality(value: Any) -> int:
try:
quality = int(str(value).strip())
@@ -289,6 +352,92 @@ def _normalize_quality(value: Any) -> int:
return max(1, min(10, quality))
def _normalize_bool(value: Any, *, default: bool = False) -> bool:
if value is None:
return bool(default)
if isinstance(value, bool):
return value
text = str(value).strip().lower()
if not text:
return bool(default)
if text in {"1", "true", "yes", "on", "enable", "enabled"}:
return True
if text in {"0", "false", "no", "off", "disable", "disabled"}:
return False
return bool(default)
def _host_matches_adblock_pattern(host: str, pattern: str) -> bool:
    """Return True when *host* is *pattern* or a subdomain of it."""
    if pattern.endswith("."):
        # Open-ended entry such as "adservice.google." — any TLD may follow,
        # but the entry must still begin on a label boundary.
        return host.startswith(pattern) or f".{pattern}" in host
    return host == pattern or host.endswith(f".{pattern}")


def _url_matches_adblock(url: str) -> bool:
    """Return True when *url* looks like an ad or tracker request.

    The hostname is checked against ``ADBLOCK_HOST_PATTERNS`` on domain-label
    boundaries, so ``media.net`` matches ``static.media.net`` but not
    ``socialmedia.net``. If no host pattern matches, the whole lowercased URL is
    searched for any ``ADBLOCK_URL_PATTERNS`` fragment.

    Args:
        url: Request URL; ``None``/empty/blank values are treated as no match.

    Returns:
        ``True`` if the URL matches a host or URL pattern, else ``False``.
    """
    lowered = str(url or "").strip().lower()
    if not lowered:
        return False
    try:
        host = str(urlsplit(lowered).hostname or "").strip().lower()
    except Exception:
        # Unparseable URL: skip the host check and rely on the URL fragments.
        host = ""
    # A fully-qualified host may carry a trailing dot ("doubleclick.net.").
    host = host.rstrip(".")
    if host and any(
        _host_matches_adblock_pattern(host, pattern)
        for pattern in ADBLOCK_HOST_PATTERNS
    ):
        return True
    return any(pattern in lowered for pattern in ADBLOCK_URL_PATTERNS)
def _install_adblock(page: Any) -> Optional[Dict[str, int]]:
try:
state: Dict[str, int] = {"blocked": 0}
def _route(route: Any) -> None:
try:
request = route.request
url = str(getattr(request, "url", "") or "")
resource_type = str(getattr(request, "resource_type", "") or "").strip().lower()
if resource_type != "document" and _url_matches_adblock(url):
state["blocked"] = int(state.get("blocked", 0)) + 1
route.abort("blockedbyclient")
return
except Exception:
pass
route.continue_()
page.route("**/*", _route)
return state
except Exception:
return None
def _remove_ad_elements(page: Any) -> int:
    """Delete elements matching ``ADBLOCK_CSS_SELECTORS`` from the loaded page.

    Args:
        page: Object exposing ``evaluate(script)``; the script runs inside the
            page and returns how many nodes it removed.

    Returns:
        The number of removed elements, or ``0`` if anything fails. This is
        best-effort cleanup, so no exception escapes.
    """
    import json  # function-scope so the block does not rely on a module-level import

    try:
        # json.dumps always yields a literal that is valid JavaScript; repr()
        # can emit Python-specific escape sequences that JavaScript misreads.
        selectors_json = json.dumps(list(ADBLOCK_CSS_SELECTORS))
        removed = page.evaluate(
            f"""
            () => {{
                const selectors = {selectors_json};
                const seen = new Set();
                let removed = 0;
                for (const selector of selectors) {{
                    let nodes = [];
                    try {{
                        nodes = Array.from(document.querySelectorAll(selector));
                    }} catch (e) {{
                        // Selector the browser rejects: skip it, keep the rest.
                        continue;
                    }}
                    for (const node of nodes) {{
                        if (!(node instanceof Element)) continue;
                        if (seen.has(node)) continue;
                        seen.add(node);
                        try {{
                            node.remove();
                            removed += 1;
                        }} catch (e) {{}}
                    }}
                }}
                return removed;
            }}
            """
        )
        return int(removed or 0)
    except Exception:
        return 0
def _jpeg_quality_from_level(level: int) -> int:
normalized = _normalize_quality(level)
if normalized >= 10:
@@ -577,6 +726,9 @@ def _prepare_capture_page(
progress: PipelineProgress,
) -> str:
navigation_status = "loaded"
adblock_state: Optional[Dict[str, int]] = None
if options.adblock:
adblock_state = _install_adblock(page)
progress.step("loading navigating")
try:
tool.goto(page, options.url)
@@ -611,6 +763,14 @@ def _prepare_capture_page(
});
"""
)
removed_ads = 0
if options.adblock:
removed_ads = _remove_ad_elements(page)
blocked_count = int((adblock_state or {}).get("blocked", 0))
if blocked_count or removed_ads:
warnings.append(
f"adblock filtered {blocked_count} request(s) and removed {removed_ads} page element(s)"
)
return navigation_status
@@ -1034,6 +1194,32 @@ def _capture_selector_screenshot(
page.screenshot(**screenshot_kwargs)
def _capture_mhtml(page: Any, destination: Path) -> None:
session = None
try:
context = getattr(page, "context", None)
if context is None or not hasattr(context, "new_cdp_session"):
raise ScreenshotError("MHTML output requires Chromium CDP session support")
session = context.new_cdp_session(page)
session.send("Page.enable")
snapshot = session.send("Page.captureSnapshot", {"format": "mhtml"})
data = snapshot.get("data") if isinstance(snapshot, dict) else None
if not data:
raise ScreenshotError("Chromium did not return any MHTML snapshot data")
destination.write_text(str(data), encoding="utf-8", newline="")
except ScreenshotError:
raise
except Exception as exc:
raise ScreenshotError(f"Could not capture MHTML snapshot: {exc}") from exc
finally:
if session is not None:
try:
session.detach()
except Exception:
pass
def _convert_to_webp(
src_png: Path,
dst_webp: Path,
@@ -1364,7 +1550,7 @@ def _capture(
format_name = _normalize_format(options.output_format)
capture_headless = bool(options.headless)
picker_headless = capture_headless
if options.interactive_pick and format_name != "pdf":
if options.interactive_pick and _format_supports_target_selection(format_name):
picker_headless = False
capture_headless = True
elif format_name == "pdf":
@@ -1405,10 +1591,19 @@ def _capture(
warnings.append(
"pdf output requires headless Chromium; overriding headless mode"
)
if not _format_supports_target_selection(format_name):
if options.interactive_pick:
warnings.append(
f"{format_name} output captures the full page; interactive element picking is ignored"
)
if options.prefer_platform_target:
warnings.append(
f"{format_name} output captures the full page; selector targeting is ignored"
)
try:
element_captured = False
if options.interactive_pick and format_name != "pdf":
if options.interactive_pick and _format_supports_target_selection(format_name):
selected_selector = ""
with tool.open_page(
headless=picker_headless,
@@ -1463,7 +1658,7 @@ def _capture(
progress,
)
# Attempt platform-specific target capture if requested (skipped for PDF and MHTML, which always capture the full page)
if options.prefer_platform_target and format_name != "pdf":
if options.prefer_platform_target and _format_supports_target_selection(format_name):
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
@@ -1501,6 +1696,10 @@ def _capture(
page.emulate_media(media="print")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
elif format_name == "mhtml":
capture_mode = "mhtml"
progress.step("capturing output")
_capture_mhtml(page, destination)
else:
screenshot_kwargs: Dict[str, Any] = {
"path": str(destination)
@@ -1579,10 +1778,10 @@ def _capture_screenshot(
capture_mode = ""
capture_target = ""
will_target = bool(options.prefer_platform_target or options.interactive_pick) and requested_format != "pdf"
will_target = bool(options.prefer_platform_target or options.interactive_pick) and _format_supports_target_selection(requested_format)
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
interactive_extra_steps = 5 if (options.interactive_pick and requested_format != "pdf") else 0
interactive_extra_steps = 5 if (options.interactive_pick and _format_supports_target_selection(requested_format)) else 0
total_steps = (
9 + (1 if will_target else 0) + interactive_extra_steps +
(1 if will_convert else 0) + (1 if will_archive else 0)
@@ -1685,6 +1884,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
format_value = parsed.get("format")
capture_mode_value = _normalize_capture_mode(parsed.get("capture_mode"))
raw_quality_value = parsed.get("quality")
adblock_value = parsed.get("adblock")
quality_value: Optional[int] = None
if not format_value:
try:
@@ -1709,6 +1909,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
quality_value = None
if quality_value is None:
quality_value = _normalize_quality(None)
adblock_enabled = _normalize_bool(adblock_value, default=True)
storage_value = parsed.get("storage")
selector_arg = parsed.get("selector")
@@ -1774,7 +1975,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
("archive", archive_enabled),
("format", format_name),
("quality", quality_value),
("capture_mode", capture_mode_value or ("interactive" if interactive_default and format_name != "pdf" else "auto")),
("adblock", adblock_enabled),
("capture_mode", capture_mode_value or ("interactive" if interactive_default and _format_supports_target_selection(format_name) else "auto")),
("output_dir", screenshot_dir),
("output_dir_source", screenshot_dir_source),
],
@@ -1848,6 +2050,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
full_page=True,
interactive_pick=False,
quality=quality_value,
adblock=adblock_enabled,
playwright_tool=shared_playwright_tool,
)
@@ -1860,7 +2063,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = None
elif capture_mode_value == "interactive":
options.interactive_pick = True
elif interactive_default and format_name != "pdf":
elif interactive_default and _format_supports_target_selection(format_name):
options.interactive_pick = True
elif auto_selectors:
options.prefer_platform_target = True
@@ -1957,29 +2160,43 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
CMDLET = Cmdlet(
name="screen-shot",
summary="Capture a website screenshot",
usage="screen-shot <url> [options] [-query \"format:full quality:10\"]",
usage="screen-shot <url> [options] [-query \"format:webp quality:10 mode:full\"]",
alias=["screenshot",
"ss"],
arg=[
SharedArgs.URL,
CmdletArg(
name="format",
sh.QueryArg(
"format",
key="format",
type="string",
description="Output format: webp, png, jpeg, or pdf"
choices=["webp", "png", "jpeg", "jpg", "pdf", "mhtml", "mht"],
query_only=True,
description="Output format via -query, e.g. format:webp, format:pdf, or format:mhtml"
),
sh.QueryArg(
"capture_mode",
key="format",
key="mode",
aliases=["capture", "mode"],
choices=["full", "interactive"],
query_only=True,
description="Capture mode via -query, e.g. format:full or format:interactive"
description="Capture mode via -query, e.g. mode:full or mode:interactive"
),
sh.QueryArg(
"quality",
key="quality",
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
query_only=True,
description="Screenshot quality via -query, 1-10. 10 uses highest quality and lossless webp."
),
sh.QueryArg(
"adblock",
key="adblock",
aliases=["ads", "blockads"],
choices=["true", "false", "on", "off", "yes", "no", "1", "0"],
handler=lambda value: _normalize_bool(value, default=True),
query_only=True,
description="Ad and tracker blocking via -query. Defaults to true; use adblock:false to disable."
),
CmdletArg(
name="selector",
type="string",
@@ -1991,9 +2208,13 @@ CMDLET = Cmdlet(
detail=[
"Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
"PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
"MHTML output uses Chromium page snapshots to save the full page as a single archival file.",
"Basic ad and tracker blocking is enabled by default during capture so MHTML archives are less likely to embed ad content.",
"Screenshots are temporary artifacts stored in the configured `temp` directory.",
"Interactive single-URL runs open a headful browser picker by default so you can hover and click the element to capture.",
"Use -query \"format:full\" to bypass the picker and capture the full page directly.",
"Use -query \"mode:full\" to bypass the picker and capture the full page directly.",
"Use -query \"format:webp\", \"format:pdf\", or \"format:mhtml\" to choose the output format.",
"Use -query \"adblock:false\" if a site breaks and you need the raw unfiltered page.",
"Use -query \"quality:1\" through \"quality:10\" to control jpeg/webp compression. quality:10 uses lossless webp.",
],
)