added mhtml support and fixed some bugs in the process
This commit is contained in:
+235
-14
@@ -73,6 +73,61 @@ DEFAULT_VIEWPORT: dict[str,
|
||||
}
|
||||
ARCHIVE_TIMEOUT = 30.0
|
||||
|
||||
# Hostname patterns for ad and tracker services. `_url_matches_adblock`
# tests each entry against the hostname of an outgoing request; a hit means
# the request is treated as ad traffic.
ADBLOCK_HOST_PATTERNS: tuple[str, ...] = (
    "doubleclick.net",
    "googlesyndication.com",
    "googleadservices.com",
    "google-analytics.com",
    "googletagmanager.com",
    "googletagservices.com",
    # Trailing dot is intentional: the entry carries no top-level domain.
    "adservice.google.",
    "adsystem.com",
    "adnxs.com",
    "taboola.com",
    "outbrain.com",
    "criteo.com",
    "casalemedia.com",
    "rubiconproject.com",
    "pubmatic.com",
    "scorecardresearch.com",
    "quantserve.com",
    "zedo.com",
    "moatads.com",
    "amazon-adsystem.com",
    "media.net",
)
|
||||
|
||||
# Lower-case fragments looked for anywhere in a request URL (path and query
# included) by `_url_matches_adblock` once the hostname check has not matched.
ADBLOCK_URL_PATTERNS: tuple[str, ...] = (
    # Path / query-string markers.
    "/ads/",
    "?ads=",
    "&ads=",
    # Keywords that may appear in any part of the URL.
    "advertisement",
    "googlesyndication",
    "doubleclick",
    "adservice",
    "adserver",
    "prebid",
    "taboola",
    "outbrain",
    "amazon-adsystem",
)
|
||||
|
||||
# CSS selectors for elements that `_remove_ad_elements` strips from the page
# before capture.
#
# The "ad-" / "ads-" selectors are anchored to a token boundary (start of the
# attribute, a space between classes, or a "-" / "_" separator). An unanchored
# substring match such as [id*='ad-'] also hits ordinary content like
# "read-more", "load-more", "thread-12" or "downloads-list", which would
# delete real page content from the capture.
ADBLOCK_CSS_SELECTORS: tuple[str, ...] = (
    # id attribute: "ad-" / "ads-" at the start or after a separator.
    "[id^='ad-']",
    "[id*='-ad-']",
    "[id*='_ad-']",
    "[id^='ads-']",
    "[id*='-ads-']",
    "[id*='_ads-']",
    # class attribute: "ad-" / "ads-" starting a class token.
    "[class^='ad-']",
    "[class*=' ad-']",
    "[class^='ads-']",
    "[class*=' ads-']",
    "[class*='-ads-']",
    "[class*='_ads-']",
    # Distinctive keywords that are safe to match as plain substrings.
    "[class*='advert']",
    "[id*='sponsor']",
    "[class*='sponsor']",
    # Embedded frames served by known ad networks.
    "iframe[src*='doubleclick.net']",
    "iframe[src*='googlesyndication.com']",
    "iframe[src*='taboola.com']",
    "iframe[src*='outbrain.com']",
)
|
||||
|
||||
# WebP has a hard maximum dimension per side.
|
||||
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
|
||||
WEBP_MAX_DIM = 16_383
|
||||
@@ -136,6 +191,7 @@ class ScreenshotOptions:
|
||||
interactive_pick: bool = False
|
||||
interactive_pick_timeout_s: float = 120.0
|
||||
quality: int = 8
|
||||
adblock: bool = True
|
||||
playwright_tool: Optional[PlaywrightTool] = None
|
||||
|
||||
|
||||
@@ -255,11 +311,14 @@ def _normalize_format(fmt: Optional[str]) -> str:
|
||||
if not fmt:
|
||||
return "webp"
|
||||
value = fmt.strip().lower()
|
||||
if value in {"mht", "mhtml"}:
|
||||
return "mhtml"
|
||||
if value in {"jpg",
|
||||
"jpeg"}:
|
||||
return "jpeg"
|
||||
if value in {"png",
|
||||
"pdf",
|
||||
"mhtml",
|
||||
"webp"}:
|
||||
return value
|
||||
return "webp"
|
||||
@@ -281,6 +340,10 @@ def _normalize_capture_mode(value: Optional[str]) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _format_supports_target_selection(fmt: Optional[str]) -> bool:
    """Return True when *fmt* can capture a single element or selector.

    The format is normalized first; "pdf" and "mhtml" are the two formats
    reported as not supporting target selection.
    """
    normalized = _normalize_format(fmt)
    return not (normalized == "pdf" or normalized == "mhtml")
|
||||
|
||||
|
||||
def _normalize_quality(value: Any) -> int:
|
||||
try:
|
||||
quality = int(str(value).strip())
|
||||
@@ -289,6 +352,92 @@ def _normalize_quality(value: Any) -> int:
|
||||
return max(1, min(10, quality))
|
||||
|
||||
|
||||
def _normalize_bool(value: Any, *, default: bool = False) -> bool:
|
||||
if value is None:
|
||||
return bool(default)
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
text = str(value).strip().lower()
|
||||
if not text:
|
||||
return bool(default)
|
||||
if text in {"1", "true", "yes", "on", "enable", "enabled"}:
|
||||
return True
|
||||
if text in {"0", "false", "no", "off", "disable", "disabled"}:
|
||||
return False
|
||||
return bool(default)
|
||||
|
||||
|
||||
def _url_matches_adblock(url: str) -> bool:
|
||||
lowered = str(url or "").strip().lower()
|
||||
if not lowered:
|
||||
return False
|
||||
try:
|
||||
host = str(urlsplit(lowered).hostname or "").strip().lower()
|
||||
except Exception:
|
||||
host = ""
|
||||
if host and any(pattern in host for pattern in ADBLOCK_HOST_PATTERNS):
|
||||
return True
|
||||
return any(pattern in lowered for pattern in ADBLOCK_URL_PATTERNS)
|
||||
|
||||
|
||||
def _install_adblock(page: Any) -> Optional[Dict[str, int]]:
|
||||
try:
|
||||
state: Dict[str, int] = {"blocked": 0}
|
||||
|
||||
def _route(route: Any) -> None:
|
||||
try:
|
||||
request = route.request
|
||||
url = str(getattr(request, "url", "") or "")
|
||||
resource_type = str(getattr(request, "resource_type", "") or "").strip().lower()
|
||||
if resource_type != "document" and _url_matches_adblock(url):
|
||||
state["blocked"] = int(state.get("blocked", 0)) + 1
|
||||
route.abort("blockedbyclient")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
route.continue_()
|
||||
|
||||
page.route("**/*", _route)
|
||||
return state
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _remove_ad_elements(page: Any) -> int:
|
||||
try:
|
||||
selectors_json = repr(list(ADBLOCK_CSS_SELECTORS))
|
||||
removed = page.evaluate(
|
||||
f"""
|
||||
() => {{
|
||||
const selectors = {selectors_json};
|
||||
const seen = new Set();
|
||||
let removed = 0;
|
||||
for (const selector of selectors) {{
|
||||
let nodes = [];
|
||||
try {{
|
||||
nodes = Array.from(document.querySelectorAll(selector));
|
||||
}} catch (e) {{
|
||||
continue;
|
||||
}}
|
||||
for (const node of nodes) {{
|
||||
if (!(node instanceof Element)) continue;
|
||||
if (seen.has(node)) continue;
|
||||
seen.add(node);
|
||||
try {{
|
||||
node.remove();
|
||||
removed += 1;
|
||||
}} catch (e) {{}}
|
||||
}}
|
||||
}}
|
||||
return removed;
|
||||
}}
|
||||
"""
|
||||
)
|
||||
return int(removed or 0)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _jpeg_quality_from_level(level: int) -> int:
|
||||
normalized = _normalize_quality(level)
|
||||
if normalized >= 10:
|
||||
@@ -577,6 +726,9 @@ def _prepare_capture_page(
|
||||
progress: PipelineProgress,
|
||||
) -> str:
|
||||
navigation_status = "loaded"
|
||||
adblock_state: Optional[Dict[str, int]] = None
|
||||
if options.adblock:
|
||||
adblock_state = _install_adblock(page)
|
||||
progress.step("loading navigating")
|
||||
try:
|
||||
tool.goto(page, options.url)
|
||||
@@ -611,6 +763,14 @@ def _prepare_capture_page(
|
||||
});
|
||||
"""
|
||||
)
|
||||
removed_ads = 0
|
||||
if options.adblock:
|
||||
removed_ads = _remove_ad_elements(page)
|
||||
blocked_count = int((adblock_state or {}).get("blocked", 0))
|
||||
if blocked_count or removed_ads:
|
||||
warnings.append(
|
||||
f"adblock filtered {blocked_count} request(s) and removed {removed_ads} page element(s)"
|
||||
)
|
||||
return navigation_status
|
||||
|
||||
|
||||
@@ -1034,6 +1194,32 @@ def _capture_selector_screenshot(
|
||||
page.screenshot(**screenshot_kwargs)
|
||||
|
||||
|
||||
def _capture_mhtml(page: Any, destination: Path) -> None:
|
||||
session = None
|
||||
try:
|
||||
context = getattr(page, "context", None)
|
||||
if context is None or not hasattr(context, "new_cdp_session"):
|
||||
raise ScreenshotError("MHTML output requires Chromium CDP session support")
|
||||
|
||||
session = context.new_cdp_session(page)
|
||||
session.send("Page.enable")
|
||||
snapshot = session.send("Page.captureSnapshot", {"format": "mhtml"})
|
||||
data = snapshot.get("data") if isinstance(snapshot, dict) else None
|
||||
if not data:
|
||||
raise ScreenshotError("Chromium did not return any MHTML snapshot data")
|
||||
destination.write_text(str(data), encoding="utf-8", newline="")
|
||||
except ScreenshotError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise ScreenshotError(f"Could not capture MHTML snapshot: {exc}") from exc
|
||||
finally:
|
||||
if session is not None:
|
||||
try:
|
||||
session.detach()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _convert_to_webp(
|
||||
src_png: Path,
|
||||
dst_webp: Path,
|
||||
@@ -1364,7 +1550,7 @@ def _capture(
|
||||
format_name = _normalize_format(options.output_format)
|
||||
capture_headless = bool(options.headless)
|
||||
picker_headless = capture_headless
|
||||
if options.interactive_pick and format_name != "pdf":
|
||||
if options.interactive_pick and _format_supports_target_selection(format_name):
|
||||
picker_headless = False
|
||||
capture_headless = True
|
||||
elif format_name == "pdf":
|
||||
@@ -1405,10 +1591,19 @@ def _capture(
|
||||
warnings.append(
|
||||
"pdf output requires headless Chromium; overriding headless mode"
|
||||
)
|
||||
if not _format_supports_target_selection(format_name):
|
||||
if options.interactive_pick:
|
||||
warnings.append(
|
||||
f"{format_name} output captures the full page; interactive element picking is ignored"
|
||||
)
|
||||
if options.prefer_platform_target:
|
||||
warnings.append(
|
||||
f"{format_name} output captures the full page; selector targeting is ignored"
|
||||
)
|
||||
|
||||
try:
|
||||
element_captured = False
|
||||
if options.interactive_pick and format_name != "pdf":
|
||||
if options.interactive_pick and _format_supports_target_selection(format_name):
|
||||
selected_selector = ""
|
||||
with tool.open_page(
|
||||
headless=picker_headless,
|
||||
@@ -1463,7 +1658,7 @@ def _capture(
|
||||
progress,
|
||||
)
|
||||
# Attempt platform-specific target capture if requested (and not PDF)
|
||||
if options.prefer_platform_target and format_name != "pdf":
|
||||
if options.prefer_platform_target and _format_supports_target_selection(format_name):
|
||||
progress.step("capturing locating target")
|
||||
try:
|
||||
_platform_preprocess(options.url, page, warnings)
|
||||
@@ -1501,6 +1696,10 @@ def _capture(
|
||||
page.emulate_media(media="print")
|
||||
progress.step("capturing output")
|
||||
page.pdf(path=str(destination), print_background=True)
|
||||
elif format_name == "mhtml":
|
||||
capture_mode = "mhtml"
|
||||
progress.step("capturing output")
|
||||
_capture_mhtml(page, destination)
|
||||
else:
|
||||
screenshot_kwargs: Dict[str, Any] = {
|
||||
"path": str(destination)
|
||||
@@ -1579,10 +1778,10 @@ def _capture_screenshot(
|
||||
capture_mode = ""
|
||||
capture_target = ""
|
||||
|
||||
will_target = bool(options.prefer_platform_target or options.interactive_pick) and requested_format != "pdf"
|
||||
will_target = bool(options.prefer_platform_target or options.interactive_pick) and _format_supports_target_selection(requested_format)
|
||||
will_convert = requested_format == "webp"
|
||||
will_archive = bool(options.archive and options.url)
|
||||
interactive_extra_steps = 5 if (options.interactive_pick and requested_format != "pdf") else 0
|
||||
interactive_extra_steps = 5 if (options.interactive_pick and _format_supports_target_selection(requested_format)) else 0
|
||||
total_steps = (
|
||||
9 + (1 if will_target else 0) + interactive_extra_steps +
|
||||
(1 if will_convert else 0) + (1 if will_archive else 0)
|
||||
@@ -1685,6 +1884,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
format_value = parsed.get("format")
|
||||
capture_mode_value = _normalize_capture_mode(parsed.get("capture_mode"))
|
||||
raw_quality_value = parsed.get("quality")
|
||||
adblock_value = parsed.get("adblock")
|
||||
quality_value: Optional[int] = None
|
||||
if not format_value:
|
||||
try:
|
||||
@@ -1709,6 +1909,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
quality_value = None
|
||||
if quality_value is None:
|
||||
quality_value = _normalize_quality(None)
|
||||
adblock_enabled = _normalize_bool(adblock_value, default=True)
|
||||
|
||||
storage_value = parsed.get("storage")
|
||||
selector_arg = parsed.get("selector")
|
||||
@@ -1774,7 +1975,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
("archive", archive_enabled),
|
||||
("format", format_name),
|
||||
("quality", quality_value),
|
||||
("capture_mode", capture_mode_value or ("interactive" if interactive_default and format_name != "pdf" else "auto")),
|
||||
("adblock", adblock_enabled),
|
||||
("capture_mode", capture_mode_value or ("interactive" if interactive_default and _format_supports_target_selection(format_name) else "auto")),
|
||||
("output_dir", screenshot_dir),
|
||||
("output_dir_source", screenshot_dir_source),
|
||||
],
|
||||
@@ -1848,6 +2050,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
full_page=True,
|
||||
interactive_pick=False,
|
||||
quality=quality_value,
|
||||
adblock=adblock_enabled,
|
||||
playwright_tool=shared_playwright_tool,
|
||||
)
|
||||
|
||||
@@ -1860,7 +2063,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
options.target_selectors = None
|
||||
elif capture_mode_value == "interactive":
|
||||
options.interactive_pick = True
|
||||
elif interactive_default and format_name != "pdf":
|
||||
elif interactive_default and _format_supports_target_selection(format_name):
|
||||
options.interactive_pick = True
|
||||
elif auto_selectors:
|
||||
options.prefer_platform_target = True
|
||||
@@ -1957,29 +2160,43 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
CMDLET = Cmdlet(
|
||||
name="screen-shot",
|
||||
summary="Capture a website screenshot",
|
||||
usage="screen-shot <url> [options] [-query \"format:full quality:10\"]",
|
||||
usage="screen-shot <url> [options] [-query \"format:webp quality:10 mode:full\"]",
|
||||
alias=["screenshot",
|
||||
"ss"],
|
||||
arg=[
|
||||
SharedArgs.URL,
|
||||
CmdletArg(
|
||||
name="format",
|
||||
sh.QueryArg(
|
||||
"format",
|
||||
key="format",
|
||||
type="string",
|
||||
description="Output format: webp, png, jpeg, or pdf"
|
||||
choices=["webp", "png", "jpeg", "jpg", "pdf", "mhtml", "mht"],
|
||||
query_only=True,
|
||||
description="Output format via -query, e.g. format:webp, format:pdf, or format:mhtml"
|
||||
),
|
||||
sh.QueryArg(
|
||||
"capture_mode",
|
||||
key="format",
|
||||
key="mode",
|
||||
aliases=["capture", "mode"],
|
||||
choices=["full", "interactive"],
|
||||
query_only=True,
|
||||
description="Capture mode via -query, e.g. format:full or format:interactive"
|
||||
description="Capture mode via -query, e.g. mode:full or mode:interactive"
|
||||
),
|
||||
sh.QueryArg(
|
||||
"quality",
|
||||
key="quality",
|
||||
choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
|
||||
query_only=True,
|
||||
description="Screenshot quality via -query, 1-10. 10 uses highest quality and lossless webp."
|
||||
),
|
||||
sh.QueryArg(
|
||||
"adblock",
|
||||
key="adblock",
|
||||
aliases=["ads", "blockads"],
|
||||
choices=["true", "false", "on", "off", "yes", "no", "1", "0"],
|
||||
handler=lambda value: _normalize_bool(value, default=True),
|
||||
query_only=True,
|
||||
description="Ad and tracker blocking via -query. Defaults to true; use adblock:false to disable."
|
||||
),
|
||||
CmdletArg(
|
||||
name="selector",
|
||||
type="string",
|
||||
@@ -1991,9 +2208,13 @@ CMDLET = Cmdlet(
|
||||
detail=[
|
||||
"Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
|
||||
"PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
|
||||
"MHTML output uses Chromium page snapshots to save the full page as a single archival file.",
|
||||
"Basic ad and tracker blocking is enabled by default during capture so MHTML archives are less likely to embed ad content.",
|
||||
"Screenshots are temporary artifacts stored in the configured `temp` directory.",
|
||||
"Interactive single-URL runs open a headful browser picker by default so you can hover and click the element to capture.",
|
||||
"Use -query \"format:full\" to bypass the picker and capture the full page directly.",
|
||||
"Use -query \"mode:full\" to bypass the picker and capture the full page directly.",
|
||||
"Use -query \"format:webp\", \"format:pdf\", or \"format:mhtml\" to choose the output format.",
|
||||
"Use -query \"adblock:false\" if a site breaks and you need the raw unfiltered page.",
|
||||
"Use -query \"quality:1\" through \"quality:10\" to control jpeg/webp compression. quality:10 uses lossless webp.",
|
||||
],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user