This commit is contained in:
nose
2025-12-17 17:42:46 -08:00
parent d2e7385280
commit 76691dbbf5
9 changed files with 762 additions and 119 deletions

View File

@@ -323,6 +323,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
try:
current_browser = getattr(tool.defaults, "browser", "").lower() if getattr(tool, "defaults", None) is not None else ""
if current_browser != "chromium":
debug(f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet")
tool = PlaywrightTool({"tool": {"playwright": {"browser": "chromium"}}})
except Exception:
tool = PlaywrightTool({"tool": {"playwright": {"browser": "chromium"}}})
tool.debug_dump()
log("Launching browser...", flush=True)
@@ -333,104 +343,114 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if format_name == "pdf" and not options.headless:
warnings.append("pdf output requires headless Chromium; overriding headless mode")
with tool.open_page(headless=headless) as page:
log(f"Navigating to {options.url}...", flush=True)
try:
tool.goto(page, options.url)
log("Page loaded successfully", flush=True)
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
log("Navigation timeout; proceeding with current state", flush=True)
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
try:
with tool.open_page(headless=headless) as page:
log(f"Navigating to {options.url}...", flush=True)
try:
log("Waiting for article element...", flush=True)
page.wait_for_selector("article", timeout=10_000)
log("Article element found", flush=True)
tool.goto(page, options.url)
log("Page loaded successfully", flush=True)
except PlaywrightTimeoutError:
warnings.append("<article> selector not found; capturing fallback")
log("Article element not found; using fallback", flush=True)
if options.wait_after_load > 0:
log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True)
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
if options.replace_video_posters:
log("Replacing video elements with posters...", flush=True)
page.evaluate(
"""
document.querySelectorAll('video').forEach(v => {
if (v.poster) {
const img = document.createElement('img');
img.src = v.poster;
img.style.maxWidth = '100%';
img.style.borderRadius = '12px';
v.replaceWith(img);
}
});
"""
)
# Attempt platform-specific target capture if requested (and not PDF)
element_captured = False
if options.prefer_platform_target and format_name != "pdf":
log("Attempting platform-specific content capture...", flush=True)
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
debug(f"[_capture] Platform preprocess failed: {e}")
pass
selectors = list(options.target_selectors or [])
if not selectors:
selectors = _selectors_for_url(options.url)
warnings.append("navigation timeout; capturing current page state")
log("Navigation timeout; proceeding with current state", flush=True)
debug(f"[_capture] Trying selectors: {selectors}")
for sel in selectors:
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
try:
log(f"Trying selector: {sel}", flush=True)
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
log("Waiting for article element...", flush=True)
page.wait_for_selector("article", timeout=10_000)
log("Article element found", flush=True)
except PlaywrightTimeoutError:
log(f"Selector not found: {sel}", flush=True)
continue
warnings.append("<article> selector not found; capturing fallback")
log("Article element not found; using fallback", flush=True)
if options.wait_after_load > 0:
log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True)
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
if options.replace_video_posters:
log("Replacing video elements with posters...", flush=True)
page.evaluate(
"""
document.querySelectorAll('video').forEach(v => {
if (v.poster) {
const img = document.createElement('img');
img.src = v.poster;
img.style.maxWidth = '100%';
img.style.borderRadius = '12px';
v.replaceWith(img);
}
});
"""
)
# Attempt platform-specific target capture if requested (and not PDF)
element_captured = False
if options.prefer_platform_target and format_name != "pdf":
log("Attempting platform-specific content capture...", flush=True)
try:
if el is not None:
log(f"Found element with selector: {sel}", flush=True)
try:
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
log(f"Capturing element to {destination}...", flush=True)
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
log("Element captured successfully", flush=True)
break
except Exception as exc:
warnings.append(f"element capture failed for '{sel}': {exc}")
log(f"Failed to capture element: {exc}", flush=True)
# Fallback to default capture paths
if element_captured:
pass
elif format_name == "pdf":
log("Generating PDF...", flush=True)
page.emulate_media(media="print")
page.pdf(path=str(destination), print_background=True)
log(f"PDF saved to {destination}", flush=True)
else:
log(f"Capturing full page to {destination}...", flush=True)
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
if format_name == "jpeg":
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
page.screenshot(full_page=True, **screenshot_kwargs)
_platform_preprocess(options.url, page, warnings)
except Exception as e:
debug(f"[_capture] Platform preprocess failed: {e}")
pass
selectors = list(options.target_selectors or [])
if not selectors:
selectors = _selectors_for_url(options.url)
debug(f"[_capture] Trying selectors: {selectors}")
for sel in selectors:
try:
log(f"Trying selector: {sel}", flush=True)
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
except PlaywrightTimeoutError:
log(f"Selector not found: {sel}", flush=True)
continue
try:
if el is not None:
log(f"Found element with selector: {sel}", flush=True)
try:
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
log(f"Capturing element to {destination}...", flush=True)
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
log("Element captured successfully", flush=True)
break
except Exception as exc:
warnings.append(f"element capture failed for '{sel}': {exc}")
log(f"Failed to capture element: {exc}", flush=True)
# Fallback to default capture paths
if element_captured:
pass
elif format_name == "pdf":
log("Generating PDF...", flush=True)
page.emulate_media(media="print")
page.pdf(path=str(destination), print_background=True)
log(f"PDF saved to {destination}", flush=True)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
article.screenshot(**article_kwargs)
log(f"Capturing full page to {destination}...", flush=True)
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
if format_name == "jpeg":
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
page.screenshot(full_page=True, **screenshot_kwargs)
else:
page.screenshot(**screenshot_kwargs)
log(f"Screenshot saved to {destination}", flush=True)
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
article.screenshot(**article_kwargs)
else:
page.screenshot(**screenshot_kwargs)
log(f"Screenshot saved to {destination}", flush=True)
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
if any(k in msg for k in ["executable", "not found", "no such file", "cannot find", "install"]):
raise ScreenshotError("Chromium Playwright browser binaries not found. Install them: python ./scripts/setup.py --playwright-only --browsers chromium") from exc
raise
except ScreenshotError:
# Re-raise ScreenshotError raised intentionally (do not wrap)
raise
except Exception as exc:
debug(f"[_capture] Exception: {exc}")
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
@@ -645,6 +665,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
try:
# Create screenshot with provided options
# Force the Playwright engine to Chromium for the screen-shot cmdlet
# (this ensures consistent rendering and supports PDF output requirements).
pw_local_cfg = {}
if isinstance(config, dict):
tool_block = dict(config.get("tool") or {})
pw_block = dict(tool_block.get("playwright") or {})
pw_block["browser"] = "chromium"
tool_block["playwright"] = pw_block
pw_local_cfg = dict(config)
pw_local_cfg["tool"] = tool_block
else:
pw_local_cfg = {"tool": {"playwright": {"browser": "chromium"}}}
options = ScreenshotOptions(
url=url,
output_dir=screenshot_dir,
@@ -654,7 +687,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
prefer_platform_target=False,
wait_for_article=False,
full_page=True,
playwright_tool=PlaywrightTool(config),
playwright_tool=PlaywrightTool(pw_local_cfg),
)
screenshot_result = _capture_screenshot(options)
@@ -744,12 +777,11 @@ CMDLET = Cmdlet(
CmdletArg(name="selector", type="string", description="CSS selector for element capture"),
],
detail=
["""
"""]
detail=[
"Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/setup.py --playwright-only --browsers chromium",
"PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
"Screenshots are temporary artifacts stored in the configured `temp` directory.",
]
)
# Bind the module's _run function as the `exec` attribute of the CMDLET object.
CMDLET.exec = _run