jkj
This commit is contained in:
@@ -20,7 +20,16 @@ from urllib.parse import urlsplit, quote, urljoin
|
||||
from SYS.logger import log, debug
|
||||
from API.HTTP import HTTPClient
|
||||
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
|
||||
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, should_show_help, get_field
|
||||
from . import _shared as sh
|
||||
|
||||
Cmdlet = sh.Cmdlet
|
||||
CmdletArg = sh.CmdletArg
|
||||
SharedArgs = sh.SharedArgs
|
||||
create_pipe_object_result = sh.create_pipe_object_result
|
||||
normalize_result_input = sh.normalize_result_input
|
||||
should_show_help = sh.should_show_help
|
||||
get_field = sh.get_field
|
||||
parse_cmdlet_args = sh.parse_cmdlet_args
|
||||
import pipeline as pipeline_context
|
||||
|
||||
# ============================================================================
|
||||
@@ -33,20 +42,7 @@ import pipeline as pipeline_context
|
||||
# Playwright & Screenshot Dependencies
|
||||
# ============================================================================
|
||||
|
||||
try:
|
||||
from playwright.sync_api import (
|
||||
TimeoutError as PlaywrightTimeoutError,
|
||||
sync_playwright,
|
||||
)
|
||||
HAS_PLAYWRIGHT = True
|
||||
except Exception:
|
||||
HAS_PLAYWRIGHT = False
|
||||
PlaywrightTimeoutError = TimeoutError # type: ignore
|
||||
|
||||
def sync_playwright(*_args: Any, **_kwargs: Any) -> Any: # type: ignore
|
||||
raise RuntimeError(
|
||||
"playwright is required for screenshot capture; install with: pip install playwright; then: playwright install"
|
||||
)
|
||||
from tool.playwright import HAS_PLAYWRIGHT, PlaywrightTimeoutError, PlaywrightTool
|
||||
|
||||
try:
|
||||
from config import resolve_output_dir
|
||||
@@ -128,6 +124,7 @@ class ScreenshotOptions:
|
||||
prefer_platform_target: bool = False
|
||||
target_selectors: Optional[Sequence[str]] = None
|
||||
selector_timeout_ms: int = 10_000
|
||||
playwright_tool: Optional[PlaywrightTool] = None
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
@@ -324,142 +321,119 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
|
||||
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
|
||||
"""Capture screenshot using Playwright."""
|
||||
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
|
||||
playwright = None
|
||||
browser = None
|
||||
context = None
|
||||
try:
|
||||
debug("Starting Playwright...", flush=True)
|
||||
playwright = sync_playwright().start()
|
||||
log("Launching Chromium browser...", flush=True)
|
||||
tool = options.playwright_tool or PlaywrightTool({})
|
||||
tool.debug_dump()
|
||||
|
||||
log("Launching browser...", flush=True)
|
||||
format_name = _normalise_format(options.output_format)
|
||||
headless = options.headless or format_name == "pdf"
|
||||
debug(f"[_capture] Format: {format_name}, Headless: {headless}")
|
||||
|
||||
if format_name == "pdf" and not options.headless:
|
||||
warnings.append("pdf output requires headless Chromium; overriding headless mode")
|
||||
browser = playwright.chromium.launch(
|
||||
headless=headless,
|
||||
args=["--disable-blink-features=AutomationControlled"],
|
||||
)
|
||||
log("Creating browser context...", flush=True)
|
||||
context = browser.new_context(
|
||||
user_agent=USER_AGENT,
|
||||
viewport=DEFAULT_VIEWPORT,
|
||||
ignore_https_errors=True,
|
||||
)
|
||||
page = context.new_page()
|
||||
log(f"Navigating to {options.url}...", flush=True)
|
||||
try:
|
||||
page.goto(options.url, timeout=90_000, wait_until="domcontentloaded")
|
||||
log("Page loaded successfully", flush=True)
|
||||
except PlaywrightTimeoutError:
|
||||
warnings.append("navigation timeout; capturing current page state")
|
||||
log("Navigation timeout; proceeding with current state", flush=True)
|
||||
|
||||
# Skip article lookup by default (wait_for_article defaults to False)
|
||||
if options.wait_for_article:
|
||||
|
||||
with tool.open_page(headless=headless) as page:
|
||||
log(f"Navigating to {options.url}...", flush=True)
|
||||
try:
|
||||
log("Waiting for article element...", flush=True)
|
||||
page.wait_for_selector("article", timeout=10_000)
|
||||
log("Article element found", flush=True)
|
||||
tool.goto(page, options.url)
|
||||
log("Page loaded successfully", flush=True)
|
||||
except PlaywrightTimeoutError:
|
||||
warnings.append("<article> selector not found; capturing fallback")
|
||||
log("Article element not found; using fallback", flush=True)
|
||||
|
||||
if options.wait_after_load > 0:
|
||||
log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True)
|
||||
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
|
||||
if options.replace_video_posters:
|
||||
log("Replacing video elements with posters...", flush=True)
|
||||
page.evaluate(
|
||||
"""
|
||||
document.querySelectorAll('video').forEach(v => {
|
||||
if (v.poster) {
|
||||
const img = document.createElement('img');
|
||||
img.src = v.poster;
|
||||
img.style.maxWidth = '100%';
|
||||
img.style.borderRadius = '12px';
|
||||
v.replaceWith(img);
|
||||
}
|
||||
});
|
||||
"""
|
||||
)
|
||||
# Attempt platform-specific target capture if requested (and not PDF)
|
||||
element_captured = False
|
||||
if options.prefer_platform_target and format_name != "pdf":
|
||||
log("Attempting platform-specific content capture...", flush=True)
|
||||
try:
|
||||
_platform_preprocess(options.url, page, warnings)
|
||||
except Exception as e:
|
||||
debug(f"[_capture] Platform preprocess failed: {e}")
|
||||
pass
|
||||
selectors = list(options.target_selectors or [])
|
||||
if not selectors:
|
||||
selectors = _selectors_for_url(options.url)
|
||||
warnings.append("navigation timeout; capturing current page state")
|
||||
log("Navigation timeout; proceeding with current state", flush=True)
|
||||
|
||||
debug(f"[_capture] Trying selectors: {selectors}")
|
||||
for sel in selectors:
|
||||
# Skip article lookup by default (wait_for_article defaults to False)
|
||||
if options.wait_for_article:
|
||||
try:
|
||||
log(f"Trying selector: {sel}", flush=True)
|
||||
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
|
||||
log("Waiting for article element...", flush=True)
|
||||
page.wait_for_selector("article", timeout=10_000)
|
||||
log("Article element found", flush=True)
|
||||
except PlaywrightTimeoutError:
|
||||
log(f"Selector not found: {sel}", flush=True)
|
||||
continue
|
||||
warnings.append("<article> selector not found; capturing fallback")
|
||||
log("Article element not found; using fallback", flush=True)
|
||||
|
||||
if options.wait_after_load > 0:
|
||||
log(f"Waiting {options.wait_after_load}s for page stabilization...", flush=True)
|
||||
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
|
||||
if options.replace_video_posters:
|
||||
log("Replacing video elements with posters...", flush=True)
|
||||
page.evaluate(
|
||||
"""
|
||||
document.querySelectorAll('video').forEach(v => {
|
||||
if (v.poster) {
|
||||
const img = document.createElement('img');
|
||||
img.src = v.poster;
|
||||
img.style.maxWidth = '100%';
|
||||
img.style.borderRadius = '12px';
|
||||
v.replaceWith(img);
|
||||
}
|
||||
});
|
||||
"""
|
||||
)
|
||||
# Attempt platform-specific target capture if requested (and not PDF)
|
||||
element_captured = False
|
||||
if options.prefer_platform_target and format_name != "pdf":
|
||||
log("Attempting platform-specific content capture...", flush=True)
|
||||
try:
|
||||
if el is not None:
|
||||
log(f"Found element with selector: {sel}", flush=True)
|
||||
try:
|
||||
el.scroll_into_view_if_needed(timeout=1000)
|
||||
except Exception:
|
||||
pass
|
||||
log(f"Capturing element to {destination}...", flush=True)
|
||||
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
|
||||
element_captured = True
|
||||
log("Element captured successfully", flush=True)
|
||||
break
|
||||
except Exception as exc:
|
||||
warnings.append(f"element capture failed for '{sel}': {exc}")
|
||||
log(f"Failed to capture element: {exc}", flush=True)
|
||||
# Fallback to default capture paths
|
||||
if element_captured:
|
||||
pass
|
||||
elif format_name == "pdf":
|
||||
log("Generating PDF...", flush=True)
|
||||
page.emulate_media(media="print")
|
||||
page.pdf(path=str(destination), print_background=True)
|
||||
log(f"PDF saved to {destination}", flush=True)
|
||||
else:
|
||||
log(f"Capturing full page to {destination}...", flush=True)
|
||||
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
|
||||
if format_name == "jpeg":
|
||||
screenshot_kwargs["type"] = "jpeg"
|
||||
screenshot_kwargs["quality"] = 90
|
||||
if options.full_page:
|
||||
page.screenshot(full_page=True, **screenshot_kwargs)
|
||||
_platform_preprocess(options.url, page, warnings)
|
||||
except Exception as e:
|
||||
debug(f"[_capture] Platform preprocess failed: {e}")
|
||||
pass
|
||||
selectors = list(options.target_selectors or [])
|
||||
if not selectors:
|
||||
selectors = _selectors_for_url(options.url)
|
||||
|
||||
debug(f"[_capture] Trying selectors: {selectors}")
|
||||
for sel in selectors:
|
||||
try:
|
||||
log(f"Trying selector: {sel}", flush=True)
|
||||
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
|
||||
except PlaywrightTimeoutError:
|
||||
log(f"Selector not found: {sel}", flush=True)
|
||||
continue
|
||||
try:
|
||||
if el is not None:
|
||||
log(f"Found element with selector: {sel}", flush=True)
|
||||
try:
|
||||
el.scroll_into_view_if_needed(timeout=1000)
|
||||
except Exception:
|
||||
pass
|
||||
log(f"Capturing element to {destination}...", flush=True)
|
||||
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
|
||||
element_captured = True
|
||||
log("Element captured successfully", flush=True)
|
||||
break
|
||||
except Exception as exc:
|
||||
warnings.append(f"element capture failed for '{sel}': {exc}")
|
||||
log(f"Failed to capture element: {exc}", flush=True)
|
||||
# Fallback to default capture paths
|
||||
if element_captured:
|
||||
pass
|
||||
elif format_name == "pdf":
|
||||
log("Generating PDF...", flush=True)
|
||||
page.emulate_media(media="print")
|
||||
page.pdf(path=str(destination), print_background=True)
|
||||
log(f"PDF saved to {destination}", flush=True)
|
||||
else:
|
||||
article = page.query_selector("article")
|
||||
if article is not None:
|
||||
article_kwargs = dict(screenshot_kwargs)
|
||||
article_kwargs.pop("full_page", None)
|
||||
article.screenshot(**article_kwargs)
|
||||
log(f"Capturing full page to {destination}...", flush=True)
|
||||
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
|
||||
if format_name == "jpeg":
|
||||
screenshot_kwargs["type"] = "jpeg"
|
||||
screenshot_kwargs["quality"] = 90
|
||||
if options.full_page:
|
||||
page.screenshot(full_page=True, **screenshot_kwargs)
|
||||
else:
|
||||
page.screenshot(**screenshot_kwargs)
|
||||
log(f"Screenshot saved to {destination}", flush=True)
|
||||
article = page.query_selector("article")
|
||||
if article is not None:
|
||||
article_kwargs = dict(screenshot_kwargs)
|
||||
article_kwargs.pop("full_page", None)
|
||||
article.screenshot(**article_kwargs)
|
||||
else:
|
||||
page.screenshot(**screenshot_kwargs)
|
||||
log(f"Screenshot saved to {destination}", flush=True)
|
||||
except Exception as exc:
|
||||
debug(f"[_capture] Exception: {exc}")
|
||||
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
|
||||
finally:
|
||||
log("Cleaning up browser resources...", flush=True)
|
||||
with contextlib.suppress(Exception):
|
||||
if context is not None:
|
||||
context.close()
|
||||
with contextlib.suppress(Exception):
|
||||
if browser is not None:
|
||||
browser.close()
|
||||
with contextlib.suppress(Exception):
|
||||
if playwright is not None:
|
||||
playwright.stop()
|
||||
log("Cleanup complete", flush=True)
|
||||
|
||||
|
||||
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
|
||||
@@ -511,8 +485,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
Screenshots are created using Playwright and marked as temporary
|
||||
so they can be cleaned up later with the cleanup cmdlet.
|
||||
"""
|
||||
from ._shared import parse_cmdlet_args
|
||||
|
||||
debug(f"[_run] screen-shot invoked with args: {args}")
|
||||
|
||||
# Help check
|
||||
@@ -534,6 +506,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
parsed = parse_cmdlet_args(args, CMDLET)
|
||||
|
||||
format_value = parsed.get("format")
|
||||
if not format_value:
|
||||
# Default format can be set via config.conf tool block:
|
||||
# [tool=playwright]
|
||||
# format="pdf"
|
||||
try:
|
||||
tool_cfg = config.get("tool", {}) if isinstance(config, dict) else {}
|
||||
pw_cfg = tool_cfg.get("playwright") if isinstance(tool_cfg, dict) else None
|
||||
if isinstance(pw_cfg, dict):
|
||||
format_value = pw_cfg.get("format")
|
||||
except Exception:
|
||||
pass
|
||||
if not format_value:
|
||||
format_value = "png"
|
||||
storage_value = parsed.get("storage")
|
||||
selector_arg = parsed.get("selector")
|
||||
selectors = [selector_arg] if selector_arg else []
|
||||
@@ -669,6 +654,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
prefer_platform_target=False,
|
||||
wait_for_article=False,
|
||||
full_page=True,
|
||||
playwright_tool=PlaywrightTool(config),
|
||||
)
|
||||
|
||||
screenshot_result = _capture_screenshot(options)
|
||||
|
||||
Reference in New Issue
Block a user