This commit is contained in:
nose
2025-12-21 16:59:37 -08:00
parent 11a13edb84
commit d0b821b5dd
7 changed files with 508 additions and 136 deletions

View File

@@ -32,20 +32,52 @@ parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _set_live_step(text: str) -> None:
"""Best-effort update to the pipeline Live progress title (if enabled)."""
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
setter = getattr(ui, "set_active_subtask_text", None)
if callable(setter):
setter(str(text or "").strip())
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
pass
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
@@ -186,28 +218,6 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(source_path: Path, dest_path: Path) -> None:
"""Convert an image file to WebP using Pillow."""
from PIL import Image
with Image.open(source_path) as img:
# Keep a sensible default: good quality + small size.
img.save(dest_path, format="WEBP", quality=100, method=6)
def _selectors_for_url(url: str) -> List[str]:
"""Return a list of likely content selectors for known platforms."""
u = url.lower()
sels: List[str] = []
for domain, selectors in SITE_SELECTORS.items():
if domain in u:
sels.extend(selectors)
return sels or ["article"]
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -223,46 +233,47 @@ def _matched_site_selectors(url: str) -> List[str]:
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
u = url.lower()
try:
u = str(url or "").lower()
def _try_click_texts(texts: List[str], passes: int = 2, per_timeout: int = 700) -> int:
clicks = 0
for _ in range(max(1, passes)):
for t in texts:
try:
page.locator(f"text=/{t}/i").first.click(timeout=per_timeout)
clicks += 1
except PlaywrightTimeoutError:
pass
except Exception:
pass
time.sleep(0.1)
return clicks
def _try_click_buttons(names: List[str], passes: int = 2, per_timeout: int = 700) -> int:
clicks = 0
for _ in range(max(1, int(passes))):
for name in names:
try:
locator = page.get_by_role("button", name=name)
locator.first.click(timeout=int(per_timeout))
clicks += 1
except Exception:
pass
return clicks
# Dismiss common cookie/consent prompts
_try_click_texts(["accept", "i agree", "agree", "got it", "allow all", "consent"])
# Dismiss common cookie / consent prompts.
_try_click_buttons([
"Accept all",
"Accept",
"I agree",
"Agree",
"Allow all",
"OK",
])
# Platform-specific expansions
if "reddit.com" in u:
_try_click_texts(["see more", "read more", "show more", "more"])
if ("twitter.com" in u) or ("x.com" in u):
_try_click_texts(["show more", "more"])
if "instagram.com" in u:
_try_click_texts(["more", "see more"])
if "tiktok.com" in u:
_try_click_texts(["more", "see more"])
if ("facebook.com" in u) or ("fb.watch" in u):
_try_click_texts(["see more", "show more", "more"])
if "rumble.com" in u:
_try_click_texts(["accept", "agree", "close"])
# Some sites need small nudges (best-effort).
if "reddit.com" in u:
_try_click_buttons(["Accept all", "Accept"])
if ("twitter.com" in u) or ("x.com" in u):
_try_click_buttons(["Accept all", "Accept"])
if "instagram.com" in u:
_try_click_buttons(["Allow all", "Accept all", "Accept"])
except Exception as exc:
debug(f"[_platform_preprocess] skipped: {exc}")
return
def _submit_wayback(url: str, timeout: float) -> Optional[str]:
"""Submit URL to Internet Archive Wayback Machine."""
encoded = quote(url, safe="/:?=&")
with HTTPClient() as client:
with HTTPClient(headers={"User-Agent": USER_AGENT}) as client:
response = client.get(f"https://web.archive.org/save/{encoded}")
response.raise_for_status()
content_location = response.headers.get("Content-Location")
if content_location:
return urljoin("https://web.archive.org", content_location)
@@ -359,10 +370,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
# Two-phase Live progress:
# 1) load + stabilize (ends right after the wait_after_load sleep)
# 2) capture + save (and any post-processing)
_set_live_step("screen-shot: loading")
_step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -397,13 +405,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -419,8 +430,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
# Phase 2 begins here (per request).
_set_live_step("screen-shot: capturing")
_step("loading stabilized")
_step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -441,6 +453,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -465,6 +478,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -475,12 +489,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
pass
_step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -488,16 +504,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -519,6 +539,13 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
destination = _prepare_output_path(options)
warnings: List[str] = []
will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
_begin_live_steps(total_steps)
_step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
capture_path = destination
@@ -529,6 +556,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
_capture(options, capture_path, warnings)
if requested_format == "webp":
_step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
@@ -544,7 +572,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_set_live_step("screen-shot: archiving")
_step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -552,6 +580,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
return ScreenshotResult(
@@ -768,7 +798,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
continue
try:
_set_live_step("screen-shot: starting")
# Create screenshot with provided options
# Force the Playwright engine to Chromium for the screen-shot cmdlet
# (this ensures consistent rendering and supports PDF output requirements).