nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions


@@ -14,10 +14,11 @@ import httpx
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import urlsplit, quote, urljoin
from urllib.parse import urlsplit, quote, urljoin, unquote
from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from . import _shared as sh
@@ -31,54 +32,6 @@ get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
@@ -115,6 +68,10 @@ USER_AGENT = (
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
ARCHIVE_TIMEOUT = 30.0
# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383
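# Illustrative arithmetic only (hypothetical dimensions): how this cap maps to a
# downscale factor for an oversized full-page capture.
#   w, h = 1280, 45_000
#   scale = min(WEBP_MAX_DIM / w, WEBP_MAX_DIM / h)   # ~0.364
#   new_size = (int(w * scale), int(h * scale))       # roughly (466, 16383)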
# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
@@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str:
return slug[:100]
def _tags_from_url(url: str) -> List[str]:
"""Derive simple tags from a URL.
- site:<domain> (strips leading www.)
- title:<slug> derived from the last path segment, with extension removed
and separators (-, _, %) normalized to spaces.
"""
u = str(url or "").strip()
if not u:
return []
parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
except Exception:
parsed = None
host = ""
if host:
# Drop credentials and port if present.
if "@" in host:
host = host.rsplit("@", 1)[-1]
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]
path = ""
if parsed is not None:
try:
path = str(getattr(parsed, "path", "") or "")
except Exception:
path = ""
last = ""
if path:
try:
last = path.rsplit("/", 1)[-1]
except Exception:
last = ""
try:
last = unquote(last or "")
except Exception:
last = last or ""
if last and "." in last:
# Drop a single trailing extension (e.g. .html, .php).
last = last.rsplit(".", 1)[0]
for sep in ("_", "-", "%"):
if last and sep in last:
last = last.replace(sep, " ")
title = " ".join(str(last or "").split()).strip().lower()
tags: List[str] = []
if host:
tags.append(f"site:{host}")
if title:
tags.append(f"title:{title}")
return tags
def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return ""
def _normalise_format(fmt: Optional[str]) -> str:
"""Normalize output format to valid values."""
if not fmt:
@@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(
src_png: Path,
dst_webp: Path,
*,
quality: int = 90,
method: int = 6,
max_dim: int = WEBP_MAX_DIM,
downscale_if_oversize: bool = True,
) -> bool:
"""Convert a PNG screenshot to WebP via Pillow.
Playwright does not currently support emitting WebP directly.
"""
if not src_png or not Path(src_png).is_file():
raise ScreenshotError(f"Source image not found: {src_png}")
dst_webp = Path(dst_webp)
try:
dst_webp.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
try:
from PIL import Image
except Exception as exc:
raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc
# Write atomically to avoid partial files if conversion is interrupted.
tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp"))
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
if im.mode == "P":
im = im.convert("RGBA")
# WebP enforces a hard max dimension per side (16383px).
# When full-page captures are very tall, downscale proportionally to fit.
try:
w, h = im.size
except Exception:
w, h = 0, 0
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
except Exception:
scale = 1.0
if scale > 0.0 and scale < 1.0:
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
debug(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
im.save(tmp_path, **save_kwargs)
tmp_path.replace(dst_webp)
return bool(did_downscale)
finally:
try:
tmp_path.unlink(missing_ok=True)
except Exception:
pass
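# Minimal usage sketch (paths are hypothetical):
#   did_downscale = _convert_to_webp(Path("page.png"), Path("page.webp"), quality=90)
# A True return means the source exceeded WEBP_MAX_DIM on a side and was resized to
# fit; the caller below uses that to decide whether to keep the original PNG.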
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]:
return sels
def _selectors_for_url(url: str) -> List[str]:
"""Return selectors to try for a URL.
For now, keep behavior minimal: return only known SITE_SELECTORS.
(The cmdlet already falls back to full-page capture when no selectors match.)
"""
return _matched_site_selectors(url)
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
@@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
_step("loading launching browser")
progress.step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
progress.step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
progress.step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
progress.step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
_step("loading stabilized")
progress.step("loading stabilized")
_step("capturing preparing")
progress.step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
_step("capturing saved")
progress.step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
progress.step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
progress.step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
progress.step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
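# Worked example (flag names as above): a webp capture with archiving enabled but
# no matched target selectors gives total_steps = 9 + 0 + 1 + 1 = 11.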
_begin_live_steps(total_steps)
_step("loading starting")
progress.begin_steps(total_steps)
progress.step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
options.output_format = "png"
_capture(options, capture_path, warnings)
_capture(options, capture_path, warnings, progress)
if requested_format == "webp":
_step("capturing converting to webp")
progress.step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
did_downscale = _convert_to_webp(capture_path, destination)
if did_downscale:
warnings.append(
f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}"
)
else:
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as exc:
warnings.append(f"webp conversion failed; keeping png: {exc}")
destination = capture_path
@@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_step("capturing archiving")
progress.step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
progress.step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
@@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 1
progress = PipelineProgress(pipeline_context)
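# Note: this cmdlet relies only on the PipelineProgress methods exercised in this
# diff -- begin_steps(), step(), ensure_local_ui(), on_emit(), and close_local_ui().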
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
@@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
local_progress_ui = None
try:
existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
existing_ui = None
try:
if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
local_progress_ui.start()
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(local_progress_ui)
except Exception:
pass
try:
local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
except Exception:
pass
except Exception:
local_progress_ui = None
# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
@@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
ensure_directory(screenshot_dir)
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
try:
progress.ensure_local_ui(
label="screen-shot",
total_items=len(url_to_process),
items_preview=[u for u, _ in url_to_process],
)
except Exception:
pass
# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
@@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
screenshot_result = _capture_screenshot(options)
screenshot_result = _capture_screenshot(options, progress)
# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
@@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = datetime.now().date().isoformat()
upstream_title = _clean_title(_extract_item_title(origin_item))
display_title = upstream_title or url
url_title = _title_from_url(url)
display_title = upstream_title or url_title or url
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)
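# Illustrative only: for the hypothetical url "https://www.example.com/posts/My_First-Post.html"
# captured on 2025-12-22, merged_tags would resemble ["type:screenshot", "date:2025-12-22",
# <filtered upstream tags>, "site:example.com", "title:my first post"].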
pipe_obj = create_pipe_object_result(
@@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted.append(pipe_obj)
# If we created a local progress UI, advance it per completed item.
if local_progress_ui is not None:
try:
local_progress_ui.on_emit(0, pipe_obj)
except Exception:
pass
progress.on_emit(pipe_obj)
except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
traceback.print_exc(file=sys.stderr)
exit_code = 1
try:
if local_progress_ui is not None:
try:
local_progress_ui.finish_pipe(0, force_complete=True)
except Exception:
pass
finally:
if local_progress_ui is not None:
try:
local_progress_ui.stop()
except Exception:
pass
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(None)
except Exception:
pass
progress.close_local_ui(force_complete=True)
if not all_emitted:
log(f"No screenshots were successfully captured", file=sys.stderr)