df
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
This commit is contained in:
@@ -37,7 +37,6 @@ import pipeline as pipeline_context
|
||||
# ============================================================================
|
||||
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Playwright & Screenshot Dependencies
|
||||
# ============================================================================
|
||||
@@ -104,7 +103,6 @@ SITE_SELECTORS: Dict[str, List[str]] = {
|
||||
}
|
||||
|
||||
|
||||
|
||||
class ScreenshotError(RuntimeError):
|
||||
"""Raised when screenshot capture or upload fails."""
|
||||
|
||||
@@ -146,6 +144,7 @@ class ScreenshotResult:
|
||||
# Helper Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _slugify_url(url: str) -> str:
|
||||
"""Convert URL to filesystem-safe slug."""
|
||||
parsed = urlsplit(url)
|
||||
@@ -172,7 +171,11 @@ def _tags_from_url(url: str) -> List[str]:
|
||||
parsed = None
|
||||
try:
|
||||
parsed = urlsplit(u)
|
||||
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
|
||||
host = (
|
||||
str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "")
|
||||
.strip()
|
||||
.lower()
|
||||
)
|
||||
except Exception:
|
||||
parsed = None
|
||||
host = ""
|
||||
@@ -300,7 +303,12 @@ def _convert_to_webp(
|
||||
except Exception:
|
||||
w, h = 0, 0
|
||||
|
||||
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
|
||||
if (
|
||||
downscale_if_oversize
|
||||
and isinstance(max_dim, int)
|
||||
and max_dim > 0
|
||||
and (w > max_dim or h > max_dim)
|
||||
):
|
||||
scale = 1.0
|
||||
try:
|
||||
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
|
||||
@@ -320,7 +328,9 @@ def _convert_to_webp(
|
||||
im = im.resize((new_w, new_h), resample=resample)
|
||||
did_downscale = True
|
||||
except Exception as exc:
|
||||
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
|
||||
debug(
|
||||
f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}"
|
||||
)
|
||||
|
||||
im.save(tmp_path, **save_kwargs)
|
||||
|
||||
@@ -332,6 +342,7 @@ def _convert_to_webp(
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _matched_site_selectors(url: str) -> List[str]:
|
||||
"""Return SITE_SELECTORS for a matched domain; empty if no match.
|
||||
|
||||
@@ -355,7 +366,9 @@ def _selectors_for_url(url: str) -> List[str]:
|
||||
return _matched_site_selectors(url)
|
||||
|
||||
|
||||
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
|
||||
def _platform_preprocess(
|
||||
url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000
|
||||
) -> None:
|
||||
"""Best-effort page tweaks for popular platforms before capture."""
|
||||
try:
|
||||
u = str(url or "").lower()
|
||||
@@ -373,14 +386,16 @@ def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: i
|
||||
return clicks
|
||||
|
||||
# Dismiss common cookie / consent prompts.
|
||||
_try_click_buttons([
|
||||
"Accept all",
|
||||
"Accept",
|
||||
"I agree",
|
||||
"Agree",
|
||||
"Allow all",
|
||||
"OK",
|
||||
])
|
||||
_try_click_buttons(
|
||||
[
|
||||
"Accept all",
|
||||
"Accept",
|
||||
"I agree",
|
||||
"Agree",
|
||||
"Allow all",
|
||||
"OK",
|
||||
]
|
||||
)
|
||||
|
||||
# Some sites need small nudges (best-effort).
|
||||
if "reddit.com" in u:
|
||||
@@ -490,7 +505,9 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
|
||||
return unique_path(path)
|
||||
|
||||
|
||||
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
|
||||
def _capture(
|
||||
options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress
|
||||
) -> None:
|
||||
"""Capture screenshot using Playwright."""
|
||||
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
|
||||
try:
|
||||
@@ -499,16 +516,24 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
|
||||
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
|
||||
try:
|
||||
current_browser = getattr(tool.defaults, "browser", "").lower() if getattr(tool, "defaults", None) is not None else ""
|
||||
current_browser = (
|
||||
getattr(tool.defaults, "browser", "").lower()
|
||||
if getattr(tool, "defaults", None) is not None
|
||||
else ""
|
||||
)
|
||||
if current_browser != "chromium":
|
||||
debug(f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet")
|
||||
debug(
|
||||
f"[_capture] Overriding Playwright browser '{current_browser}' -> 'chromium' for screen-shot cmdlet"
|
||||
)
|
||||
base_cfg = {}
|
||||
try:
|
||||
base_cfg = dict(getattr(tool, "_config", {}) or {})
|
||||
except Exception:
|
||||
base_cfg = {}
|
||||
tool_block = dict(base_cfg.get("tool") or {}) if isinstance(base_cfg, dict) else {}
|
||||
pw_block = dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
|
||||
pw_block = (
|
||||
dict(tool_block.get("playwright") or {}) if isinstance(tool_block, dict) else {}
|
||||
)
|
||||
pw_block["browser"] = "chromium"
|
||||
tool_block["playwright"] = pw_block
|
||||
if isinstance(base_cfg, dict):
|
||||
@@ -523,7 +548,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
format_name = _normalise_format(options.output_format)
|
||||
headless = options.headless or format_name == "pdf"
|
||||
debug(f"[_capture] Format: {format_name}, Headless: {headless}")
|
||||
|
||||
|
||||
if format_name == "pdf" and not options.headless:
|
||||
warnings.append("pdf output requires headless Chromium; overriding headless mode")
|
||||
|
||||
@@ -539,7 +564,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
warnings.append("navigation timeout; capturing current page state")
|
||||
debug("Navigation timeout; proceeding with current state")
|
||||
progress.step("loading navigation timeout")
|
||||
|
||||
|
||||
# Skip article lookup by default (wait_for_article defaults to False)
|
||||
if options.wait_for_article:
|
||||
try:
|
||||
@@ -549,7 +574,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
except PlaywrightTimeoutError:
|
||||
warnings.append("<article> selector not found; capturing fallback")
|
||||
debug("Article element not found; using fallback")
|
||||
|
||||
|
||||
if options.wait_after_load > 0:
|
||||
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
|
||||
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
|
||||
@@ -591,7 +616,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
for sel in selectors:
|
||||
try:
|
||||
debug(f"Trying selector: {sel}")
|
||||
el = page.wait_for_selector(sel, timeout=max(0, int(options.selector_timeout_ms)))
|
||||
el = page.wait_for_selector(
|
||||
sel, timeout=max(0, int(options.selector_timeout_ms))
|
||||
)
|
||||
except PlaywrightTimeoutError:
|
||||
debug(f"Selector not found: {sel}")
|
||||
continue
|
||||
@@ -604,7 +631,10 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
pass
|
||||
progress.step("capturing output")
|
||||
debug(f"Capturing element to {destination}...")
|
||||
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
|
||||
el.screenshot(
|
||||
path=str(destination),
|
||||
type=("jpeg" if format_name == "jpeg" else None),
|
||||
)
|
||||
element_captured = True
|
||||
debug("Element captured successfully")
|
||||
break
|
||||
@@ -645,8 +675,13 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str],
|
||||
except Exception as exc:
|
||||
debug(f"[_capture] Exception launching browser/page: {exc}")
|
||||
msg = str(exc).lower()
|
||||
if any(k in msg for k in ["executable", "not found", "no such file", "cannot find", "install"]):
|
||||
raise ScreenshotError("Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium") from exc
|
||||
if any(
|
||||
k in msg
|
||||
for k in ["executable", "not found", "no such file", "cannot find", "install"]
|
||||
):
|
||||
raise ScreenshotError(
|
||||
"Chromium Playwright browser binaries not found. Install them: python ./scripts/bootstrap.py --playwright-only --browsers chromium"
|
||||
) from exc
|
||||
raise
|
||||
except ScreenshotError:
|
||||
# Re-raise ScreenshotError raised intentionally (do not wrap)
|
||||
@@ -666,7 +701,9 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
|
||||
will_target = bool(options.prefer_platform_target) and requested_format != "pdf"
|
||||
will_convert = requested_format == "webp"
|
||||
will_archive = bool(options.archive and options.url)
|
||||
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
|
||||
total_steps = (
|
||||
9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
|
||||
)
|
||||
progress.begin_steps(total_steps)
|
||||
progress.step("loading starting")
|
||||
|
||||
@@ -726,19 +763,20 @@ def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress)
|
||||
# Main Cmdlet Function
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
"""Take screenshots of url in the pipeline.
|
||||
|
||||
|
||||
Accepts:
|
||||
- Single result object (dict or PipeObject) with 'path' field
|
||||
- List of result objects to screenshot each
|
||||
- Direct URL as string
|
||||
|
||||
|
||||
Emits PipeObject-formatted results for each screenshot with:
|
||||
- action: 'cmdlet:screen-shot'
|
||||
- is_temp: True (screenshots are temporary artifacts)
|
||||
- parent_id: hash of the original file/URL
|
||||
|
||||
|
||||
Screenshots are created using Playwright and marked as temporary
|
||||
so they can be cleaned up later with the cleanup cmdlet.
|
||||
"""
|
||||
@@ -761,9 +799,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
# ========================================================================
|
||||
# ARGUMENT PARSING
|
||||
# ========================================================================
|
||||
|
||||
|
||||
parsed = parse_cmdlet_args(args, CMDLET)
|
||||
|
||||
|
||||
format_value = parsed.get("format")
|
||||
if not format_value:
|
||||
# Default format can be set via config.conf tool block:
|
||||
@@ -782,7 +820,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
selector_arg = parsed.get("selector")
|
||||
selectors = [selector_arg] if selector_arg else []
|
||||
archive_enabled = parsed.get("archive", False)
|
||||
|
||||
|
||||
# Positional URL argument (if provided)
|
||||
url_arg = parsed.get("url")
|
||||
positional_url = [str(url_arg)] if url_arg else []
|
||||
@@ -801,15 +839,11 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
# Extract url from piped results
|
||||
if piped_results:
|
||||
for item in piped_results:
|
||||
url = (
|
||||
get_field(item, 'path')
|
||||
or get_field(item, 'url')
|
||||
or get_field(item, 'target')
|
||||
)
|
||||
url = get_field(item, "path") or get_field(item, "url") or get_field(item, "target")
|
||||
|
||||
if url:
|
||||
url_to_process.append((str(url), item))
|
||||
|
||||
|
||||
if not url_to_process:
|
||||
log(f"No url to process for screen-shot cmdlet", file=sys.stderr)
|
||||
return 1
|
||||
@@ -819,9 +853,9 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
# ========================================================================
|
||||
# OUTPUT DIRECTORY RESOLUTION - Priority chain
|
||||
# ========================================================================
|
||||
|
||||
|
||||
screenshot_dir: Optional[Path] = None
|
||||
|
||||
|
||||
# Primary: Use --storage if provided (highest priority)
|
||||
if storage_value:
|
||||
try:
|
||||
@@ -830,7 +864,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
except ValueError as e:
|
||||
log(str(e), file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
# Secondary: Use config-based resolver ONLY if --storage not provided
|
||||
if screenshot_dir is None and resolve_output_dir is not None:
|
||||
try:
|
||||
@@ -838,7 +872,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
debug(f"[screen_shot] Using config resolver: {screenshot_dir}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# Tertiary: Use config outfile ONLY if neither --storage nor resolver worked
|
||||
if screenshot_dir is None and config and config.get("outfile"):
|
||||
try:
|
||||
@@ -846,12 +880,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
debug(f"[screen_shot] Using config outfile: {screenshot_dir}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# Default: User's Videos directory
|
||||
if screenshot_dir is None:
|
||||
screenshot_dir = Path.home() / "Videos"
|
||||
debug(f"[screen_shot] Using default directory: {screenshot_dir}")
|
||||
|
||||
|
||||
ensure_directory(screenshot_dir)
|
||||
|
||||
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
|
||||
@@ -869,21 +903,21 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
# ========================================================================
|
||||
# PREPARE SCREENSHOT OPTIONS
|
||||
# ========================================================================
|
||||
|
||||
|
||||
format_name = _normalise_format(format_value)
|
||||
filtered_selectors = [str(s).strip() for s in selectors if str(s).strip()]
|
||||
manual_target_selectors = filtered_selectors if filtered_selectors else None
|
||||
|
||||
|
||||
all_emitted = []
|
||||
exit_code = 0
|
||||
# ========================================================================
|
||||
# PROCESS url AND CAPTURE SCREENSHOTS
|
||||
# ========================================================================
|
||||
|
||||
|
||||
def _extract_item_tags(item: Any) -> List[str]:
|
||||
if item is None:
|
||||
return []
|
||||
raw = get_field(item, 'tag')
|
||||
raw = get_field(item, "tag")
|
||||
if isinstance(raw, list):
|
||||
return [str(t) for t in raw if t is not None and str(t).strip()]
|
||||
if isinstance(raw, str) and raw.strip():
|
||||
@@ -913,7 +947,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
if not url.lower().startswith(("http://", "https://", "file://")):
|
||||
log(f"[screen_shot] Skipping non-URL input: {url}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
|
||||
try:
|
||||
# Create screenshot with provided options
|
||||
# Force the Playwright engine to Chromium for the screen-shot cmdlet
|
||||
@@ -966,28 +1000,32 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
options.prefer_platform_target = True
|
||||
options.target_selectors = auto_selectors
|
||||
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
|
||||
|
||||
|
||||
screenshot_result = _capture_screenshot(options, progress)
|
||||
|
||||
|
||||
# Log results and warnings
|
||||
debug(f"Screenshot captured to {screenshot_result.path}")
|
||||
if screenshot_result.archive_url:
|
||||
debug(f"Archives: {', '.join(screenshot_result.archive_url)}")
|
||||
for warning in screenshot_result.warnings:
|
||||
debug(f"Warning: {warning}")
|
||||
|
||||
|
||||
# Compute hash of screenshot file
|
||||
screenshot_hash = None
|
||||
try:
|
||||
with open(screenshot_result.path, 'rb') as f:
|
||||
with open(screenshot_result.path, "rb") as f:
|
||||
screenshot_hash = hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# Create PipeObject result - marked as TEMP since derivative artifact
|
||||
capture_date = ""
|
||||
try:
|
||||
capture_date = datetime.fromtimestamp(screenshot_result.path.stat().st_mtime).date().isoformat()
|
||||
capture_date = (
|
||||
datetime.fromtimestamp(screenshot_result.path.stat().st_mtime)
|
||||
.date()
|
||||
.isoformat()
|
||||
)
|
||||
except Exception:
|
||||
capture_date = datetime.now().date().isoformat()
|
||||
|
||||
@@ -997,7 +1035,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
|
||||
upstream_tags = _extract_item_tags(origin_item)
|
||||
filtered_upstream_tags = [
|
||||
t for t in upstream_tags
|
||||
t
|
||||
for t in upstream_tags
|
||||
if not str(t).strip().lower().startswith(("type:", "date:"))
|
||||
]
|
||||
|
||||
@@ -1007,40 +1046,41 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
)
|
||||
|
||||
pipe_obj = create_pipe_object_result(
|
||||
source='screenshot',
|
||||
store='PATH',
|
||||
source="screenshot",
|
||||
store="PATH",
|
||||
identifier=Path(screenshot_result.path).stem,
|
||||
file_path=str(screenshot_result.path),
|
||||
cmdlet_name='screen-shot',
|
||||
cmdlet_name="screen-shot",
|
||||
title=display_title,
|
||||
hash_value=screenshot_hash,
|
||||
is_temp=True,
|
||||
parent_hash=hashlib.sha256(url.encode()).hexdigest(),
|
||||
tag=merged_tags,
|
||||
extra={
|
||||
'source_url': url,
|
||||
'archive_url': screenshot_result.archive_url,
|
||||
'url': screenshot_result.url,
|
||||
'target': str(screenshot_result.path), # Explicit target for add-file
|
||||
}
|
||||
"source_url": url,
|
||||
"archive_url": screenshot_result.archive_url,
|
||||
"url": screenshot_result.url,
|
||||
"target": str(screenshot_result.path), # Explicit target for add-file
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# Emit the result so downstream cmdlet (like add-file) can use it
|
||||
pipeline_context.emit(pipe_obj)
|
||||
all_emitted.append(pipe_obj)
|
||||
|
||||
# If we created a local progress UI, advance it per completed item.
|
||||
progress.on_emit(pipe_obj)
|
||||
|
||||
|
||||
except ScreenshotError as exc:
|
||||
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
|
||||
exit_code = 1
|
||||
except Exception as exc:
|
||||
log(f"Unexpected error taking screenshot of {url}: {exc}", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc(file=sys.stderr)
|
||||
exit_code = 1
|
||||
|
||||
|
||||
progress.close_local_ui(force_complete=True)
|
||||
|
||||
if not all_emitted:
|
||||
@@ -1051,6 +1091,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
|
||||
log(f"✓ Successfully captured {len(all_emitted)} screenshot(s)")
|
||||
|
||||
return exit_code
|
||||
|
||||
|
||||
CMDLET = Cmdlet(
|
||||
name="screen-shot",
|
||||
summary="Capture a website screenshot",
|
||||
@@ -1058,16 +1100,17 @@ CMDLET = Cmdlet(
|
||||
alias=["screenshot", "ss"],
|
||||
arg=[
|
||||
SharedArgs.URL,
|
||||
CmdletArg(name="format", type="string", description="Output format: webp, png, jpeg, or pdf"),
|
||||
CmdletArg(
|
||||
name="format", type="string", description="Output format: webp, png, jpeg, or pdf"
|
||||
),
|
||||
CmdletArg(name="selector", type="string", description="CSS selector for element capture"),
|
||||
SharedArgs.PATH
|
||||
|
||||
SharedArgs.PATH,
|
||||
],
|
||||
detail=[
|
||||
"Uses Playwright Chromium engine only. Install Chromium with: python ./scripts/bootstrap.py --playwright-only --browsers chromium",
|
||||
"PDF output requires headless Chromium (the cmdlet will enforce headless mode for PDF).",
|
||||
"Screenshots are temporary artifacts stored in the configured `temp` directory.",
|
||||
]
|
||||
],
|
||||
)
|
||||
|
||||
CMDLET.exec = _run
|
||||
|
||||
Reference in New Issue
Block a user