dfd
@@ -1585,9 +1585,46 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
"warnings", "path", "relationships", "is_temp", "action", "parent_hash",
}

# Convert ResultItem to dict to preserve all attributes
# Convert common object-like results into a dict so we can preserve fields like
# hash/store/url when they come from result tables (e.g., get-url emits UrlItem).
#
# Priority:
# 1) explicit to_dict()
# 2) best-effort attribute extraction for known PipeObject-ish fields
if hasattr(value, 'to_dict'):
value = value.to_dict()
elif not isinstance(value, dict):
try:
obj_map: Dict[str, Any] = {}
for k in (
"hash",
"store",
"provider",
"prov",
"tag",
"title",
"url",
"source_url",
"duration",
"duration_seconds",
"metadata",
"full_metadata",
"warnings",
"path",
"target",
"relationships",
"is_temp",
"action",
"parent_hash",
"extra",
"media_kind",
):
if hasattr(value, k):
obj_map[k] = getattr(value, k)
if obj_map:
value = obj_map
except Exception:
pass

if isinstance(value, dict):
# Extract hash and store (canonical identifiers)
@@ -1695,8 +1732,19 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
# Fallback: build from path argument or bare value
hash_val = "unknown"
path_val = default_path or getattr(value, "path", None)
url_val: Optional[str] = None
title_val = None

# If the raw value is a string, treat it as either a URL or a file path.
# This is important for @-selection results that are plain URL strings.
if isinstance(value, str):
s = value.strip()
if s.lower().startswith(("http://", "https://")):
url_val = s
path_val = None
else:
path_val = s

if path_val and path_val != "unknown":
try:
from SYS.utils import sha256_file
@@ -1708,8 +1756,9 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
except Exception:
pass

# When coming from path argument, store should be "PATH" (file path, not a backend)
store_val = "PATH"
# When coming from a raw URL string, mark it explicitly as URL.
# Otherwise treat it as a local path.
store_val = "URL" if url_val else "PATH"

pipe_obj = models.PipeObject(
hash=hash_val,
@@ -1717,6 +1766,8 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
provider=None,
path=str(path_val) if path_val and path_val != "unknown" else None,
title=title_val,
url=url_val,
source_url=url_val,
tag=[],
extra={},
)

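# Illustrative sketch (not part of this commit): how the attribute-extraction
# fallback above turns an object-like result row into a dict. `row` is a
# hypothetical stand-in for something like get-url's UrlItem.
from types import SimpleNamespace
row = SimpleNamespace(hash="abc123", store="hydrus", url="https://example.com/page")
obj_map = {}
for k in ("hash", "store", "url", "title"):
    if hasattr(row, k):
        obj_map[k] = getattr(row, k)
print(obj_map)  # {'hash': 'abc123', 'store': 'hydrus', 'url': 'https://example.com/page'}
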
@@ -12,6 +12,7 @@ import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from . import _shared as sh
@@ -73,6 +74,7 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
parsed = parse_cmdlet_args(args, self)
progress = PipelineProgress(ctx)

path_arg = parsed.get("path")
location = parsed.get("store")
@@ -80,6 +82,35 @@ class Add_File(Cmdlet):
provider_room = parsed.get("room")
delete_after = parsed.get("delete", False)

# Convenience: when piping a file into add-file, allow `-path <existing dir>`
# to act as the destination export directory.
# Example: screen-shot "https://..." | add-file -path "C:\Users\Admin\Desktop"
if path_arg and not location and not provider_name:
try:
candidate_dir = Path(str(path_arg))
if candidate_dir.exists() and candidate_dir.is_dir():
piped_items = result if isinstance(result, list) else [result]
has_local_source = False
for it in piped_items:
try:
po = coerce_to_pipe_object(it, None)
src = str(getattr(po, "path", "") or "").strip()
if not src:
continue
if src.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
continue
if Path(src).is_file():
has_local_source = True
break
except Exception:
continue
if has_local_source:
debug(f"[add-file] Treating -path directory as destination: {candidate_dir}")
location = str(candidate_dir)
path_arg = None
except Exception:
pass

stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))

@@ -93,7 +124,7 @@ class Add_File(Cmdlet):
is_storage_backend_location = False

# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - If user provided -path (and it was not reinterpreted as destination), treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
@@ -102,6 +133,17 @@ class Add_File(Cmdlet):
else:
items_to_process = [result]

# Minimal step-based progress for single-item runs.
# Many add-file flows don't emit intermediate items, so without steps the pipe can look "stuck".
use_steps = False
steps_started = False
step2_done = False
try:
ui, _ = progress.ui_and_pipe_index()
use_steps = (ui is not None) and (len(items_to_process) == 1)
except Exception:
use_steps = False

debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
@@ -235,6 +277,14 @@ class Add_File(Cmdlet):
failures += 1
continue

is_url_target = isinstance(media_path_or_url, str) and str(media_path_or_url).lower().startswith(
("http://", "https://", "magnet:", "torrent:")
)
if use_steps and (not steps_started) and (not is_url_target):
progress.begin_steps(3)
progress.step("resolving source")
steps_started = True

# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)

@@ -300,13 +350,34 @@ class Add_File(Cmdlet):
pass

temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))

# Wire OpenLibrary download progress into pipeline Live UI (no tqdm spam).
def _ol_progress(kind: str, completed: int, total: Optional[int], label: str) -> None:
try:
if kind == "pages" and total:
progress.set_status(f"downloading pages {completed}/{total}")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
elif kind == "bytes" and total:
progress.set_status(f"downloading {label} {completed}/{total} bytes")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
else:
progress.set_status("downloading")
except Exception:
return

try:
progress.set_percent(0)
progress.set_status("downloading openlibrary")
except Exception:
pass

sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
downloaded = provider.download(sr, temp_dir_to_cleanup, progress_callback=_ol_progress)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
@@ -325,6 +396,13 @@ class Add_File(Cmdlet):
pipe_obj.path = str(downloaded_path)
delete_after_item = True

try:
if ui is not None:
ui.set_pipe_percent(int(pipe_idx), 100)
ui.set_pipe_status_text(int(pipe_idx), "downloaded")
except Exception:
pass

# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
@@ -562,6 +640,10 @@ class Add_File(Cmdlet):
failures += 1
continue

if use_steps and steps_started and (not step2_done):
progress.step("writing destination")
step2_done = True

if code == 0:
successes += 1
else:
@@ -619,6 +701,9 @@ class Add_File(Cmdlet):
except Exception:
pass

if use_steps and steps_started:
progress.step("finalized")

if successes > 0:
return 0
return 1

@@ -34,6 +34,19 @@ class Add_Url(sh.Cmdlet):
"""Add URL to file via hash+store backend."""
parsed = sh.parse_cmdlet_args(args, self)

# Compatibility/piping fix:
# `SharedArgs.QUERY` is positional in the shared parser, so `add-url <url>`
# (and `@N | add-url <url>`) can mistakenly parse the URL into `query`.
# If `url` is missing and `query` looks like an http(s) URL, treat it as `url`.
try:
if (not parsed.get("url")) and isinstance(parsed.get("query"), str):
q = str(parsed.get("query") or "").strip()
if q.startswith(("http://", "https://")):
parsed["url"] = q
parsed.pop("query", None)
except Exception:
pass

query_hash = sh.parse_single_hash_query(parsed.get("query"))
if parsed.get("query") and not query_hash:
log("Error: -query must be of the form hash:<sha256>")

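# Illustrative sketch (hypothetical parsed dict, not part of this commit): the
# compatibility shim above moves an http(s) value from the positional `query`
# slot into `url` so `add-url <url>` keeps working.
parsed = {"query": "https://example.com/watch?v=abc"}
if (not parsed.get("url")) and isinstance(parsed.get("query"), str):
    q = parsed["query"].strip()
    if q.startswith(("http://", "https://")):
        parsed["url"] = q
        parsed.pop("query", None)
print(parsed)  # {'url': 'https://example.com/watch?v=abc'}
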
@@ -29,7 +29,7 @@ class Delete_Url(Cmdlet):
arg=[
SharedArgs.QUERY,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
CmdletArg("url", required=False, description="URL to remove (optional when piping url rows)"),
],
detail=[
"- Removes URL association from file identified by hash+store",
@@ -69,22 +69,24 @@ class Delete_Url(Cmdlet):
log("Error: No store name provided")
return 1

if not url_arg:
log("Error: No URL provided")
return 1

# Normalize hash (single-item mode)
if not results and file_hash:
file_hash = normalize_hash(file_hash)
if not file_hash:
log("Error: Invalid hash format")
return 1

# Parse url (comma-separated)
urls = [u.strip() for u in str(url_arg).split(',') if u.strip()]
if not urls:
log("Error: No valid url provided")
return 1

from metadata import normalize_urls

def _urls_from_arg(raw: Any) -> List[str]:
if raw is None:
return []
# Support comma-separated input for backwards compatibility
if isinstance(raw, str) and "," in raw:
return [u.strip() for u in raw.split(",") if u.strip()]
return [u.strip() for u in normalize_urls(raw) if str(u).strip()]

urls_from_cli = _urls_from_arg(url_arg)

# Get backend and delete url
try:
@@ -145,7 +147,17 @@ class Delete_Url(Cmdlet):
)
continue

batch.setdefault(store_text, []).append((normalized, list(urls)))
# Determine which URLs to delete.
# - If user passed an explicit <url>, apply it to all items.
# - Otherwise, when piping url rows from get-url, delete the url(s) from each item.
item_urls = list(urls_from_cli)
if not item_urls:
item_urls = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
if not item_urls:
ctx.print_if_visible("[delete-url] Warning: Item has no url field; skipping", file=sys.stderr)
continue

batch.setdefault(store_text, []).append((normalized, item_urls))

for store_text, pairs in batch.items():
try:
@@ -168,24 +180,39 @@ class Delete_Url(Cmdlet):
for h, ulist in bulk_pairs:
backend.delete_url(h, ulist, config=config)

deleted_count = 0
for _h, ulist in bulk_pairs:
deleted_count += len(ulist or [])
ctx.print_if_visible(
f"✓ delete-url: {len(urls)} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
f"✓ delete-url: {deleted_count} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
file=sys.stderr,
)

for item in pass_through:
existing = get_field(item, "url")
_set_item_url(item, _remove_urls(existing, list(urls)))
# In batch mode we removed the union of requested urls for the file.
# Using urls_from_cli (if present) matches the user's explicit intent; otherwise
# remove the piped url row(s).
remove_set = urls_from_cli
if not remove_set:
remove_set = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
_set_item_url(item, _remove_urls(existing, list(remove_set)))
ctx.emit(item)
return 0

# Single-item mode
if not urls_from_cli:
urls_from_cli = [u.strip() for u in normalize_urls(get_field(result, "url") or get_field(result, "source_url")) if str(u).strip()]
if not urls_from_cli:
log("Error: No URL provided")
return 1

backend = storage[str(store_name)]
backend.delete_url(str(file_hash), urls, config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls)} url(s) removed", file=sys.stderr)
backend.delete_url(str(file_hash), list(urls_from_cli), config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls_from_cli)} url(s) removed", file=sys.stderr)
if result is not None:
existing = get_field(result, "url")
_set_item_url(result, _remove_urls(existing, list(urls)))
_set_item_url(result, _remove_urls(existing, list(urls_from_cli)))
ctx.emit(result)
return 0

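# Illustrative sketch (standalone, not part of this commit): the comma-separated
# branch of _urls_from_arg above splits an explicit -url argument into a list;
# other inputs go through metadata.normalize_urls instead.
raw = "https://a.example/1, https://b.example/2"
urls = [u.strip() for u in raw.split(",") if u.strip()] if "," in raw else [raw]
print(urls)  # ['https://a.example/1', 'https://b.example/2']
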
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -126,7 +126,7 @@ class Get_File(sh.Cmdlet):
except Exception as exc:
log(f"Error opening browser: {exc}", file=sys.stderr)
else:
log(f"Opened in browser: {source_path}", file=sys.stderr)
debug(f"Opened in browser: {source_path}", file=sys.stderr)

# Emit result for pipeline
ctx.emit({

@@ -47,6 +47,210 @@ except ImportError:
extract_title = None


def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
"""Deduplicate tags case-insensitively while preserving order."""
out: List[str] = []
seen: set[str] = set()
for t in tags or []:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
key = s.lower()
if key in seen:
continue
seen.add(key)
out.append(s)
return out


def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
"""Extract subtitle availability tags from a yt-dlp info dict.

Produces multi-valued tags so languages can coexist:
- subs:<lang>
- subs_auto:<lang>
"""
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
langs: List[str] = []
for k in value.keys():
if not isinstance(k, str):
continue
lang = k.strip().lower()
if lang:
langs.append(lang)
return sorted(set(langs))

out: List[str] = []
for lang in _langs(info.get("subtitles")):
out.append(f"subs:{lang}")
for lang in _langs(info.get("automatic_captions")):
out.append(f"subs_auto:{lang}")
return out

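# Illustrative sketch (hypothetical info dict, not part of this commit): the
# helper above maps subtitle/auto-caption language keys to multi-valued tags.
sample_info = {
    "subtitles": {"en": [], "fr": []},
    "automatic_captions": {"de": []},
}
print(_extract_subtitle_tags(sample_info))
# ['subs:en', 'subs:fr', 'subs_auto:de']
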
def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
"""Fetch a yt-dlp info dict without downloading media."""
if not isinstance(url, str) or not url.strip():
return None
url = url.strip()

# Prefer the Python module when available (faster, avoids shell quoting issues).
try:
import yt_dlp # type: ignore
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl:
info = ydl.extract_info(url, download=False)
return info if isinstance(info, dict) else None
except Exception:
pass

# Fallback to yt-dlp CLI if the module isn't available.
try:
import json as json_module
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode != 0:
return None
payload = (result.stdout or "").strip()
if not payload:
return None
data = json_module.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None


def _resolve_candidate_urls_for_item(
result: Any,
backend: Any,
file_hash: str,
config: Dict[str, Any],
) -> List[str]:
"""Get candidate URLs from backend and/or piped result."""
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]

urls: List[str] = []
# 1) Backend URL association (best source of truth)
try:
backend_urls = backend.get_url(file_hash, config=config)
if backend_urls:
if normalize_urls:
urls.extend(normalize_urls(backend_urls))
else:
urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
except Exception:
pass

# 2) Backend metadata url field
try:
meta = backend.get_metadata(file_hash, config=config)
if isinstance(meta, dict) and meta.get("url"):
if normalize_urls:
urls.extend(normalize_urls(meta.get("url")))
else:
raw = meta.get("url")
if isinstance(raw, list):
urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
elif isinstance(raw, str) and raw.strip():
urls.append(raw.strip())
except Exception:
pass

# 3) Piped result fields
def _get(obj: Any, key: str, default: Any = None) -> Any:
if isinstance(obj, dict):
return obj.get(key, default)
return getattr(obj, key, default)

for key in ("url", "webpage_url", "source_url", "target"):
val = _get(result, key, None)
if not val:
continue
if normalize_urls:
urls.extend(normalize_urls(val))
continue
if isinstance(val, str) and val.strip():
urls.append(val.strip())
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])

meta_field = _get(result, "metadata", None)
if isinstance(meta_field, dict) and meta_field.get("url"):
val = meta_field.get("url")
if normalize_urls:
urls.extend(normalize_urls(val))
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
elif isinstance(val, str) and val.strip():
urls.append(val.strip())

# Dedup
return _dedup_tags_preserve_order(urls)


def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
"""Pick the first URL that looks supported by yt-dlp (best effort)."""
if not urls:
return None

def _is_hydrus_file_url(u: str) -> bool:
text = str(u or "").strip().lower()
if not text:
return False
# Hydrus-local file URLs are retrievable blobs, not original source pages.
# yt-dlp generally can't extract meaningful metadata from these.
return ("/get_files/file" in text) and ("hash=" in text)

http_urls: List[str] = []
for u in urls:
text = str(u or "").strip()
if text.lower().startswith(("http://", "https://")):
http_urls.append(text)

# Prefer non-Hydrus URLs for yt-dlp scraping.
candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
if not candidates:
return None

# Prefer a true support check when the Python module is available.
try:
from SYS.download import is_url_supported_by_ytdlp
for text in candidates:
try:
if is_url_supported_by_ytdlp(text):
return text
except Exception:
continue
except Exception:
pass

# Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
return candidates[0] if candidates else None

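# Illustrative sketch (hypothetical URLs, not part of this commit): the picker
# above skips Hydrus /get_files/file blob URLs and returns an original page URL
# (or None when only Hydrus blob URLs are present).
candidate_urls = [
    "http://127.0.0.1:45869/get_files/file?hash=abcd",  # Hydrus blob -> ignored
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # original page -> picked
]
print(_pick_supported_ytdlp_url(candidate_urls))
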
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]

@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
scrape_url = parsed_args.get("scrape")
scrape_requested = scrape_flag_present or scrape_url is not None

if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
# Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
scrape_url = "ytdlp"
scrape_requested = True

if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
log("-scrape requires a URL or provider name", file=sys.stderr)
return 1

@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if scrape_requested and scrape_url:
import json as json_module

if str(scrape_url).strip().lower() == "ytdlp":
# Scrape metadata from the selected item's URL via yt-dlp (no download),
# then OVERWRITE all existing tags (including title:).
#
# This mode requires a store-backed item (hash + store).
#
# NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
# performs namespace deduplication that would collapse multi-valued tags.
file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
store_name = get_field(result, "store", None)
subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

# Only run overwrite-apply when the item is store-backed.
# If this is a URL-only PipeObject, fall through to provider mode below.
if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
try:
from Store import Store
storage = Store(config)
backend = storage[str(store_name)]
except Exception as exc:
log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
return 1

candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
scrape_target = _pick_supported_ytdlp_url(candidate_urls)
if not scrape_target:
log(
"No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ",
file=sys.stderr,
)
log(
"Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
file=sys.stderr,
)
return 1

info = _scrape_ytdlp_info(scrape_target)
if not info:
log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
return 1

try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]

# Prefer the top-level metadata, but if this is a playlist container, use
# the first entry for per-item fields like subtitles.
info_for_subs = info
entries = info.get("entries") if isinstance(info, dict) else None
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first

tags: List[str] = []
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(info))
except Exception:
pass

# Subtitle availability tags
try:
tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
except Exception:
pass

# Ensure we actually have something to apply.
tags = _dedup_tags_preserve_order(tags)
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1

# Full overwrite: delete all existing tags, then add the new set.
try:
existing_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
existing_tags = []
try:
if existing_tags:
backend.delete_tag(file_hash, list(existing_tags), config=config)
except Exception as exc:
debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")
try:
backend.add_tag(file_hash, list(tags), config=config)
except Exception as exc:
log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
return 1

# Show updated tags
try:
updated_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
updated_tags = tags
if not updated_tags:
updated_tags = tags

_emit_tags_as_table(
tags_list=list(updated_tags),
file_hash=file_hash,
store=str(store_name),
service_name=None,
config=config,
item_title=str(item_title or "ytdlp"),
path=str(subject_path) if subject_path else None,
subject={
"hash": file_hash,
"store": str(store_name),
"path": str(subject_path) if subject_path else None,
"title": item_title,
"extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
},
)
return 0

if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
# URL scraping (existing behavior)
title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
else:
combined_query = f"{title_hint} {artist_hint}"

query_hint = identifier_query or combined_query or title_hint
# yt-dlp isn't a search provider; it requires a URL.
url_hint: Optional[str] = None
if provider.name == "ytdlp":
raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
if isinstance(raw_url, list) and raw_url:
raw_url = raw_url[0]
if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
url_hint = raw_url.strip()

query_hint = url_hint or identifier_query or combined_query or title_hint
if not query_hint:
log("No title or identifier available to search for metadata", file=sys.stderr)
return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if not items:
log("No metadata results found", file=sys.stderr)
return 1

# For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
if provider.name == "ytdlp":
try:
tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
except Exception:
tags = []
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1
_emit_tags_as_table(
tags_list=list(tags),
file_hash=None,
store="url",
service_name=None,
config=config,
item_title=str(items[0].get("title") or "ytdlp"),
path=None,
subject={"provider": "ytdlp", "url": str(query_hint)},
)
return 0

from result_table import ResultTable
table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
return 0

# Apply tags to the store backend (no sidecar writing here).
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if str(result_provider).strip().lower() == "ytdlp":
apply_tags = [str(t) for t in result_tags if t is not None]
else:
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if not apply_tags:
log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
_SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]

# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
_SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)


class Get_Tag(Cmdlet):
"""Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
CmdletArg(
name="-scrape",
type="string",
description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
required=False,
choices=_SCRAPE_CHOICES,
)

@@ -14,10 +14,11 @@ import httpx
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import urlsplit, quote, urljoin
from urllib.parse import urlsplit, quote, urljoin, unquote

from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from . import _shared as sh

@@ -31,54 +32,6 @@ get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context


def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None

pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0

return ui, pipe_idx


def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return


def _step(text: str) -> None:
"""Emit a *new* step.

Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return

# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
@@ -115,6 +68,10 @@ USER_AGENT = (
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
ARCHIVE_TIMEOUT = 30.0

# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383

# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
@@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str:
return slug[:100]


def _tags_from_url(url: str) -> List[str]:
"""Derive simple tags from a URL.

- site:<domain> (strips leading www.)
- title:<slug> derived from the last path segment, with extension removed
and separators (-, _, %) normalized to spaces.
"""

u = str(url or "").strip()
if not u:
return []

parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
except Exception:
parsed = None
host = ""

if host:
# Drop credentials and port if present.
if "@" in host:
host = host.rsplit("@", 1)[-1]
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]

path = ""
if parsed is not None:
try:
path = str(getattr(parsed, "path", "") or "")
except Exception:
path = ""

last = ""
if path:
try:
last = path.rsplit("/", 1)[-1]
except Exception:
last = ""

try:
last = unquote(last or "")
except Exception:
last = last or ""

if last and "." in last:
# Drop a single trailing extension (e.g. .html, .php).
last = last.rsplit(".", 1)[0]

for sep in ("_", "-", "%"):
if last and sep in last:
last = last.replace(sep, " ")

title = " ".join(str(last or "").split()).strip().lower()

tags: List[str] = []
if host:
tags.append(f"site:{host}")
if title:
tags.append(f"title:{title}")
return tags


def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return ""

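# Illustrative sketch (hypothetical URL, not part of this commit): expected
# output of the URL-tag helpers above.
print(_tags_from_url("https://www.example.com/articles/My_First-Post.html"))
# ['site:example.com', 'title:my first post']
print(_title_from_url("https://www.example.com/articles/My_First-Post.html"))
# 'my first post'
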
def _normalise_format(fmt: Optional[str]) -> str:
"""Normalize output format to valid values."""
if not fmt:
@@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"


def _convert_to_webp(
src_png: Path,
dst_webp: Path,
*,
quality: int = 90,
method: int = 6,
max_dim: int = WEBP_MAX_DIM,
downscale_if_oversize: bool = True,
) -> bool:
"""Convert a PNG screenshot to WebP via Pillow.

Playwright does not currently support emitting WebP directly.
"""
if not src_png or not Path(src_png).is_file():
raise ScreenshotError(f"Source image not found: {src_png}")

dst_webp = Path(dst_webp)
try:
dst_webp.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass

try:
from PIL import Image
except Exception as exc:
raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc

# Write atomically to avoid partial files if conversion is interrupted.
tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp"))
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}

# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
if im.mode == "P":
im = im.convert("RGBA")

# WebP enforces a hard max dimension per side (16383px).
# When full-page captures are very tall, downscale proportionally to fit.
try:
w, h = im.size
except Exception:
w, h = 0, 0

if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
except Exception:
scale = 1.0

if scale > 0.0 and scale < 1.0:
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
debug(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")

im.save(tmp_path, **save_kwargs)

tmp_path.replace(dst_webp)
return bool(did_downscale)
finally:
try:
tmp_path.unlink(missing_ok=True)
except Exception:
pass

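# Illustrative sketch (hypothetical dimensions, not part of this commit): how
# the oversize check above scales a very tall full-page capture to fit WebP's
# per-side limit.
w, h = 1200, 40000
max_dim = 16_383
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))  # height is the constraint
new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
print(new_w, new_h)  # approximately 491 x 16383
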
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.

@@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]:
return sels


def _selectors_for_url(url: str) -> List[str]:
"""Return selectors to try for a URL.

For now, prefer a minimal behavior: only return known SITE_SELECTORS.
(The cmdlet already falls back to full-page capture when no selectors match.)
"""

return _matched_site_selectors(url)


def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
@@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)


def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
_step("loading launching browser")
progress.step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})

# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])

try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
progress.step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
progress.step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
progress.step("loading navigation timeout")

# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))

_step("loading stabilized")
progress.step("loading stabilized")

_step("capturing preparing")
progress.step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
_step("capturing saved")
progress.step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
progress.step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
progress.step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
progress.step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc


def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
_begin_live_steps(total_steps)
_step("loading starting")
progress.begin_steps(total_steps)
progress.step("loading starting")

# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
options.output_format = "png"
_capture(options, capture_path, warnings)
_capture(options, capture_path, warnings, progress)

if requested_format == "webp":
_step("capturing converting to webp")
progress.step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
did_downscale = _convert_to_webp(capture_path, destination)
if did_downscale:
warnings.append(
f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}"
)
else:
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as exc:
warnings.append(f"webp conversion failed; keeping png: {exc}")
destination = capture_path
@@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_step("capturing archiving")
progress.step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])

_step("capturing finalized")
progress.step("capturing finalized")

applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))

@@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 1

progress = PipelineProgress(pipeline_context)

# ========================================================================
# ARGUMENT PARSING
# ========================================================================
@@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:

debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")

# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
local_progress_ui = None
try:
existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
existing_ui = None
try:
if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress

local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
local_progress_ui.start()
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(local_progress_ui)
except Exception:
pass
try:
local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
except Exception:
pass
except Exception:
local_progress_ui = None

# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
@@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:

ensure_directory(screenshot_dir)

# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
try:
progress.ensure_local_ui(
label="screen-shot",
total_items=len(url_to_process),
items_preview=[u for u, _ in url_to_process],
)
except Exception:
pass

# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
@@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")

screenshot_result = _capture_screenshot(options)
screenshot_result = _capture_screenshot(options, progress)

# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
@@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = datetime.now().date().isoformat()

upstream_title = _clean_title(_extract_item_title(origin_item))
display_title = upstream_title or url
url_title = _title_from_url(url)
display_title = upstream_title or url_title or url

upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]

url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)

pipe_obj = create_pipe_object_result(
@@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted.append(pipe_obj)

# If we created a local progress UI, advance it per completed item.
if local_progress_ui is not None:
try:
local_progress_ui.on_emit(0, pipe_obj)
except Exception:
pass
progress.on_emit(pipe_obj)

except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
traceback.print_exc(file=sys.stderr)
exit_code = 1

try:
if local_progress_ui is not None:
try:
local_progress_ui.finish_pipe(0, force_complete=True)
except Exception:
pass
finally:
if local_progress_ui is not None:
try:
local_progress_ui.stop()
except Exception:
pass
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(None)
except Exception:
pass
progress.close_local_ui(force_complete=True)

if not all_emitted:
log(f"No screenshots were successfully captured", file=sys.stderr)