nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions

View File

@@ -1585,9 +1585,46 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
"warnings", "path", "relationships", "is_temp", "action", "parent_hash",
}
# Convert ResultItem to dict to preserve all attributes
# Convert common object-like results into a dict so we can preserve fields like
# hash/store/url when they come from result tables (e.g., get-url emits UrlItem).
#
# Priority:
# 1) explicit to_dict()
# 2) best-effort attribute extraction for known PipeObject-ish fields
if hasattr(value, 'to_dict'):
value = value.to_dict()
elif not isinstance(value, dict):
try:
obj_map: Dict[str, Any] = {}
for k in (
"hash",
"store",
"provider",
"prov",
"tag",
"title",
"url",
"source_url",
"duration",
"duration_seconds",
"metadata",
"full_metadata",
"warnings",
"path",
"target",
"relationships",
"is_temp",
"action",
"parent_hash",
"extra",
"media_kind",
):
if hasattr(value, k):
obj_map[k] = getattr(value, k)
if obj_map:
value = obj_map
except Exception:
pass
if isinstance(value, dict):
# Extract hash and store (canonical identifiers)
@@ -1695,8 +1732,19 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
# Fallback: build from path argument or bare value
hash_val = "unknown"
path_val = default_path or getattr(value, "path", None)
url_val: Optional[str] = None
title_val = None
# If the raw value is a string, treat it as either a URL or a file path.
# This is important for @-selection results that are plain URL strings.
if isinstance(value, str):
s = value.strip()
if s.lower().startswith(("http://", "https://")):
url_val = s
path_val = None
else:
path_val = s
if path_val and path_val != "unknown":
try:
from SYS.utils import sha256_file
@@ -1708,8 +1756,9 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
except Exception:
pass
# When coming from path argument, store should be "PATH" (file path, not a backend)
store_val = "PATH"
# When coming from a raw URL string, mark it explicitly as URL.
# Otherwise treat it as a local path.
store_val = "URL" if url_val else "PATH"
pipe_obj = models.PipeObject(
hash=hash_val,
@@ -1717,6 +1766,8 @@ def coerce_to_pipe_object(value: Any, default_path: Optional[str] = None) -> mod
provider=None,
path=str(path_val) if path_val and path_val != "unknown" else None,
title=title_val,
url=url_val,
source_url=url_val,
tag=[],
extra={},
)
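For reference, a minimal standalone sketch of the new string-handling branch above (illustrative names only; the real logic lives inside coerce_to_pipe_object):

def _classify_raw_value(value: str):
    # Mirrors the fallback branch: URL-looking strings map to store "URL",
    # everything else is treated as a local path with store "PATH".
    s = value.strip()
    if s.lower().startswith(("http://", "https://")):
        return s, None, "URL"      # (url_val, path_val, store_val)
    return None, s, "PATH"

# _classify_raw_value("https://example.com/page") -> ("https://example.com/page", None, "URL")
# _classify_raw_value("C:/media/clip.mp4")        -> (None, "C:/media/clip.mp4", "PATH")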

View File

@@ -12,6 +12,7 @@ import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.pipeline_progress import PipelineProgress
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from . import _shared as sh
@@ -73,6 +74,7 @@ class Add_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Main execution entry point."""
parsed = parse_cmdlet_args(args, self)
progress = PipelineProgress(ctx)
path_arg = parsed.get("path")
location = parsed.get("store")
@@ -80,6 +82,35 @@ class Add_File(Cmdlet):
provider_room = parsed.get("room")
delete_after = parsed.get("delete", False)
# Convenience: when piping a file into add-file, allow `-path <existing dir>`
# to act as the destination export directory.
# Example: screen-shot "https://..." | add-file -path "C:\Users\Admin\Desktop"
if path_arg and not location and not provider_name:
try:
candidate_dir = Path(str(path_arg))
if candidate_dir.exists() and candidate_dir.is_dir():
piped_items = result if isinstance(result, list) else [result]
has_local_source = False
for it in piped_items:
try:
po = coerce_to_pipe_object(it, None)
src = str(getattr(po, "path", "") or "").strip()
if not src:
continue
if src.lower().startswith(("http://", "https://", "magnet:", "torrent:")):
continue
if Path(src).is_file():
has_local_source = True
break
except Exception:
continue
if has_local_source:
debug(f"[add-file] Treating -path directory as destination: {candidate_dir}")
location = str(candidate_dir)
path_arg = None
except Exception:
pass
stage_ctx = ctx.get_stage_context()
is_last_stage = (stage_ctx is None) or bool(getattr(stage_ctx, "is_last_stage", False))
@@ -93,7 +124,7 @@ class Add_File(Cmdlet):
is_storage_backend_location = False
# Decide which items to process.
# - If user provided -path, treat this invocation as single-item.
# - If user provided -path (and it was not reinterpreted as destination), treat this invocation as single-item.
# - Otherwise, if piped input is a list, ingest each item.
if path_arg:
items_to_process: List[Any] = [result]
@@ -102,6 +133,17 @@ class Add_File(Cmdlet):
else:
items_to_process = [result]
# Minimal step-based progress for single-item runs.
# Many add-file flows don't emit intermediate items, so without steps the pipe can look "stuck".
use_steps = False
steps_started = False
step2_done = False
try:
ui, _ = progress.ui_and_pipe_index()
use_steps = (ui is not None) and (len(items_to_process) == 1)
except Exception:
use_steps = False
debug(f"[add-file] INPUT result type={type(result).__name__}")
if isinstance(result, list):
debug(f"[add-file] INPUT result is list with {len(result)} items")
@@ -235,6 +277,14 @@ class Add_File(Cmdlet):
failures += 1
continue
is_url_target = isinstance(media_path_or_url, str) and str(media_path_or_url).lower().startswith(
("http://", "https://", "magnet:", "torrent:")
)
if use_steps and (not steps_started) and (not is_url_target):
progress.begin_steps(3)
progress.step("resolving source")
steps_started = True
# Update pipe_obj with resolved path
pipe_obj.path = str(media_path_or_url)
@@ -300,13 +350,34 @@ class Add_File(Cmdlet):
pass
temp_dir_to_cleanup = Path(tempfile.mkdtemp(prefix="medios_openlibrary_"))
# Wire OpenLibrary download progress into pipeline Live UI (no tqdm spam).
def _ol_progress(kind: str, completed: int, total: Optional[int], label: str) -> None:
try:
if kind == "pages" and total:
progress.set_status(f"downloading pages {completed}/{total}")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
elif kind == "bytes" and total:
progress.set_status(f"downloading {label} {completed}/{total} bytes")
progress.set_percent(int(round((completed / max(1, total)) * 100.0)))
else:
progress.set_status("downloading")
except Exception:
return
try:
progress.set_percent(0)
progress.set_status("downloading openlibrary")
except Exception:
pass
sr = SearchResult(
table="openlibrary",
title=str(getattr(pipe_obj, "title", None) or "Unknown"),
path=str(media_path_or_url),
full_metadata=full_metadata if isinstance(full_metadata, dict) else {},
)
downloaded = provider.download(sr, temp_dir_to_cleanup)
downloaded = provider.download(sr, temp_dir_to_cleanup, progress_callback=_ol_progress)
if downloaded is None:
log("[add-file] OpenLibrary download failed", file=sys.stderr)
failures += 1
@@ -325,6 +396,13 @@ class Add_File(Cmdlet):
pipe_obj.path = str(downloaded_path)
delete_after_item = True
try:
progress.set_percent(100)
progress.set_status("downloaded")
except Exception:
pass
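The percent math the callback applies, condensed into a runnable sketch (the provider-side call pattern into progress_callback is an assumption; only the _ol_progress callback above is part of this diff):

def _percent(completed: int, total: int) -> int:
    # Same formula as _ol_progress: clamp total to at least 1, round to an int percent.
    return int(round((completed / max(1, total)) * 100.0))

assert _percent(3, 120) == 2               # e.g. "downloading pages 3/120"
assert _percent(512_000, 2_048_000) == 25  # e.g. "downloading book.pdf 512000/2048000 bytes"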
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
@@ -562,6 +640,10 @@ class Add_File(Cmdlet):
failures += 1
continue
if use_steps and steps_started and (not step2_done):
progress.step("writing destination")
step2_done = True
if code == 0:
successes += 1
else:
@@ -619,6 +701,9 @@ class Add_File(Cmdlet):
except Exception:
pass
if use_steps and steps_started:
progress.step("finalized")
if successes > 0:
return 0
return 1
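Condensed, the single-item step pattern add-file now follows (PipelineProgress lives in the new SYS.pipeline_progress module, which is not shown in this diff):

progress = PipelineProgress(ctx)
ui, _ = progress.ui_and_pipe_index()
if ui is not None:                        # only show steps when a Live UI is attached
    progress.begin_steps(3)
    progress.step("resolving source")     # step 1: locate/download the input
    # ... resolve the source ...
    progress.step("writing destination")  # step 2: hand off to the backend
    # ... write to store/export dir ...
    progress.step("finalized")            # step 3: wrap up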

View File

@@ -34,6 +34,19 @@ class Add_Url(sh.Cmdlet):
"""Add URL to file via hash+store backend."""
parsed = sh.parse_cmdlet_args(args, self)
# Compatibility/piping fix:
# `SharedArgs.QUERY` is positional in the shared parser, so `add-url <url>`
# (and `@N | add-url <url>`) can mistakenly parse the URL into `query`.
# If `url` is missing and `query` looks like an http(s) URL, treat it as `url`.
try:
if (not parsed.get("url")) and isinstance(parsed.get("query"), str):
q = str(parsed.get("query") or "").strip()
if q.startswith(("http://", "https://")):
parsed["url"] = q
parsed.pop("query", None)
except Exception:
pass
query_hash = sh.parse_single_hash_query(parsed.get("query"))
if parsed.get("query") and not query_hash:
log("Error: -query must be of the form hash:<sha256>")

View File

@@ -29,7 +29,7 @@ class Delete_Url(Cmdlet):
arg=[
SharedArgs.QUERY,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to remove"),
CmdletArg("url", required=False, description="URL to remove (optional when piping url rows)"),
],
detail=[
"- Removes URL association from file identified by hash+store",
@@ -69,22 +69,24 @@ class Delete_Url(Cmdlet):
log("Error: No store name provided")
return 1
if not url_arg:
log("Error: No URL provided")
return 1
# Normalize hash (single-item mode)
if not results and file_hash:
file_hash = normalize_hash(file_hash)
if not file_hash:
log("Error: Invalid hash format")
return 1
# Parse url (comma-separated)
urls = [u.strip() for u in str(url_arg).split(',') if u.strip()]
if not urls:
log("Error: No valid url provided")
return 1
from metadata import normalize_urls
def _urls_from_arg(raw: Any) -> List[str]:
if raw is None:
return []
# Support comma-separated input for backwards compatibility
if isinstance(raw, str) and "," in raw:
return [u.strip() for u in raw.split(",") if u.strip()]
return [u.strip() for u in normalize_urls(raw) if str(u).strip()]
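Illustrative calls into the helper above (normalize_urls comes from the metadata module; its exact normalization rules are assumed):

parsed_urls = _urls_from_arg("https://a.example/x, https://b.example/y")
# -> ["https://a.example/x", "https://b.example/y"]   (comma-separated branch)
parsed_urls = _urls_from_arg(None)
# -> []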
urls_from_cli = _urls_from_arg(url_arg)
# Get backend and delete url
try:
@@ -145,7 +147,17 @@ class Delete_Url(Cmdlet):
)
continue
batch.setdefault(store_text, []).append((normalized, list(urls)))
# Determine which URLs to delete.
# - If user passed an explicit <url>, apply it to all items.
# - Otherwise, when piping url rows from get-url, delete the url(s) from each item.
item_urls = list(urls_from_cli)
if not item_urls:
item_urls = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
if not item_urls:
ctx.print_if_visible("[delete-url] Warning: Item has no url field; skipping", file=sys.stderr)
continue
batch.setdefault(store_text, []).append((normalized, item_urls))
for store_text, pairs in batch.items():
try:
@@ -168,24 +180,39 @@ class Delete_Url(Cmdlet):
for h, ulist in bulk_pairs:
backend.delete_url(h, ulist, config=config)
deleted_count = 0
for _h, ulist in bulk_pairs:
deleted_count += len(ulist or [])
ctx.print_if_visible(
f"✓ delete-url: {len(urls)} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
f"✓ delete-url: {deleted_count} url(s) for {len(bulk_pairs)} item(s) in '{store_text}'",
file=sys.stderr,
)
for item in pass_through:
existing = get_field(item, "url")
_set_item_url(item, _remove_urls(existing, list(urls)))
# In batch mode the urls for each file were already deleted above.
# When updating the pass-through items, prefer the user's explicit urls
# (urls_from_cli); otherwise strip the piped url row(s) from each item.
remove_set = urls_from_cli
if not remove_set:
remove_set = [u.strip() for u in normalize_urls(get_field(item, "url") or get_field(item, "source_url")) if str(u).strip()]
_set_item_url(item, _remove_urls(existing, list(remove_set)))
ctx.emit(item)
return 0
# Single-item mode
if not urls_from_cli:
urls_from_cli = [u.strip() for u in normalize_urls(get_field(result, "url") or get_field(result, "source_url")) if str(u).strip()]
if not urls_from_cli:
log("Error: No URL provided")
return 1
backend = storage[str(store_name)]
backend.delete_url(str(file_hash), urls, config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls)} url(s) removed", file=sys.stderr)
backend.delete_url(str(file_hash), list(urls_from_cli), config=config)
ctx.print_if_visible(f"✓ delete-url: {len(urls_from_cli)} url(s) removed", file=sys.stderr)
if result is not None:
existing = get_field(result, "url")
_set_item_url(result, _remove_urls(existing, list(urls)))
_set_item_url(result, _remove_urls(existing, list(urls_from_cli)))
ctx.emit(result)
return 0

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -126,7 +126,7 @@ class Get_File(sh.Cmdlet):
except Exception as exc:
log(f"Error opening browser: {exc}", file=sys.stderr)
else:
log(f"Opened in browser: {source_path}", file=sys.stderr)
debug(f"Opened in browser: {source_path}", file=sys.stderr)
# Emit result for pipeline
ctx.emit({

View File

@@ -47,6 +47,210 @@ except ImportError:
extract_title = None
def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
"""Deduplicate tags case-insensitively while preserving order."""
out: List[str] = []
seen: set[str] = set()
for t in tags or []:
if not isinstance(t, str):
continue
s = t.strip()
if not s:
continue
key = s.lower()
if key in seen:
continue
seen.add(key)
out.append(s)
return out
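A quick example of the dedup behavior (first-seen casing wins; blanks and non-strings are dropped):

assert _dedup_tags_preserve_order(["Rock", "rock", " ROCK ", "", "jazz"]) == ["Rock", "jazz"]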
def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
"""Extract subtitle availability tags from a yt-dlp info dict.
Produces multi-valued tags so languages can coexist:
- subs:<lang>
- subs_auto:<lang>
"""
def _langs(value: Any) -> List[str]:
if not isinstance(value, dict):
return []
langs: List[str] = []
for k in value.keys():
if not isinstance(k, str):
continue
lang = k.strip().lower()
if lang:
langs.append(lang)
return sorted(set(langs))
out: List[str] = []
for lang in _langs(info.get("subtitles")):
out.append(f"subs:{lang}")
for lang in _langs(info.get("automatic_captions")):
out.append(f"subs_auto:{lang}")
return out
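Example of the subtitle tags produced for a small yt-dlp info dict (languages are lower-cased, de-duplicated, and sorted within each group):

info = {
    "subtitles": {"EN": [], "de": []},
    "automatic_captions": {"en": []},
}
assert _extract_subtitle_tags(info) == ["subs:de", "subs:en", "subs_auto:en"]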
def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
"""Fetch a yt-dlp info dict without downloading media."""
if not isinstance(url, str) or not url.strip():
return None
url = url.strip()
# Prefer the Python module when available (faster, avoids shell quoting issues).
try:
import yt_dlp # type: ignore
opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
"noprogress": True,
"socket_timeout": 15,
"retries": 1,
"playlist_items": "1-10",
}
with yt_dlp.YoutubeDL(opts) as ydl:
info = ydl.extract_info(url, download=False)
return info if isinstance(info, dict) else None
except Exception:
pass
# Fallback to yt-dlp CLI if the module isn't available.
try:
import json as json_module
cmd = [
"yt-dlp",
"-J",
"--no-warnings",
"--skip-download",
"--playlist-items",
"1-10",
url,
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode != 0:
return None
payload = (result.stdout or "").strip()
if not payload:
return None
data = json_module.loads(payload)
return data if isinstance(data, dict) else None
except Exception:
return None
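Usage sketch (placeholder URL; the call returns None when neither the yt_dlp module nor the yt-dlp CLI can extract anything):

info = _scrape_ytdlp_info("https://www.youtube.com/watch?v=<id>")  # <id> is a placeholder
if info:
    print(info.get("title"), info.get("extractor"))  # standard yt-dlp info keys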
def _resolve_candidate_urls_for_item(
result: Any,
backend: Any,
file_hash: str,
config: Dict[str, Any],
) -> List[str]:
"""Get candidate URLs from backend and/or piped result."""
try:
from metadata import normalize_urls
except Exception:
normalize_urls = None # type: ignore[assignment]
urls: List[str] = []
# 1) Backend URL association (best source of truth)
try:
backend_urls = backend.get_url(file_hash, config=config)
if backend_urls:
if normalize_urls:
urls.extend(normalize_urls(backend_urls))
else:
urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
except Exception:
pass
# 2) Backend metadata url field
try:
meta = backend.get_metadata(file_hash, config=config)
if isinstance(meta, dict) and meta.get("url"):
if normalize_urls:
urls.extend(normalize_urls(meta.get("url")))
else:
raw = meta.get("url")
if isinstance(raw, list):
urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
elif isinstance(raw, str) and raw.strip():
urls.append(raw.strip())
except Exception:
pass
# 3) Piped result fields
def _get(obj: Any, key: str, default: Any = None) -> Any:
if isinstance(obj, dict):
return obj.get(key, default)
return getattr(obj, key, default)
for key in ("url", "webpage_url", "source_url", "target"):
val = _get(result, key, None)
if not val:
continue
if normalize_urls:
urls.extend(normalize_urls(val))
continue
if isinstance(val, str) and val.strip():
urls.append(val.strip())
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
meta_field = _get(result, "metadata", None)
if isinstance(meta_field, dict) and meta_field.get("url"):
val = meta_field.get("url")
if normalize_urls:
urls.extend(normalize_urls(val))
elif isinstance(val, list):
urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
elif isinstance(val, str) and val.strip():
urls.append(val.strip())
# Dedup
return _dedup_tags_preserve_order(urls)
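An illustrative call with a stub backend (real backends come from Store; only get_url/get_metadata are exercised here, and normalize_urls is assumed to pass plain http(s) URLs through unchanged):

class _StubBackend:
    def get_url(self, file_hash, config=None):
        return ["https://example.com/watch/123"]
    def get_metadata(self, file_hash, config=None):
        return {"url": "https://example.com/watch/123"}

candidates = _resolve_candidate_urls_for_item(
    result={"source_url": "https://alt.example/page"},
    backend=_StubBackend(),
    file_hash="deadbeef",
    config={},
)
# -> ["https://example.com/watch/123", "https://alt.example/page"]  (deduplicated)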
def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
"""Pick the first URL that looks supported by yt-dlp (best effort)."""
if not urls:
return None
def _is_hydrus_file_url(u: str) -> bool:
text = str(u or "").strip().lower()
if not text:
return False
# Hydrus-local file URLs are retrievable blobs, not original source pages.
# yt-dlp generally can't extract meaningful metadata from these.
return ("/get_files/file" in text) and ("hash=" in text)
http_urls: List[str] = []
for u in urls:
text = str(u or "").strip()
if text.lower().startswith(("http://", "https://")):
http_urls.append(text)
# Prefer non-Hydrus URLs for yt-dlp scraping.
candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
if not candidates:
return None
# Prefer a true support check when the Python module is available.
try:
from SYS.download import is_url_supported_by_ytdlp
for text in candidates:
try:
if is_url_supported_by_ytdlp(text):
return text
except Exception:
continue
except Exception:
pass
# Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
return candidates[0] if candidates else None
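Example: Hydrus /get_files/file blob URLs are skipped, so the original page URL wins (assuming is_url_supported_by_ytdlp accepts it, or the fallback branch is taken):

picked = _pick_supported_ytdlp_url([
    "http://127.0.0.1:45869/get_files/file?hash=abc123",  # Hydrus blob URL: ignored
    "https://www.youtube.com/watch?v=<id>",                # placeholder page URL
])
# -> "https://www.youtube.com/watch?v=<id>"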
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
scrape_url = parsed_args.get("scrape")
scrape_requested = scrape_flag_present or scrape_url is not None
if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
# Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
scrape_url = "ytdlp"
scrape_requested = True
if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
log("-scrape requires a URL or provider name", file=sys.stderr)
return 1
@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if scrape_requested and scrape_url:
import json as json_module
if str(scrape_url).strip().lower() == "ytdlp":
# Scrape metadata from the selected item's URL via yt-dlp (no download),
# then OVERWRITE all existing tags (including title:).
#
# This mode requires a store-backed item (hash + store).
#
# NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
# performs namespace deduplication that would collapse multi-valued tags.
file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
store_name = get_field(result, "store", None)
subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)
# Only run overwrite-apply when the item is store-backed.
# If this is a URL-only PipeObject, fall through to provider mode below.
if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
try:
from Store import Store
storage = Store(config)
backend = storage[str(store_name)]
except Exception as exc:
log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
return 1
candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
scrape_target = _pick_supported_ytdlp_url(candidate_urls)
if not scrape_target:
log(
"No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ",
file=sys.stderr,
)
log(
"Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
file=sys.stderr,
)
return 1
info = _scrape_ytdlp_info(scrape_target)
if not info:
log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
return 1
try:
from metadata import extract_ytdlp_tags
except Exception:
extract_ytdlp_tags = None # type: ignore[assignment]
# Prefer the top-level metadata, but if this is a playlist container, use
# the first entry for per-item fields like subtitles.
info_for_subs = info
entries = info.get("entries") if isinstance(info, dict) else None
if isinstance(entries, list) and entries:
first = entries[0]
if isinstance(first, dict):
info_for_subs = first
tags: List[str] = []
if extract_ytdlp_tags:
try:
tags.extend(extract_ytdlp_tags(info))
except Exception:
pass
# Subtitle availability tags
try:
tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
except Exception:
pass
# Ensure we actually have something to apply.
tags = _dedup_tags_preserve_order(tags)
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1
# Full overwrite: delete all existing tags, then add the new set.
try:
existing_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
existing_tags = []
try:
if existing_tags:
backend.delete_tag(file_hash, list(existing_tags), config=config)
except Exception as exc:
debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")
try:
backend.add_tag(file_hash, list(tags), config=config)
except Exception as exc:
log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
return 1
# Show updated tags
try:
updated_tags, _src = backend.get_tag(file_hash, config=config)
except Exception:
updated_tags = tags
if not updated_tags:
updated_tags = tags
_emit_tags_as_table(
tags_list=list(updated_tags),
file_hash=file_hash,
store=str(store_name),
service_name=None,
config=config,
item_title=str(item_title or "ytdlp"),
path=str(subject_path) if subject_path else None,
subject={
"hash": file_hash,
"store": str(store_name),
"path": str(subject_path) if subject_path else None,
"title": item_title,
"extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
},
)
return 0
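Pipeline usage for this mode (illustrative; the selected item must be store-backed with an original page URL attached):

# @1 | get-tag -scrape ytdlp
# @1 | get-tag -scrape          (bare -scrape now defaults to ytdlp)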
if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
# URL scraping (existing behavior)
title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
else:
combined_query = f"{title_hint} {artist_hint}"
query_hint = identifier_query or combined_query or title_hint
# yt-dlp isn't a search provider; it requires a URL.
url_hint: Optional[str] = None
if provider.name == "ytdlp":
raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
if isinstance(raw_url, list) and raw_url:
raw_url = raw_url[0]
if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
url_hint = raw_url.strip()
query_hint = url_hint or identifier_query or combined_query or title_hint
if not query_hint:
log("No title or identifier available to search for metadata", file=sys.stderr)
return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
if not items:
log("No metadata results found", file=sys.stderr)
return 1
# For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
if provider.name == "ytdlp":
try:
tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
except Exception:
tags = []
if not tags:
log("No tags extracted from yt-dlp metadata", file=sys.stderr)
return 1
_emit_tags_as_table(
tags_list=list(tags),
file_hash=None,
store="url",
service_name=None,
config=config,
item_title=str(items[0].get("title") or "ytdlp"),
path=None,
subject={"provider": "ytdlp", "url": str(query_hint)},
)
return 0
from result_table import ResultTable
table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
return 0
# Apply tags to the store backend (no sidecar writing here).
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if str(result_provider).strip().lower() == "ytdlp":
apply_tags = [str(t) for t in result_tags if t is not None]
else:
apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
if not apply_tags:
log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
_SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]
# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
_SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)
class Get_Tag(Cmdlet):
"""Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
CmdletArg(
name="-scrape",
type="string",
description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
required=False,
choices=_SCRAPE_CHOICES,
)

View File

@@ -14,10 +14,11 @@ import httpx
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
from urllib.parse import urlsplit, quote, urljoin
from urllib.parse import urlsplit, quote, urljoin, unquote
from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.pipeline_progress import PipelineProgress
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from . import _shared as sh
@@ -31,54 +32,6 @@ get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
def _live_ui_and_pipe_index() -> tuple[Optional[Any], int]:
ui = None
try:
ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
ui = None
pipe_idx: int = 0
try:
stage_ctx = pipeline_context.get_stage_context() if hasattr(pipeline_context, "get_stage_context") else None
maybe_idx = getattr(stage_ctx, "pipe_index", None) if stage_ctx is not None else None
if isinstance(maybe_idx, int):
pipe_idx = int(maybe_idx)
except Exception:
pipe_idx = 0
return ui, pipe_idx
def _begin_live_steps(total_steps: int) -> None:
"""Declare the total number of steps for this cmdlet run (per-pipe)."""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
begin = getattr(ui, "begin_pipe_steps", None)
if callable(begin):
begin(int(pipe_idx), total_steps=int(total_steps))
except Exception:
return
def _step(text: str) -> None:
"""Emit a *new* step.
Each call increments the step counter and advances percent automatically.
"""
ui, pipe_idx = _live_ui_and_pipe_index()
if ui is None:
return
try:
adv = getattr(ui, "advance_pipe_step", None)
if callable(adv):
adv(int(pipe_idx), str(text))
except Exception:
return
# ============================================================================
# CMDLET Metadata Declaration
# ============================================================================
@@ -115,6 +68,10 @@ USER_AGENT = (
DEFAULT_VIEWPORT: dict[str, int] = {"width": 1920, "height": 1080}
ARCHIVE_TIMEOUT = 30.0
# WebP has a hard maximum dimension per side.
# Pillow typically fails with: "encoding error 5: Image size exceeds WebP limit of 16383 pixels"
WEBP_MAX_DIM = 16_383
# Configurable selectors for specific websites
SITE_SELECTORS: Dict[str, List[str]] = {
"twitter.com": [
@@ -200,6 +157,80 @@ def _slugify_url(url: str) -> str:
return slug[:100]
def _tags_from_url(url: str) -> List[str]:
"""Derive simple tags from a URL.
- site:<domain> (strips leading www.)
- title:<slug> derived from the last path segment, with extension removed
and separators (-, _, %) normalized to spaces.
"""
u = str(url or "").strip()
if not u:
return []
parsed = None
try:
parsed = urlsplit(u)
host = str(getattr(parsed, "hostname", None) or getattr(parsed, "netloc", "") or "").strip().lower()
except Exception:
parsed = None
host = ""
if host:
# Drop credentials and port if present.
if "@" in host:
host = host.rsplit("@", 1)[-1]
if ":" in host:
host = host.split(":", 1)[0]
if host.startswith("www."):
host = host[len("www.") :]
path = ""
if parsed is not None:
try:
path = str(getattr(parsed, "path", "") or "")
except Exception:
path = ""
last = ""
if path:
try:
last = path.rsplit("/", 1)[-1]
except Exception:
last = ""
try:
last = unquote(last or "")
except Exception:
last = last or ""
if last and "." in last:
# Drop a single trailing extension (e.g. .html, .php).
last = last.rsplit(".", 1)[0]
for sep in ("_", "-", "%"):
if last and sep in last:
last = last.replace(sep, " ")
title = " ".join(str(last or "").split()).strip().lower()
tags: List[str] = []
if host:
tags.append(f"site:{host}")
if title:
tags.append(f"title:{title}")
return tags
def _title_from_url(url: str) -> str:
"""Return the normalized title derived from a URL's last path segment."""
for t in _tags_from_url(url):
if str(t).lower().startswith("title:"):
return str(t)[len("title:") :].strip()
return ""
def _normalise_format(fmt: Optional[str]) -> str:
"""Normalize output format to valid values."""
if not fmt:
@@ -218,6 +249,89 @@ def _format_suffix(fmt: str) -> str:
return ".jpg"
return f".{fmt}"
def _convert_to_webp(
src_png: Path,
dst_webp: Path,
*,
quality: int = 90,
method: int = 6,
max_dim: int = WEBP_MAX_DIM,
downscale_if_oversize: bool = True,
) -> bool:
"""Convert a PNG screenshot to WebP via Pillow.
Playwright does not currently support emitting WebP directly.
"""
if not src_png or not Path(src_png).is_file():
raise ScreenshotError(f"Source image not found: {src_png}")
dst_webp = Path(dst_webp)
try:
dst_webp.parent.mkdir(parents=True, exist_ok=True)
except Exception:
pass
try:
from PIL import Image
except Exception as exc:
raise ScreenshotError(f"Pillow is required for webp conversion: {exc}") from exc
# Write atomically to avoid partial files if conversion is interrupted.
tmp_path = unique_path(dst_webp.with_suffix(".tmp.webp"))
try:
with Image.open(src_png) as im:
did_downscale = False
save_kwargs: Dict[str, Any] = {
"format": "WEBP",
"quality": int(quality),
"method": int(method),
}
# Preserve alpha when present; Pillow handles it for WEBP.
# Normalize palette images to RGBA to avoid odd palette artifacts.
if im.mode == "P":
im = im.convert("RGBA")
# WebP enforces a hard max dimension per side (16383px).
# When full-page captures are very tall, downscale proportionally to fit.
try:
w, h = im.size
except Exception:
w, h = 0, 0
if downscale_if_oversize and isinstance(max_dim, int) and max_dim > 0 and (w > max_dim or h > max_dim):
scale = 1.0
try:
scale = min(float(max_dim) / float(w), float(max_dim) / float(h))
except Exception:
scale = 1.0
if scale > 0.0 and scale < 1.0:
new_w = max(1, int(w * scale))
new_h = max(1, int(h * scale))
debug(
f"[_convert_to_webp] Image exceeds WebP limit ({w}x{h}); downscaling -> {new_w}x{new_h}"
)
try:
resample = getattr(getattr(Image, "Resampling", Image), "LANCZOS", None)
if resample is None:
resample = getattr(Image, "LANCZOS", 1)
im = im.resize((new_w, new_h), resample=resample)
did_downscale = True
except Exception as exc:
debug(f"[_convert_to_webp] Downscale failed; attempting direct WEBP save anyway: {exc}")
im.save(tmp_path, **save_kwargs)
tmp_path.replace(dst_webp)
return bool(did_downscale)
finally:
try:
tmp_path.unlink(missing_ok=True)
except Exception:
pass
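Usage sketch (illustrative paths; Pillow must be installed, and the return value reports whether the image had to be downscaled to fit the WebP limit):

src = Path("capture.png")
dst = Path("capture.webp")
downscaled = _convert_to_webp(src, dst, quality=90)
if downscaled:
    # The caller above keeps the original PNG in this case (see the warning it appends).
    print(f"downscaled to fit {WEBP_MAX_DIM}px; kept {src.name}")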
def _matched_site_selectors(url: str) -> List[str]:
"""Return SITE_SELECTORS for a matched domain; empty if no match.
@@ -231,6 +345,16 @@ def _matched_site_selectors(url: str) -> List[str]:
return sels
def _selectors_for_url(url: str) -> List[str]:
"""Return selectors to try for a URL.
For now, prefer a minimal behavior: only return known SITE_SELECTORS.
(The cmdlet already falls back to full-page capture when no selectors match.)
"""
return _matched_site_selectors(url)
def _platform_preprocess(url: str, page: Any, warnings: List[str], timeout_ms: int = 10_000) -> None:
"""Best-effort page tweaks for popular platforms before capture."""
try:
@@ -366,11 +490,11 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
return unique_path(path)
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str], progress: PipelineProgress) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
try:
_step("loading launching browser")
progress.step("loading launching browser")
tool = options.playwright_tool or PlaywrightTool({})
# Ensure Chromium engine is used for the screen-shot cmdlet (force for consistency)
@@ -405,16 +529,16 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
try:
with tool.open_page(headless=headless) as page:
_step("loading navigating")
progress.step("loading navigating")
debug(f"Navigating to {options.url}...")
try:
tool.goto(page, options.url)
debug("Page loaded successfully")
_step("loading page loaded")
progress.step("loading page loaded")
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
debug("Navigation timeout; proceeding with current state")
_step("loading navigation timeout")
progress.step("loading navigation timeout")
# Skip article lookup by default (wait_for_article defaults to False)
if options.wait_for_article:
@@ -430,9 +554,9 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Waiting {options.wait_after_load}s for page stabilization...")
time.sleep(min(10.0, max(0.0, options.wait_after_load)))
_step("loading stabilized")
progress.step("loading stabilized")
_step("capturing preparing")
progress.step("capturing preparing")
if options.replace_video_posters:
debug("Replacing video elements with posters...")
page.evaluate(
@@ -453,7 +577,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
if options.prefer_platform_target and format_name != "pdf":
debug(f"[_capture] Target capture enabled")
debug("Attempting platform-specific content capture...")
_step("capturing locating target")
progress.step("capturing locating target")
try:
_platform_preprocess(options.url, page, warnings)
except Exception as e:
@@ -478,7 +602,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
el.scroll_into_view_if_needed(timeout=1000)
except Exception:
pass
_step("capturing output")
progress.step("capturing output")
debug(f"Capturing element to {destination}...")
el.screenshot(path=str(destination), type=("jpeg" if format_name == "jpeg" else None))
element_captured = True
@@ -489,14 +613,14 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
debug(f"Failed to capture element: {exc}")
# Fallback to default capture paths
if element_captured:
_step("capturing saved")
progress.step("capturing saved")
elif format_name == "pdf":
debug("Generating PDF...")
page.emulate_media(media="print")
_step("capturing output")
progress.step("capturing output")
page.pdf(path=str(destination), print_background=True)
debug(f"PDF saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
else:
debug(f"Capturing full page to {destination}...")
screenshot_kwargs: Dict[str, Any] = {"path": str(destination)}
@@ -504,20 +628,20 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
screenshot_kwargs["type"] = "jpeg"
screenshot_kwargs["quality"] = 90
if options.full_page:
_step("capturing output")
progress.step("capturing output")
page.screenshot(full_page=True, **screenshot_kwargs)
else:
article = page.query_selector("article")
if article is not None:
article_kwargs = dict(screenshot_kwargs)
article_kwargs.pop("full_page", None)
_step("capturing output")
progress.step("capturing output")
article.screenshot(**article_kwargs)
else:
_step("capturing output")
progress.step("capturing output")
page.screenshot(**screenshot_kwargs)
debug(f"Screenshot saved to {destination}")
_step("capturing saved")
progress.step("capturing saved")
except Exception as exc:
debug(f"[_capture] Exception launching browser/page: {exc}")
msg = str(exc).lower()
@@ -532,7 +656,7 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
def _capture_screenshot(options: ScreenshotOptions, progress: PipelineProgress) -> ScreenshotResult:
"""Capture a screenshot for the given options."""
debug(f"[_capture_screenshot] Preparing capture for {options.url}")
requested_format = _normalise_format(options.output_format)
@@ -543,8 +667,8 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
will_convert = requested_format == "webp"
will_archive = bool(options.archive and options.url)
total_steps = 9 + (1 if will_target else 0) + (1 if will_convert else 0) + (1 if will_archive else 0)
_begin_live_steps(total_steps)
_step("loading starting")
progress.begin_steps(total_steps)
progress.step("loading starting")
# Playwright screenshots do not natively support WebP output.
# Capture as PNG, then convert via Pillow.
@@ -553,17 +677,22 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
capture_path = unique_path(destination.with_suffix(".png"))
debug(f"[_capture_screenshot] Requested webp; capturing intermediate png -> {capture_path}")
options.output_format = "png"
_capture(options, capture_path, warnings)
_capture(options, capture_path, warnings, progress)
if requested_format == "webp":
_step("capturing converting to webp")
progress.step("capturing converting to webp")
debug(f"[_capture_screenshot] Converting png -> webp: {destination}")
try:
_convert_to_webp(capture_path, destination)
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
did_downscale = _convert_to_webp(capture_path, destination)
if did_downscale:
warnings.append(
f"webp conversion used downscaling to fit {WEBP_MAX_DIM}px limit; keeping original png: {capture_path.name}"
)
else:
try:
capture_path.unlink(missing_ok=True)
except Exception:
pass
except Exception as exc:
warnings.append(f"webp conversion failed; keeping png: {exc}")
destination = capture_path
@@ -572,7 +701,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
url: List[str] = [options.url] if options.url else []
archive_url: List[str] = []
if options.archive and options.url:
_step("capturing archiving")
progress.step("capturing archiving")
debug(f"[_capture_screenshot] Archiving enabled for {options.url}")
archives, archive_warnings = _archive_url(options.url, options.archive_timeout)
archive_url.extend(archives)
@@ -580,7 +709,7 @@ def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
if archives:
url = unique_preserve_order([*url, *archives])
_step("capturing finalized")
progress.step("capturing finalized")
applied_tag = unique_preserve_order(list(tag for tag in options.tag if tag.strip()))
@@ -627,6 +756,8 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
)
return 1
progress = PipelineProgress(pipeline_context)
# ========================================================================
# ARGUMENT PARSING
# ========================================================================
@@ -685,32 +816,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
debug(f"[_run] url to process: {[u for u, _ in url_to_process]}")
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
local_progress_ui = None
try:
existing_ui = pipeline_context.get_live_progress() if hasattr(pipeline_context, "get_live_progress") else None
except Exception:
existing_ui = None
try:
if existing_ui is None and bool(getattr(sys.stderr, "isatty", lambda: False)()):
from models import PipelineLiveProgress
local_progress_ui = PipelineLiveProgress(["screen-shot"], enabled=True)
local_progress_ui.start()
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(local_progress_ui)
except Exception:
pass
try:
local_progress_ui.begin_pipe(0, total_items=len(url_to_process), items_preview=[u for u, _ in url_to_process])
except Exception:
pass
except Exception:
local_progress_ui = None
# ========================================================================
# OUTPUT DIRECTORY RESOLUTION - Priority chain
# ========================================================================
@@ -749,6 +854,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
ensure_directory(screenshot_dir)
# If the caller isn't running the shared pipeline Live progress UI (e.g. direct
# cmdlet execution), start a minimal local pipeline progress panel so this cmdlet
# still shows step-level progress.
try:
progress.ensure_local_ui(
label="screen-shot",
total_items=len(url_to_process),
items_preview=[u for u, _ in url_to_process],
)
except Exception:
pass
# ========================================================================
# PREPARE SCREENSHOT OPTIONS
# ========================================================================
@@ -850,7 +967,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
options.target_selectors = auto_selectors
debug(f"[screen_shot] Auto selectors matched for url: {auto_selectors}")
screenshot_result = _capture_screenshot(options)
screenshot_result = _capture_screenshot(options, progress)
# Log results and warnings
debug(f"Screenshot captured to {screenshot_result.path}")
@@ -875,15 +992,18 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
capture_date = datetime.now().date().isoformat()
upstream_title = _clean_title(_extract_item_title(origin_item))
display_title = upstream_title or url
url_title = _title_from_url(url)
display_title = upstream_title or url_title or url
upstream_tags = _extract_item_tags(origin_item)
filtered_upstream_tags = [
t for t in upstream_tags
if not str(t).strip().lower().startswith(("type:", "date:"))
]
url_tags = _tags_from_url(url)
merged_tags = unique_preserve_order(
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags
["type:screenshot", f"date:{capture_date}"] + filtered_upstream_tags + url_tags
)
pipe_obj = create_pipe_object_result(
@@ -910,11 +1030,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
all_emitted.append(pipe_obj)
# If we created a local progress UI, advance it per completed item.
if local_progress_ui is not None:
try:
local_progress_ui.on_emit(0, pipe_obj)
except Exception:
pass
progress.on_emit(pipe_obj)
except ScreenshotError as exc:
log(f"Error taking screenshot of {url}: {exc}", file=sys.stderr)
@@ -925,23 +1041,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
traceback.print_exc(file=sys.stderr)
exit_code = 1
try:
if local_progress_ui is not None:
try:
local_progress_ui.finish_pipe(0, force_complete=True)
except Exception:
pass
finally:
if local_progress_ui is not None:
try:
local_progress_ui.stop()
except Exception:
pass
try:
if hasattr(pipeline_context, "set_live_progress"):
pipeline_context.set_live_progress(None)
except Exception:
pass
progress.close_local_ui(force_complete=True)
if not all_emitted:
log(f"No screenshots were successfully captured", file=sys.stderr)