This commit is contained in:
nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions


@@ -47,6 +47,210 @@ except ImportError:
    extract_title = None


def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
    """Deduplicate tags case-insensitively while preserving order."""
    out: List[str] = []
    seen: set[str] = set()
    for t in tags or []:
        if not isinstance(t, str):
            continue
        s = t.strip()
        if not s:
            continue
        key = s.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(s)
    return out

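# Illustrative check (not part of the commit): first-seen casing wins, and
# case-insensitive duplicates and blank entries are dropped.
# _dedup_tags_preserve_order(["Rock", "rock ", "", "pop", "Pop"])  # -> ["Rock", "pop"]
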
def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
    """Extract subtitle availability tags from a yt-dlp info dict.

    Produces multi-valued tags so languages can coexist:
    - subs:<lang>
    - subs_auto:<lang>
    """
    def _langs(value: Any) -> List[str]:
        if not isinstance(value, dict):
            return []
        langs: List[str] = []
        for k in value.keys():
            if not isinstance(k, str):
                continue
            lang = k.strip().lower()
            if lang:
                langs.append(lang)
        return sorted(set(langs))

    out: List[str] = []
    for lang in _langs(info.get("subtitles")):
        out.append(f"subs:{lang}")
    for lang in _langs(info.get("automatic_captions")):
        out.append(f"subs_auto:{lang}")
    return out

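# Illustrative check (not part of the commit): language keys are lower-cased,
# deduplicated, and sorted within each namespace.
# _extract_subtitle_tags({"subtitles": {"en": [], "De": []}, "automatic_captions": {"en": []}})
#   -> ["subs:de", "subs:en", "subs_auto:en"]
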
def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
    """Fetch a yt-dlp info dict without downloading media."""
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    # Prefer the Python module when available (faster, avoids shell quoting issues).
    try:
        import yt_dlp  # type: ignore

        opts: Any = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "noprogress": True,
            "socket_timeout": 15,
            "retries": 1,
            "playlist_items": "1-10",
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=False)
        return info if isinstance(info, dict) else None
    except Exception:
        pass

    # Fall back to the yt-dlp CLI if the module isn't available.
    try:
        import json as json_module

        cmd = [
            "yt-dlp",
            "-J",
            "--no-warnings",
            "--skip-download",
            "--playlist-items",
            "1-10",
            url,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            return None
        payload = (result.stdout or "").strip()
        if not payload:
            return None
        data = json_module.loads(payload)
        return data if isinstance(data, dict) else None
    except Exception:
        return None

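# Illustrative usage (not part of the commit; the URL is a placeholder):
# info = _scrape_ytdlp_info("https://www.youtube.com/watch?v=<id>")
# Returns the info dict (playlists capped at the first 10 entries) or None on failure.
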
def _resolve_candidate_urls_for_item(
    result: Any,
    backend: Any,
    file_hash: str,
    config: Dict[str, Any],
) -> List[str]:
    """Get candidate URLs from backend and/or piped result."""
    try:
        from metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    urls: List[str] = []

    # 1) Backend URL association (best source of truth)
    try:
        backend_urls = backend.get_url(file_hash, config=config)
        if backend_urls:
            if normalize_urls:
                urls.extend(normalize_urls(backend_urls))
            else:
                urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
    except Exception:
        pass

    # 2) Backend metadata url field
    try:
        meta = backend.get_metadata(file_hash, config=config)
        if isinstance(meta, dict) and meta.get("url"):
            if normalize_urls:
                urls.extend(normalize_urls(meta.get("url")))
            else:
                raw = meta.get("url")
                if isinstance(raw, list):
                    urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
                elif isinstance(raw, str) and raw.strip():
                    urls.append(raw.strip())
    except Exception:
        pass

    # 3) Piped result fields
    def _get(obj: Any, key: str, default: Any = None) -> Any:
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    for key in ("url", "webpage_url", "source_url", "target"):
        val = _get(result, key, None)
        if not val:
            continue
        if normalize_urls:
            urls.extend(normalize_urls(val))
            continue
        if isinstance(val, str) and val.strip():
            urls.append(val.strip())
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])

    meta_field = _get(result, "metadata", None)
    if isinstance(meta_field, dict) and meta_field.get("url"):
        val = meta_field.get("url")
        if normalize_urls:
            urls.extend(normalize_urls(val))
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
        elif isinstance(val, str) and val.strip():
            urls.append(val.strip())

    # Dedup
    return _dedup_tags_preserve_order(urls)

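# Illustrative sketch (assumed values, not part of the commit): if the backend URL
# association and the piped result both carry "https://example.com/v/1", the duplicate
# collapses and the helper returns that URL once, preserving first-seen order otherwise.
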
def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
    """Pick the first URL that looks supported by yt-dlp (best effort)."""
    if not urls:
        return None

    def _is_hydrus_file_url(u: str) -> bool:
        text = str(u or "").strip().lower()
        if not text:
            return False
        # Hydrus-local file URLs are retrievable blobs, not original source pages.
        # yt-dlp generally can't extract meaningful metadata from these.
        return ("/get_files/file" in text) and ("hash=" in text)

    http_urls: List[str] = []
    for u in urls:
        text = str(u or "").strip()
        if text.lower().startswith(("http://", "https://")):
            http_urls.append(text)

    # Prefer non-Hydrus URLs for yt-dlp scraping.
    candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
    if not candidates:
        return None

    # Prefer a true support check when the Python module is available.
    try:
        from SYS.download import is_url_supported_by_ytdlp

        for text in candidates:
            try:
                if is_url_supported_by_ytdlp(text):
                    return text
            except Exception:
                continue
    except Exception:
        pass

    # Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
    return candidates[0] if candidates else None

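# Illustrative check (assumed values, not part of the commit): a Hydrus blob URL is
# skipped in favor of the original page URL.
# _pick_supported_ytdlp_url([
#     "http://127.0.0.1:45869/get_files/file?hash=abc123",
#     "https://www.youtube.com/watch?v=<id>",
# ])  # -> the youtube URL (via is_url_supported_by_ytdlp when importable,
#     #    otherwise as the first non-Hydrus candidate)
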
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    scrape_url = parsed_args.get("scrape")
    scrape_requested = scrape_flag_present or scrape_url is not None
-   if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
    # Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
    if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
        scrape_url = "ytdlp"
        scrape_requested = True
    if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
        log("-scrape requires a URL or provider name", file=sys.stderr)
        return 1
@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    if scrape_requested and scrape_url:
        import json as json_module

        if str(scrape_url).strip().lower() == "ytdlp":
            # Scrape metadata from the selected item's URL via yt-dlp (no download),
            # then OVERWRITE all existing tags (including title:).
            #
            # This mode requires a store-backed item (hash + store).
            #
            # NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
            # performs namespace deduplication that would collapse multi-valued tags.
            file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
            store_name = get_field(result, "store", None)
            subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
            item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

            # Only run overwrite-apply when the item is store-backed.
            # If this is a URL-only PipeObject, fall through to provider mode below.
            if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
                try:
                    from Store import Store

                    storage = Store(config)
                    backend = storage[str(store_name)]
                except Exception as exc:
                    log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
                    return 1

                candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
                scrape_target = _pick_supported_ytdlp_url(candidate_urls)
                if not scrape_target:
                    log(
                        "No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ",
                        file=sys.stderr,
                    )
                    log(
                        "Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
                        file=sys.stderr,
                    )
                    return 1

                info = _scrape_ytdlp_info(scrape_target)
                if not info:
                    log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
                    return 1

                try:
                    from metadata import extract_ytdlp_tags
                except Exception:
                    extract_ytdlp_tags = None  # type: ignore[assignment]

                # Prefer the top-level metadata, but if this is a playlist container, use
                # the first entry for per-item fields like subtitles.
                info_for_subs = info
                entries = info.get("entries") if isinstance(info, dict) else None
                if isinstance(entries, list) and entries:
                    first = entries[0]
                    if isinstance(first, dict):
                        info_for_subs = first

                tags: List[str] = []
                if extract_ytdlp_tags:
                    try:
                        tags.extend(extract_ytdlp_tags(info))
                    except Exception:
                        pass

                # Subtitle availability tags
                try:
                    tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
                except Exception:
                    pass

                # Ensure we actually have something to apply.
                tags = _dedup_tags_preserve_order(tags)
                if not tags:
                    log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                    return 1

                # Full overwrite: delete all existing tags, then add the new set.
                try:
                    existing_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    existing_tags = []
                try:
                    if existing_tags:
                        backend.delete_tag(file_hash, list(existing_tags), config=config)
                except Exception as exc:
                    debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")

                try:
                    backend.add_tag(file_hash, list(tags), config=config)
                except Exception as exc:
                    log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
                    return 1

                # Show updated tags
                try:
                    updated_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    updated_tags = tags
                if not updated_tags:
                    updated_tags = tags

                _emit_tags_as_table(
                    tags_list=list(updated_tags),
                    file_hash=file_hash,
                    store=str(store_name),
                    service_name=None,
                    config=config,
                    item_title=str(item_title or "ytdlp"),
                    path=str(subject_path) if subject_path else None,
                    subject={
                        "hash": file_hash,
                        "store": str(store_name),
                        "path": str(subject_path) if subject_path else None,
                        "title": item_title,
                        "extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
                    },
                )
                return 0
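            # Illustrative flow for the store-backed branch above (assumed CLI shape,
            # not part of this commit): piping a store-backed item into `get-tag -scrape`
            # (or `-scrape ytdlp`) resolves its source page URL, scrapes yt-dlp metadata
            # without downloading, and replaces the item's existing tags with the freshly
            # extracted set before printing the updated tag table.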
        if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
            # URL scraping (existing behavior)
            title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        else:
            combined_query = f"{title_hint} {artist_hint}"
-       query_hint = identifier_query or combined_query or title_hint
        # yt-dlp isn't a search provider; it requires a URL.
        url_hint: Optional[str] = None
        if provider.name == "ytdlp":
            raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
            if isinstance(raw_url, list) and raw_url:
                raw_url = raw_url[0]
            if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
                url_hint = raw_url.strip()
        query_hint = url_hint or identifier_query or combined_query or title_hint

        if not query_hint:
            log("No title or identifier available to search for metadata", file=sys.stderr)
            return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        if not items:
            log("No metadata results found", file=sys.stderr)
            return 1

        # For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
        if provider.name == "ytdlp":
            try:
                tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
            except Exception:
                tags = []
            if not tags:
                log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                return 1
            _emit_tags_as_table(
                tags_list=list(tags),
                file_hash=None,
                store="url",
                service_name=None,
                config=config,
                item_title=str(items[0].get("title") or "ytdlp"),
                path=None,
                subject={"provider": "ytdlp", "url": str(query_hint)},
            )
            return 0

        from result_table import ResultTable
        table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            return 0

        # Apply tags to the store backend (no sidecar writing here).
-       apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if str(result_provider).strip().lower() == "ytdlp":
            apply_tags = [str(t) for t in result_tags if t is not None]
        else:
            apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if not apply_tags:
            log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
            return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
    _SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]

# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
    _SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)
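
# Illustrative result (assuming the fallback list above is in effect): the user-facing
# choices become ["google", "googlebooks", "itunes", "musicbrainz", "openlibrary", "ytdlp"].
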
class Get_Tag(Cmdlet):
"""Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
        CmdletArg(
            name="-scrape",
            type="string",
-           description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
            description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
            required=False,
            choices=_SCRAPE_CHOICES,
        )