dfd
@@ -47,6 +47,210 @@ except ImportError:
    extract_title = None


def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
    """Deduplicate tags case-insensitively while preserving order."""
    out: List[str] = []
    seen: set[str] = set()
    for t in tags or []:
        if not isinstance(t, str):
            continue
        s = t.strip()
        if not s:
            continue
        key = s.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(s)
    return out
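
# Illustrative example (not in the diff): the first occurrence wins and keeps its
# original casing; blanks and case-insensitive duplicates are dropped.
#
#     _dedup_tags_preserve_order(["Rock", " rock ", "", "Jazz"])
#     -> ["Rock", "Jazz"]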


def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
    """Extract subtitle availability tags from a yt-dlp info dict.

    Produces multi-valued tags so languages can coexist:
    - subs:<lang>
    - subs_auto:<lang>
    """
    def _langs(value: Any) -> List[str]:
        if not isinstance(value, dict):
            return []
        langs: List[str] = []
        for k in value.keys():
            if not isinstance(k, str):
                continue
            lang = k.strip().lower()
            if lang:
                langs.append(lang)
        return sorted(set(langs))

    out: List[str] = []
    for lang in _langs(info.get("subtitles")):
        out.append(f"subs:{lang}")
    for lang in _langs(info.get("automatic_captions")):
        out.append(f"subs_auto:{lang}")
    return out
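
# Illustrative sketch (hypothetical info dict): language keys are lower-cased,
# de-duplicated and sorted per namespace, so several languages can coexist.
#
#     _extract_subtitle_tags({
#         "subtitles": {"en": [], "ES": []},
#         "automatic_captions": {"en": []},
#     })
#     -> ["subs:en", "subs:es", "subs_auto:en"]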


def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
    """Fetch a yt-dlp info dict without downloading media."""
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    # Prefer the Python module when available (faster, avoids shell quoting issues).
    try:
        import yt_dlp  # type: ignore
        opts: Any = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "noprogress": True,
            "socket_timeout": 15,
            "retries": 1,
            "playlist_items": "1-10",
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=False)
            return info if isinstance(info, dict) else None
    except Exception:
        pass

    # Fallback to yt-dlp CLI if the module isn't available.
    try:
        import json as json_module
        cmd = [
            "yt-dlp",
            "-J",
            "--no-warnings",
            "--skip-download",
            "--playlist-items",
            "1-10",
            url,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            return None
        payload = (result.stdout or "").strip()
        if not payload:
            return None
        data = json_module.loads(payload)
        return data if isinstance(data, dict) else None
    except Exception:
        return None
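
# Usage sketch (hypothetical URL): returns the yt-dlp info dict on success, or
# None when the URL is empty, unsupported, or both the module and the CLI
# fallback fail. Keys such as "title" or "subtitles" come straight from yt-dlp
# and are not guaranteed for every extractor.
#
#     info = _scrape_ytdlp_info("https://example.com/watch?v=abc123")
#     if info is not None:
#         print(info.get("title"))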


def _resolve_candidate_urls_for_item(
    result: Any,
    backend: Any,
    file_hash: str,
    config: Dict[str, Any],
) -> List[str]:
    """Get candidate URLs from backend and/or piped result."""
    try:
        from metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    urls: List[str] = []
    # 1) Backend URL association (best source of truth)
    try:
        backend_urls = backend.get_url(file_hash, config=config)
        if backend_urls:
            if normalize_urls:
                urls.extend(normalize_urls(backend_urls))
            else:
                urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
    except Exception:
        pass

    # 2) Backend metadata url field
    try:
        meta = backend.get_metadata(file_hash, config=config)
        if isinstance(meta, dict) and meta.get("url"):
            if normalize_urls:
                urls.extend(normalize_urls(meta.get("url")))
            else:
                raw = meta.get("url")
                if isinstance(raw, list):
                    urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
                elif isinstance(raw, str) and raw.strip():
                    urls.append(raw.strip())
    except Exception:
        pass

    # 3) Piped result fields
    def _get(obj: Any, key: str, default: Any = None) -> Any:
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    for key in ("url", "webpage_url", "source_url", "target"):
        val = _get(result, key, None)
        if not val:
            continue
        if normalize_urls:
            urls.extend(normalize_urls(val))
            continue
        if isinstance(val, str) and val.strip():
            urls.append(val.strip())
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])

    meta_field = _get(result, "metadata", None)
    if isinstance(meta_field, dict) and meta_field.get("url"):
        val = meta_field.get("url")
        if normalize_urls:
            urls.extend(normalize_urls(val))
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
        elif isinstance(val, str) and val.strip():
            urls.append(val.strip())

    # Dedup
    return _dedup_tags_preserve_order(urls)
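
# Note on ordering (as implemented above): backend URL associations are collected
# first, then the backend metadata "url" field, then URL-ish fields on the piped
# result; the final dedup keeps the first occurrence, so backend-associated URLs
# take precedence when the same URL shows up in more than one place.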


def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
    """Pick the first URL that looks supported by yt-dlp (best effort)."""
    if not urls:
        return None

    def _is_hydrus_file_url(u: str) -> bool:
        text = str(u or "").strip().lower()
        if not text:
            return False
        # Hydrus-local file URLs are retrievable blobs, not original source pages.
        # yt-dlp generally can't extract meaningful metadata from these.
        return ("/get_files/file" in text) and ("hash=" in text)

    http_urls: List[str] = []
    for u in urls:
        text = str(u or "").strip()
        if text.lower().startswith(("http://", "https://")):
            http_urls.append(text)

    # Prefer non-Hydrus URLs for yt-dlp scraping.
    candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
    if not candidates:
        return None

    # Prefer a true support check when the Python module is available.
    try:
        from SYS.download import is_url_supported_by_ytdlp
        for text in candidates:
            try:
                if is_url_supported_by_ytdlp(text):
                    return text
            except Exception:
                continue
    except Exception:
        pass

    # Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
    return candidates[0] if candidates else None
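
# Illustrative sketch (hypothetical URLs): Hydrus-local blob URLs are filtered out
# before any support check, so only the original page URL is considered. None is
# returned only when no non-Hydrus http(s) candidate exists at all.
#
#     _pick_supported_ytdlp_url([
#         "https://hydrus.local/get_files/file?hash=deadbeef",  # skipped: Hydrus blob URL
#         "https://example.com/watch?v=abc123",                 # kept as candidate
#     ])
#     -> "https://example.com/watch?v=abc123"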

_scrape_isbn_metadata = _ol_scrape_isbn_metadata  # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata  # type: ignore[assignment]

@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    scrape_url = parsed_args.get("scrape")
    scrape_requested = scrape_flag_present or scrape_url is not None

    if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
        # Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
        if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
            scrape_url = "ytdlp"
            scrape_requested = True

    if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
        log("-scrape requires a URL or provider name", file=sys.stderr)
        return 1

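    # Usage sketch (assumed CLI syntax): with this default in place, giving the flag
    # without a value behaves like the store-backed yt-dlp mode, so the two
    # invocations below are treated the same way:
    #
    #     ... | get-tag -scrape
    #     ... | get-tag -scrape ytdlp
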
@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    if scrape_requested and scrape_url:
        import json as json_module

        if str(scrape_url).strip().lower() == "ytdlp":
            # Scrape metadata from the selected item's URL via yt-dlp (no download),
            # then OVERWRITE all existing tags (including title:).
            #
            # This mode requires a store-backed item (hash + store).
            #
            # NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
            # performs namespace deduplication that would collapse multi-valued tags.
            file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
            store_name = get_field(result, "store", None)
            subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
            item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

            # Only run overwrite-apply when the item is store-backed.
            # If this is a URL-only PipeObject, fall through to provider mode below.
            if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
                try:
                    from Store import Store
                    storage = Store(config)
                    backend = storage[str(store_name)]
                except Exception as exc:
                    log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
                    return 1

                candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
                scrape_target = _pick_supported_ytdlp_url(candidate_urls)
                if not scrape_target:
                    log(
                        "No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored).",
                        file=sys.stderr,
                    )
                    log(
                        "Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
                        file=sys.stderr,
                    )
                    return 1

                info = _scrape_ytdlp_info(scrape_target)
                if not info:
                    log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
                    return 1

                try:
                    from metadata import extract_ytdlp_tags
                except Exception:
                    extract_ytdlp_tags = None  # type: ignore[assignment]

                # Prefer the top-level metadata, but if this is a playlist container, use
                # the first entry for per-item fields like subtitles.
                info_for_subs = info
                entries = info.get("entries") if isinstance(info, dict) else None
                if isinstance(entries, list) and entries:
                    first = entries[0]
                    if isinstance(first, dict):
                        info_for_subs = first

                tags: List[str] = []
                if extract_ytdlp_tags:
                    try:
                        tags.extend(extract_ytdlp_tags(info))
                    except Exception:
                        pass

                # Subtitle availability tags
                try:
                    tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
                except Exception:
                    pass

                # Ensure we actually have something to apply.
                tags = _dedup_tags_preserve_order(tags)
                if not tags:
                    log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                    return 1

                # Full overwrite: delete all existing tags, then add the new set.
                try:
                    existing_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    existing_tags = []
                try:
                    if existing_tags:
                        backend.delete_tag(file_hash, list(existing_tags), config=config)
                except Exception as exc:
                    debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")
                try:
                    backend.add_tag(file_hash, list(tags), config=config)
                except Exception as exc:
                    log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
                    return 1

                # Show updated tags
                try:
                    updated_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    updated_tags = tags
                if not updated_tags:
                    updated_tags = tags

                _emit_tags_as_table(
                    tags_list=list(updated_tags),
                    file_hash=file_hash,
                    store=str(store_name),
                    service_name=None,
                    config=config,
                    item_title=str(item_title or "ytdlp"),
                    path=str(subject_path) if subject_path else None,
                    subject={
                        "hash": file_hash,
                        "store": str(store_name),
                        "path": str(subject_path) if subject_path else None,
                        "title": item_title,
                        "extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
                    },
                )
                return 0

        if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
            # URL scraping (existing behavior)
            title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        else:
            combined_query = f"{title_hint} {artist_hint}"

        query_hint = identifier_query or combined_query or title_hint
        # yt-dlp isn't a search provider; it requires a URL.
        url_hint: Optional[str] = None
        if provider.name == "ytdlp":
            raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
            if isinstance(raw_url, list) and raw_url:
                raw_url = raw_url[0]
            if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
                url_hint = raw_url.strip()

        query_hint = url_hint or identifier_query or combined_query or title_hint
        if not query_hint:
            log("No title or identifier available to search for metadata", file=sys.stderr)
            return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        if not items:
            log("No metadata results found", file=sys.stderr)
            return 1

        # For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
        if provider.name == "ytdlp":
            try:
                tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
            except Exception:
                tags = []
            if not tags:
                log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                return 1
            _emit_tags_as_table(
                tags_list=list(tags),
                file_hash=None,
                store="url",
                service_name=None,
                config=config,
                item_title=str(items[0].get("title") or "ytdlp"),
                path=None,
                subject={"provider": "ytdlp", "url": str(query_hint)},
            )
            return 0

        from result_table import ResultTable
        table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            return 0

        # Apply tags to the store backend (no sidecar writing here).
        apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if str(result_provider).strip().lower() == "ytdlp":
            apply_tags = [str(t) for t in result_tags if t is not None]
        else:
            apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if not apply_tags:
            log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
            return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
    _SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]

# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
    _SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)
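
# Resulting choices, assuming the fallback list above is used: appending "ytdlp"
# and sorting yields
#
#     ["google", "googlebooks", "itunes", "musicbrainz", "openlibrary", "ytdlp"]

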
class Get_Tag(Cmdlet):
    """Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
        CmdletArg(
            name="-scrape",
            type="string",
            description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
            description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
            required=False,
            choices=_SCRAPE_CHOICES,
        )