This commit is contained in:
nose
2025-12-22 02:11:53 -08:00
parent d0b821b5dd
commit 16316bb3fd
20 changed files with 4218 additions and 2422 deletions


@@ -47,6 +47,210 @@ except ImportError:
    extract_title = None


def _dedup_tags_preserve_order(tags: List[str]) -> List[str]:
    """Deduplicate tags case-insensitively while preserving order."""
    out: List[str] = []
    seen: set[str] = set()
    for t in tags or []:
        if not isinstance(t, str):
            continue
        s = t.strip()
        if not s:
            continue
        key = s.lower()
        if key in seen:
            continue
        seen.add(key)
        out.append(s)
    return out

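# Illustrative check (not part of the commit): first-seen casing wins, and
# case-insensitive duplicates and blank entries are dropped.
# _dedup_tags_preserve_order(["Rock", "rock ", "", "pop", "Pop"])  # -> ["Rock", "pop"]
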
def _extract_subtitle_tags(info: Dict[str, Any]) -> List[str]:
    """Extract subtitle availability tags from a yt-dlp info dict.

    Produces multi-valued tags so languages can coexist:
    - subs:<lang>
    - subs_auto:<lang>
    """
    def _langs(value: Any) -> List[str]:
        if not isinstance(value, dict):
            return []
        langs: List[str] = []
        for k in value.keys():
            if not isinstance(k, str):
                continue
            lang = k.strip().lower()
            if lang:
                langs.append(lang)
        return sorted(set(langs))

    out: List[str] = []
    for lang in _langs(info.get("subtitles")):
        out.append(f"subs:{lang}")
    for lang in _langs(info.get("automatic_captions")):
        out.append(f"subs_auto:{lang}")
    return out

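# Illustrative check (not part of the commit): language keys are lower-cased,
# deduplicated, and sorted within each namespace.
# _extract_subtitle_tags({"subtitles": {"en": [], "De": []}, "automatic_captions": {"en": []}})
#   -> ["subs:de", "subs:en", "subs_auto:en"]
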
def _scrape_ytdlp_info(url: str) -> Optional[Dict[str, Any]]:
    """Fetch a yt-dlp info dict without downloading media."""
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    # Prefer the Python module when available (faster, avoids shell quoting issues).
    try:
        import yt_dlp  # type: ignore

        opts: Any = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "noprogress": True,
            "socket_timeout": 15,
            "retries": 1,
            "playlist_items": "1-10",
        }
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=False)
        return info if isinstance(info, dict) else None
    except Exception:
        pass

    # Fall back to the yt-dlp CLI if the module isn't available.
    try:
        import json as json_module

        cmd = [
            "yt-dlp",
            "-J",
            "--no-warnings",
            "--skip-download",
            "--playlist-items",
            "1-10",
            url,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        if result.returncode != 0:
            return None
        payload = (result.stdout or "").strip()
        if not payload:
            return None
        data = json_module.loads(payload)
        return data if isinstance(data, dict) else None
    except Exception:
        return None

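# Illustrative usage (not part of the commit; the URL is a placeholder):
# info = _scrape_ytdlp_info("https://www.youtube.com/watch?v=<id>")
# Returns the info dict (playlists capped at the first 10 entries) or None on failure.
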
def _resolve_candidate_urls_for_item(
    result: Any,
    backend: Any,
    file_hash: str,
    config: Dict[str, Any],
) -> List[str]:
    """Get candidate URLs from backend and/or piped result."""
    try:
        from metadata import normalize_urls
    except Exception:
        normalize_urls = None  # type: ignore[assignment]

    urls: List[str] = []

    # 1) Backend URL association (best source of truth)
    try:
        backend_urls = backend.get_url(file_hash, config=config)
        if backend_urls:
            if normalize_urls:
                urls.extend(normalize_urls(backend_urls))
            else:
                urls.extend([str(u).strip() for u in backend_urls if isinstance(u, str) and str(u).strip()])
    except Exception:
        pass

    # 2) Backend metadata url field
    try:
        meta = backend.get_metadata(file_hash, config=config)
        if isinstance(meta, dict) and meta.get("url"):
            if normalize_urls:
                urls.extend(normalize_urls(meta.get("url")))
            else:
                raw = meta.get("url")
                if isinstance(raw, list):
                    urls.extend([str(u).strip() for u in raw if isinstance(u, str) and str(u).strip()])
                elif isinstance(raw, str) and raw.strip():
                    urls.append(raw.strip())
    except Exception:
        pass

    # 3) Piped result fields
    def _get(obj: Any, key: str, default: Any = None) -> Any:
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    for key in ("url", "webpage_url", "source_url", "target"):
        val = _get(result, key, None)
        if not val:
            continue
        if normalize_urls:
            urls.extend(normalize_urls(val))
            continue
        if isinstance(val, str) and val.strip():
            urls.append(val.strip())
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])

    meta_field = _get(result, "metadata", None)
    if isinstance(meta_field, dict) and meta_field.get("url"):
        val = meta_field.get("url")
        if normalize_urls:
            urls.extend(normalize_urls(val))
        elif isinstance(val, list):
            urls.extend([str(u).strip() for u in val if isinstance(u, str) and str(u).strip()])
        elif isinstance(val, str) and val.strip():
            urls.append(val.strip())

    # Dedup
    return _dedup_tags_preserve_order(urls)

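# Illustrative sketch (assumed values, not part of the commit): if the backend URL
# association and the piped result both carry "https://example.com/v/1", the duplicate
# collapses and the helper returns that URL once, preserving first-seen order otherwise.
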
def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
    """Pick the first URL that looks supported by yt-dlp (best effort)."""
    if not urls:
        return None

    def _is_hydrus_file_url(u: str) -> bool:
        text = str(u or "").strip().lower()
        if not text:
            return False
        # Hydrus-local file URLs are retrievable blobs, not original source pages.
        # yt-dlp generally can't extract meaningful metadata from these.
        return ("/get_files/file" in text) and ("hash=" in text)

    http_urls: List[str] = []
    for u in urls:
        text = str(u or "").strip()
        if text.lower().startswith(("http://", "https://")):
            http_urls.append(text)

    # Prefer non-Hydrus URLs for yt-dlp scraping.
    candidates = [u for u in http_urls if not _is_hydrus_file_url(u)]
    if not candidates:
        return None

    # Prefer a true support check when the Python module is available.
    try:
        from SYS.download import is_url_supported_by_ytdlp

        for text in candidates:
            try:
                if is_url_supported_by_ytdlp(text):
                    return text
            except Exception:
                continue
    except Exception:
        pass

    # Fallback: use the first non-Hydrus http(s) URL and let extraction decide.
    return candidates[0] if candidates else None

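# Illustrative check (assumed values, not part of the commit): a Hydrus blob URL is
# skipped in favor of the original page URL.
# _pick_supported_ytdlp_url([
#     "http://127.0.0.1:45869/get_files/file?hash=abc123",
#     "https://www.youtube.com/watch?v=<id>",
# ])  # -> the youtube URL (via is_url_supported_by_ytdlp when importable,
#     #    otherwise as the first non-Hydrus candidate)
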
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -853,7 +1057,12 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    scrape_url = parsed_args.get("scrape")
    scrape_requested = scrape_flag_present or scrape_url is not None
-   if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
    # Convenience: `-scrape` with no value defaults to `ytdlp` (store-backed URL scrape).
    if scrape_flag_present and (scrape_url is None or str(scrape_url).strip() == ""):
        scrape_url = "ytdlp"
        scrape_requested = True
    if scrape_requested and (scrape_url is None or str(scrape_url).strip() == ""):
        log("-scrape requires a URL or provider name", file=sys.stderr)
        return 1
@@ -861,6 +1070,123 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    if scrape_requested and scrape_url:
        import json as json_module

        if str(scrape_url).strip().lower() == "ytdlp":
            # Scrape metadata from the selected item's URL via yt-dlp (no download),
            # then OVERWRITE all existing tags (including title:).
            #
            # This mode requires a store-backed item (hash + store).
            #
            # NOTE: We intentionally do not reuse _scrape_url_metadata() here because it
            # performs namespace deduplication that would collapse multi-valued tags.
            file_hash = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash", None))
            store_name = get_field(result, "store", None)
            subject_path = get_field(result, "path", None) or get_field(result, "target", None) or get_field(result, "filename", None)
            item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

            # Only run overwrite-apply when the item is store-backed.
            # If this is a URL-only PipeObject, fall through to provider mode below.
            if file_hash and store_name and str(file_hash).strip().lower() != "unknown" and str(store_name).strip().upper() not in {"PATH", "URL"}:
                try:
                    from Store import Store

                    storage = Store(config)
                    backend = storage[str(store_name)]
                except Exception as exc:
                    log(f"Failed to resolve store backend '{store_name}': {exc}", file=sys.stderr)
                    return 1

                candidate_urls = _resolve_candidate_urls_for_item(result, backend, file_hash, config)
                scrape_target = _pick_supported_ytdlp_url(candidate_urls)
                if not scrape_target:
                    log(
                        "No yt-dlp-supported source URL found for this item (Hydrus /get_files/file URLs are ignored). ",
                        file=sys.stderr,
                    )
                    log(
                        "Add the original page URL to the file (e.g. via add-url), then retry get-tag -scrape.",
                        file=sys.stderr,
                    )
                    return 1

                info = _scrape_ytdlp_info(scrape_target)
                if not info:
                    log("yt-dlp could not extract metadata for this URL (unsupported or failed)", file=sys.stderr)
                    return 1

                try:
                    from metadata import extract_ytdlp_tags
                except Exception:
                    extract_ytdlp_tags = None  # type: ignore[assignment]

                # Prefer the top-level metadata, but if this is a playlist container, use
                # the first entry for per-item fields like subtitles.
                info_for_subs = info
                entries = info.get("entries") if isinstance(info, dict) else None
                if isinstance(entries, list) and entries:
                    first = entries[0]
                    if isinstance(first, dict):
                        info_for_subs = first

                tags: List[str] = []
                if extract_ytdlp_tags:
                    try:
                        tags.extend(extract_ytdlp_tags(info))
                    except Exception:
                        pass

                # Subtitle availability tags
                try:
                    tags.extend(_extract_subtitle_tags(info_for_subs if isinstance(info_for_subs, dict) else {}))
                except Exception:
                    pass

                # Ensure we actually have something to apply.
                tags = _dedup_tags_preserve_order(tags)
                if not tags:
                    log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                    return 1

                # Full overwrite: delete all existing tags, then add the new set.
                try:
                    existing_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    existing_tags = []
                try:
                    if existing_tags:
                        backend.delete_tag(file_hash, list(existing_tags), config=config)
                except Exception as exc:
                    debug(f"[get_tag] ytdlp overwrite: delete_tag failed: {exc}")

                try:
                    backend.add_tag(file_hash, list(tags), config=config)
                except Exception as exc:
                    log(f"Failed to apply yt-dlp tags: {exc}", file=sys.stderr)
                    return 1

                # Show updated tags
                try:
                    updated_tags, _src = backend.get_tag(file_hash, config=config)
                except Exception:
                    updated_tags = tags
                if not updated_tags:
                    updated_tags = tags

                _emit_tags_as_table(
                    tags_list=list(updated_tags),
                    file_hash=file_hash,
                    store=str(store_name),
                    service_name=None,
                    config=config,
                    item_title=str(item_title or "ytdlp"),
                    path=str(subject_path) if subject_path else None,
                    subject={
                        "hash": file_hash,
                        "store": str(store_name),
                        "path": str(subject_path) if subject_path else None,
                        "title": item_title,
                        "extra": {"applied_provider": "ytdlp", "scrape_url": scrape_target},
                    },
                )
                return 0
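            # Illustrative flow for the store-backed branch above (assumed CLI shape,
            # not part of this commit): piping a store-backed item into `get-tag -scrape`
            # (or `-scrape ytdlp`) resolves its source page URL, scrapes yt-dlp metadata
            # without downloading, and replaces the item's existing tags with the freshly
            # extracted set before printing the updated tag table.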
        if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
            # URL scraping (existing behavior)
            title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
@@ -951,7 +1277,16 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        else:
            combined_query = f"{title_hint} {artist_hint}"
-       query_hint = identifier_query or combined_query or title_hint
        # yt-dlp isn't a search provider; it requires a URL.
        url_hint: Optional[str] = None
        if provider.name == "ytdlp":
            raw_url = get_field(result, "url", None) or get_field(result, "source_url", None) or get_field(result, "target", None)
            if isinstance(raw_url, list) and raw_url:
                raw_url = raw_url[0]
            if isinstance(raw_url, str) and raw_url.strip().startswith(("http://", "https://")):
                url_hint = raw_url.strip()
        query_hint = url_hint or identifier_query or combined_query or title_hint

        if not query_hint:
            log("No title or identifier available to search for metadata", file=sys.stderr)
            return 1
@@ -967,6 +1302,27 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        if not items:
            log("No metadata results found", file=sys.stderr)
            return 1

        # For yt-dlp, emit tags directly (there is no meaningful multi-result selection step).
        if provider.name == "ytdlp":
            try:
                tags = [str(t) for t in provider.to_tags(items[0]) if t is not None]
            except Exception:
                tags = []
            if not tags:
                log("No tags extracted from yt-dlp metadata", file=sys.stderr)
                return 1
            _emit_tags_as_table(
                tags_list=list(tags),
                file_hash=None,
                store="url",
                service_name=None,
                config=config,
                item_title=str(items[0].get("title") or "ytdlp"),
                path=None,
                subject={"provider": "ytdlp", "url": str(query_hint)},
            )
            return 0

        from result_table import ResultTable
        table = ResultTable(f"Metadata: {provider.name}")
@@ -1040,7 +1396,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
            return 0

        # Apply tags to the store backend (no sidecar writing here).
-       apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if str(result_provider).strip().lower() == "ytdlp":
            apply_tags = [str(t) for t in result_tags if t is not None]
        else:
            apply_tags = _filter_scraped_tags([str(t) for t in result_tags if t is not None])
        if not apply_tags:
            log("No applicable scraped tags to apply (title:/artist:/source: are skipped)", file=sys.stderr)
            return 0
@@ -1167,6 +1526,11 @@ try:
except Exception:
    _SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]

# Special scrape mode: pull tags from an item's URL via yt-dlp (no download)
if "ytdlp" not in _SCRAPE_CHOICES:
    _SCRAPE_CHOICES.append("ytdlp")
_SCRAPE_CHOICES = sorted(_SCRAPE_CHOICES)
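
# Illustrative result (assuming the fallback list above is in effect): the user-facing
# choices become ["google", "googlebooks", "itunes", "musicbrainz", "openlibrary", "ytdlp"].
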
class Get_Tag(Cmdlet):
"""Class-based get-tag cmdlet with self-registration."""
@@ -1195,7 +1559,7 @@ class Get_Tag(Cmdlet):
        CmdletArg(
            name="-scrape",
            type="string",
-           description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
            description="Scrape metadata from URL/provider, or use 'ytdlp' to scrape from the item's URL and overwrite tags",
            required=False,
            choices=_SCRAPE_CHOICES,
        )