2025-11-25 20:09:33 -08:00
|
|
|
"""Get tags from Hydrus or local sidecar metadata.
|
|
|
|
|
|
|
|
|
|
This cmdlet retrieves tags for a selected result, supporting both:
|
|
|
|
|
- Hydrus Network (for files with hash_hex)
|
|
|
|
|
- Local sidecar files (.tags)
|
|
|
|
|
|
|
|
|
|
In interactive mode: navigate with numbers, add/delete tags
|
|
|
|
|
In pipeline mode: display tags as read-only table, emit as structured JSON
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
|
2025-12-07 00:21:30 -08:00
|
|
|
from helper.logger import log, debug
|
|
|
|
|
from helper.metadata_search import get_metadata_provider, list_metadata_providers
|
2025-11-25 20:09:33 -08:00
|
|
|
import subprocess
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
|
|
|
|
|
|
|
|
import pipeline as ctx
|
|
|
|
|
from helper import hydrus
|
2025-12-11 12:47:30 -08:00
|
|
|
from helper.folder_store import read_sidecar, write_sidecar, find_sidecar, FolderDB
|
|
|
|
|
from ._shared import normalize_hash, looks_like_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field
|
2025-11-25 20:09:33 -08:00
|
|
|
from config import get_local_storage_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
from metadata import extract_title
|
|
|
|
|
except ImportError:
|
|
|
|
|
extract_title = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Tag item for ResultTable display and piping
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
@dataclass
class TagItem:
    """Tag item for display in ResultTable and piping to other cmdlets.

    Allows tags to be selected and piped like:
    - delete-tag @{3,4,9}  (delete tags at indices 3, 4, 9)
    - add-tag @"namespace:value"  (add this tag)

    Besides the declared fields, ``__post_init__`` sets the derived
    attributes ``origin``, ``detail``, ``target`` and ``media_kind``.
    """

    tag_name: str
    tag_index: int  # 1-based index for user reference
    hash_hex: Optional[str] = None
    source: str = "hydrus"
    service_name: Optional[str] = None
    file_path: Optional[str] = None

    def __post_init__(self):
        """Populate the standard fields ResultTable looks for."""
        # NOTE: Don't set 'title' - we want only the tag column in ResultTable
        self.origin = self.source
        self.detail = f"Tag #{self.tag_index}"
        self.target = self.tag_name
        self.media_kind = "tag"

    def to_dict(self) -> Dict[str, Any]:
        """Return every declared field as a plain dict for JSON serialization."""
        return {
            "tag_name": self.tag_name,
            "tag_index": self.tag_index,
            "hash_hex": self.hash_hex,
            "source": self.source,
            "service_name": self.service_name,
            # Previously omitted, which dropped the path of local-file tags.
            "file_path": self.file_path,
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _emit_tags_as_table(
    tags_list: List[str],
    hash_hex: Optional[str],
    source: str = "hydrus",
    service_name: Optional[str] = None,
    config: Dict[str, Any] = None,
    item_title: Optional[str] = None,
    file_path: Optional[str] = None,
    subject: Optional[Any] = None,
) -> None:
    """Wrap each tag in a TagItem, emit it, and register a one-column ResultTable.

    Every tag in ``tags_list`` becomes a ``TagItem`` (1-based ``tag_index``)
    that is added to the table and emitted to the pipeline.  The finished
    table is registered with the pipeline context so later ``@`` selections
    can refer to the rows.  ``config`` is accepted but not used here.
    """
    from result_table import ResultTable

    # Heading: "Tags" or "Tags: <title>", optionally followed by a short hash.
    heading = f"Tags: {item_title}" if item_title else "Tags"
    if hash_hex:
        heading = f"{heading} [{hash_hex[:8]}]"

    table = ResultTable(heading, max_columns=1)
    table.set_source_command("get-tag", [])

    rows = []
    for position, name in enumerate(tags_list, start=1):
        row = TagItem(
            tag_name=name,
            tag_index=position,
            hash_hex=hash_hex,
            source=source,
            service_name=service_name,
            file_path=file_path,
        )
        rows.append(row)
        table.add_result(row)
        # Also emit to the pipeline for downstream processing.
        ctx.emit(row)

    # Prefer the overlay setter so get-tag acts as a transient view that does
    # not push the previous search onto the history stack; fall back to the
    # plain setter when the pipeline module has no overlay support.
    try:
        ctx.set_last_result_table_overlay(table, rows, subject)
    except AttributeError:
        ctx.set_last_result_table(table, rows, subject)
    # Note: CLI will handle displaying the table via ResultTable formatting
|
|
|
|
|
def _summarize_tags(tags_list: List[str], limit: int = 8) -> str:
|
|
|
|
|
"""Create a summary of tags for display."""
|
|
|
|
|
shown = [t for t in tags_list[:limit] if t]
|
|
|
|
|
summary = ", ".join(shown)
|
|
|
|
|
remaining = max(0, len(tags_list) - len(shown))
|
|
|
|
|
if remaining > 0:
|
|
|
|
|
summary = f"{summary} (+{remaining} more)" if summary else f"(+{remaining} more)"
|
|
|
|
|
if len(summary) > 200:
|
|
|
|
|
summary = summary[:197] + "..."
|
|
|
|
|
return summary
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_title_from(tags_list: List[str]) -> Optional[str]:
|
|
|
|
|
"""Extract title from tags list."""
|
|
|
|
|
if extract_title:
|
|
|
|
|
try:
|
|
|
|
|
return extract_title(tags_list)
|
|
|
|
|
except Exception:
|
|
|
|
|
pass
|
|
|
|
|
for t in tags_list:
|
|
|
|
|
if isinstance(t, str) and t.lower().startswith("title:"):
|
|
|
|
|
val = t.split(":", 1)[1].strip()
|
|
|
|
|
if val:
|
|
|
|
|
return val
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _rename_file_if_title_tag(media: Optional[Path], tags_added: List[str]) -> bool:
|
|
|
|
|
"""Rename a local file if title: tag was added.
|
|
|
|
|
|
|
|
|
|
Returns True if file was renamed, False otherwise.
|
|
|
|
|
"""
|
|
|
|
|
if not media or not tags_added:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# Check if any of the added tags is a title: tag
|
|
|
|
|
title_value = None
|
|
|
|
|
for tag in tags_added:
|
|
|
|
|
if isinstance(tag, str):
|
|
|
|
|
lower_tag = tag.lower()
|
|
|
|
|
if lower_tag.startswith("title:"):
|
|
|
|
|
title_value = tag.split(":", 1)[1].strip()
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not title_value:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Get current file path
|
|
|
|
|
file_path = media
|
|
|
|
|
if not file_path.exists():
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# Parse file path
|
|
|
|
|
dir_path = file_path.parent
|
|
|
|
|
old_name = file_path.name
|
|
|
|
|
|
|
|
|
|
# Get file extension
|
|
|
|
|
suffix = file_path.suffix or ''
|
|
|
|
|
|
|
|
|
|
# Sanitize title for use as filename
|
|
|
|
|
import re
|
|
|
|
|
safe_title = re.sub(r'[<>:"/\\|?*]', '', title_value).strip()
|
|
|
|
|
if not safe_title:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
new_name = safe_title + suffix
|
|
|
|
|
new_file_path = dir_path / new_name
|
|
|
|
|
|
|
|
|
|
if new_file_path == file_path:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# Build sidecar paths BEFORE renaming the file
|
|
|
|
|
old_sidecar = Path(str(file_path) + '.tags')
|
|
|
|
|
new_sidecar = Path(str(new_file_path) + '.tags')
|
|
|
|
|
|
|
|
|
|
# Rename file
|
|
|
|
|
try:
|
|
|
|
|
file_path.rename(new_file_path)
|
|
|
|
|
log(f"Renamed file: {old_name} → {new_name}")
|
|
|
|
|
|
|
|
|
|
# Rename .tags sidecar if it exists
|
|
|
|
|
if old_sidecar.exists():
|
|
|
|
|
try:
|
|
|
|
|
old_sidecar.rename(new_sidecar)
|
|
|
|
|
log(f"Renamed sidecar: {old_name}.tags → {new_name}.tags")
|
|
|
|
|
except Exception as e:
|
|
|
|
|
log(f"Failed to rename sidecar: {e}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
return True
|
|
|
|
|
except Exception as e:
|
|
|
|
|
log(f"Failed to rename file: {e}", file=sys.stderr)
|
|
|
|
|
return False
|
|
|
|
|
except Exception as e:
|
|
|
|
|
log(f"Error during file rename: {e}", file=sys.stderr)
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _apply_result_updates_from_tags(result: Any, tag_list: List[str]) -> None:
|
|
|
|
|
"""Update result object with title and tag summary from tags."""
|
|
|
|
|
try:
|
|
|
|
|
new_title = _extract_title_from(tag_list)
|
|
|
|
|
if new_title:
|
|
|
|
|
setattr(result, "title", new_title)
|
|
|
|
|
setattr(result, "tag_summary", _summarize_tags(tag_list))
|
|
|
|
|
except Exception:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _handle_title_rename(old_path: Path, tags_list: List[str]) -> Optional[Path]:
|
|
|
|
|
"""If a title: tag is present, rename the file and its .tags sidecar to match.
|
|
|
|
|
|
|
|
|
|
Returns the new path if renamed, otherwise returns None.
|
|
|
|
|
"""
|
|
|
|
|
# Extract title from tags
|
|
|
|
|
new_title = None
|
|
|
|
|
for tag in tags_list:
|
|
|
|
|
if isinstance(tag, str) and tag.lower().startswith('title:'):
|
|
|
|
|
new_title = tag.split(':', 1)[1].strip()
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
if not new_title or not old_path.exists():
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# Build new filename with same extension
|
|
|
|
|
old_name = old_path.name
|
|
|
|
|
old_suffix = old_path.suffix
|
|
|
|
|
|
|
|
|
|
# Create new filename: title + extension
|
|
|
|
|
new_name = f"{new_title}{old_suffix}"
|
|
|
|
|
new_path = old_path.parent / new_name
|
|
|
|
|
|
|
|
|
|
# Don't rename if already the same name
|
|
|
|
|
if new_path == old_path:
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
# Rename the main file
|
|
|
|
|
if new_path.exists():
|
|
|
|
|
log(f"Warning: Target filename already exists: {new_name}", file=sys.stderr)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
old_path.rename(new_path)
|
|
|
|
|
log(f"Renamed file: {old_name} → {new_name}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
# Rename the .tags sidecar if it exists
|
|
|
|
|
old_tags_path = old_path.parent / (old_name + '.tags')
|
|
|
|
|
if old_tags_path.exists():
|
|
|
|
|
new_tags_path = old_path.parent / (new_name + '.tags')
|
|
|
|
|
if new_tags_path.exists():
|
|
|
|
|
log(f"Warning: Target sidecar already exists: {new_tags_path.name}", file=sys.stderr)
|
|
|
|
|
else:
|
|
|
|
|
old_tags_path.rename(new_tags_path)
|
|
|
|
|
log(f"Renamed sidecar: {old_tags_path.name} → {new_tags_path.name}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
return new_path
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
log(f"Warning: Failed to rename file: {exc}", file=sys.stderr)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _read_sidecar_fallback(p: Path) -> tuple[Optional[str], List[str], List[str]]:
|
|
|
|
|
"""Fallback sidecar reader if metadata module unavailable.
|
|
|
|
|
|
|
|
|
|
Format:
|
|
|
|
|
- Lines with "hash:" prefix: file hash
|
2025-12-11 12:47:30 -08:00
|
|
|
- Lines with "url:" or "url:" prefix: url
|
2025-11-25 20:09:33 -08:00
|
|
|
- Lines with "relationship:" prefix: ignored (internal relationships)
|
|
|
|
|
- Lines with "key:", "namespace:value" format: treated as namespace tags
|
|
|
|
|
- Plain lines without colons: freeform tags
|
|
|
|
|
|
2025-12-11 12:47:30 -08:00
|
|
|
Excluded namespaces (treated as metadata, not tags): hash, url, url, relationship
|
2025-11-25 20:09:33 -08:00
|
|
|
"""
|
|
|
|
|
try:
|
|
|
|
|
raw = p.read_text(encoding="utf-8", errors="ignore")
|
|
|
|
|
except OSError:
|
|
|
|
|
return None, [], []
|
|
|
|
|
t: List[str] = []
|
|
|
|
|
u: List[str] = []
|
|
|
|
|
h: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
# Namespaces to exclude from tags
|
2025-12-11 12:47:30 -08:00
|
|
|
excluded_namespaces = {"hash", "url", "url", "relationship"}
|
2025-11-25 20:09:33 -08:00
|
|
|
|
|
|
|
|
for line in raw.splitlines():
|
|
|
|
|
s = line.strip()
|
|
|
|
|
if not s:
|
|
|
|
|
continue
|
|
|
|
|
low = s.lower()
|
|
|
|
|
|
|
|
|
|
# Check if this is a hash line
|
|
|
|
|
if low.startswith("hash:"):
|
|
|
|
|
h = s.split(":", 1)[1].strip() if ":" in s else h
|
|
|
|
|
# Check if this is a URL line
|
2025-12-11 12:47:30 -08:00
|
|
|
elif low.startswith("url:") or low.startswith("url:"):
|
2025-11-25 20:09:33 -08:00
|
|
|
val = s.split(":", 1)[1].strip() if ":" in s else ""
|
|
|
|
|
if val:
|
|
|
|
|
u.append(val)
|
|
|
|
|
# Check if this is an excluded namespace
|
|
|
|
|
elif ":" in s:
|
|
|
|
|
namespace = s.split(":", 1)[0].strip().lower()
|
|
|
|
|
if namespace not in excluded_namespaces:
|
|
|
|
|
# Include as namespace tag (e.g., "title: The Freemasons")
|
|
|
|
|
t.append(s)
|
|
|
|
|
else:
|
|
|
|
|
# Plain text without colon = freeform tag
|
|
|
|
|
t.append(s)
|
|
|
|
|
|
|
|
|
|
return h, t, u
|
|
|
|
|
|
|
|
|
|
|
2025-12-11 12:47:30 -08:00
|
|
|
def _write_sidecar(p: Path, media: Path, tag_list: List[str], url: List[str], hash_in_sidecar: Optional[str]) -> Path:
    """Persist tags to the sidecar and apply any title-driven rename.

    The shared ``write_sidecar`` helper is tried first.  If it reports
    failure, a minimal sidecar is written directly to ``p``: an optional
    ``hash:`` line, the non-blank tags, then one ``url:`` line per URL.

    Returns:
        The renamed media path when a ``title:`` tag caused a rename,
        otherwise ``media`` unchanged (also when the fallback write fails).
    """
    if write_sidecar(media, tag_list, url, hash_in_sidecar):
        _apply_result_updates_from_tags(None, tag_list)
        # Check if we should rename the file based on title tag
        return _handle_title_rename(media, tag_list) or media

    # Fallback writer
    body: List[str] = [f"hash:{hash_in_sidecar}"] if hash_in_sidecar else []
    body.extend(tag for tag in tag_list if tag and tag.strip())
    body.extend(f"url:{link}" for link in url)

    try:
        p.write_text("\n".join(body) + "\n", encoding="utf-8")
        # Check if we should rename the file based on title tag
        return _handle_title_rename(media, tag_list) or media
    except OSError as exc:
        log(f"Failed to write sidecar: {exc}", file=sys.stderr)
        return media
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _emit_tag_payload(source: str, tags_list: List[str], *, hash_value: Optional[str], extra: Optional[Dict[str, Any]] = None, store_label: Optional[str] = None) -> int:
    """Publish tags to the pipeline as a structured payload.

    The payload holds ``source``, ``tags`` and ``count``, plus ``hash`` when
    ``hash_value`` is truthy and every non-None entry of ``extra``.  It is
    stored under ``store_label`` when given, or under "tags" when a stage
    context exists.  With a custom label and a stage context it is stored
    under "tags" as well.

    Inside a pipeline stage one ``TagItem`` per tag is emitted so tags can be
    selected by index; otherwise the payload dict itself is emitted.

    Returns:
        Always 0.
    """
    payload: Dict[str, Any] = {
        "source": source,
        "tags": list(tags_list),
        "count": len(tags_list),
    }
    if hash_value:
        payload["hash"] = hash_value
    if extra:
        payload.update({key: value for key, value in extra.items() if value is not None})

    if store_label:
        label = store_label
    elif ctx.get_stage_context() is not None:
        label = "tags"
    else:
        label = None

    if label:
        ctx.store_value(label, payload)
        if ctx.get_stage_context() is not None and label.lower() != "tags":
            ctx.store_value("tags", payload)

    if ctx.get_stage_context() is None:
        # When not in pipeline, just emit the payload
        ctx.emit(payload)
        return 0

    # When in pipeline, emit individual TagItem objects so they can be
    # selected by bare index.
    for position, name in enumerate(tags_list, start=1):
        ctx.emit(
            TagItem(
                tag_name=name,
                tag_index=position,
                hash_hex=hash_value,
                source=source,
                service_name=None,
            )
        )
    return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_scrapable_identifiers(tags_list: List[str]) -> Dict[str, str]:
|
|
|
|
|
"""Extract scrapable identifiers from tags."""
|
|
|
|
|
identifiers = {}
|
2025-12-06 00:10:19 -08:00
|
|
|
scrapable_prefixes = {
|
|
|
|
|
'openlibrary', 'isbn', 'isbn_10', 'isbn_13',
|
|
|
|
|
'musicbrainz', 'musicbrainzalbum', 'imdb', 'tmdb', 'tvdb'
|
|
|
|
|
}
|
2025-11-25 20:09:33 -08:00
|
|
|
|
|
|
|
|
for tag in tags_list:
|
|
|
|
|
if not isinstance(tag, str) or ':' not in tag:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
parts = tag.split(':', 1)
|
|
|
|
|
if len(parts) != 2:
|
|
|
|
|
continue
|
|
|
|
|
|
2025-12-06 00:10:19 -08:00
|
|
|
key_raw = parts[0].strip().lower()
|
|
|
|
|
key = key_raw.replace('-', '_')
|
|
|
|
|
if key == 'isbn10':
|
|
|
|
|
key = 'isbn_10'
|
|
|
|
|
elif key == 'isbn13':
|
|
|
|
|
key = 'isbn_13'
|
2025-11-25 20:09:33 -08:00
|
|
|
value = parts[1].strip()
|
|
|
|
|
|
2025-12-06 00:10:19 -08:00
|
|
|
# Normalize ISBN values by removing hyphens for API friendliness
|
|
|
|
|
if key.startswith('isbn'):
|
|
|
|
|
value = value.replace('-', '')
|
|
|
|
|
|
2025-11-25 20:09:33 -08:00
|
|
|
if key in scrapable_prefixes and value:
|
|
|
|
|
identifiers[key] = value
|
|
|
|
|
|
|
|
|
|
return identifiers
|
|
|
|
|
|
|
|
|
|
|
2025-12-07 00:21:30 -08:00
|
|
|
def _extract_tag_value(tags_list: List[str], namespace: str) -> Optional[str]:
|
|
|
|
|
"""Get first tag value for a namespace (e.g., artist:, title:)."""
|
|
|
|
|
ns = namespace.lower()
|
|
|
|
|
for tag in tags_list:
|
|
|
|
|
if not isinstance(tag, str) or ':' not in tag:
|
|
|
|
|
continue
|
|
|
|
|
prefix, _, value = tag.partition(':')
|
|
|
|
|
if prefix.strip().lower() != ns:
|
|
|
|
|
continue
|
|
|
|
|
candidate = value.strip()
|
|
|
|
|
if candidate:
|
|
|
|
|
return candidate
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2025-11-25 20:09:33 -08:00
|
|
|
def _playlist_item_from_entry(entry: Dict[str, Any], idx: int) -> Dict[str, Any]:
    """Build one playlist row from a yt-dlp entry dict at 1-based position ``idx``."""
    return {
        'index': idx,
        'id': entry.get('id', f'track_{idx}'),
        'title': entry.get('title', entry.get('id', f'Track {idx}')),
        'duration': entry.get('duration', 0),
        'url': entry.get('url') or entry.get('webpage_url', ''),
    }


def _scrape_url_metadata(url: str) -> Tuple[Optional[str], List[str], List[Tuple[str, str]], List[Dict[str, Any]]]:
    """Scrape metadata from a URL using yt-dlp.

    Runs ``yt-dlp -j`` (30 second timeout) and parses the first line of its
    output as JSON.  ``--`` is placed before the URL so a value beginning
    with ``-`` cannot be interpreted by yt-dlp as a command-line option.

    Returns:
        (title, tags, formats, playlist_items) tuple where:
        - title: Video/content title ("Unknown" if the JSON has none)
        - tags: List of extracted tags (both namespaced and freeform)
        - formats: List of (display_label, format_id) tuples
        - playlist_items: List of playlist entry dicts (empty if not a playlist)

        ``(None, [], [], [])`` is returned when yt-dlp fails, times out,
        prints nothing, or prints unparsable JSON; the reason is logged to
        stderr.
    """
    try:
        import json as json_module

        try:
            from metadata import extract_ytdlp_tags
        except ImportError:
            extract_ytdlp_tags = None

        # Build yt-dlp command with playlist support
        # IMPORTANT: Do NOT use --flat-playlist! It strips metadata like artist, album, uploader, genre
        # Without it, yt-dlp gives us full metadata in an 'entries' array within a single JSON object
        # This ensures we get album-level metadata from sources like BandCamp, YouTube Music, etc.
        cmd = [
            "yt-dlp",
            "-j",  # Output JSON
            "--no-warnings",
            "--playlist-items", "1-10",  # Get first 10 items if it's a playlist (provides entries)
            "-f", "best",
            "--",  # End of options: the URL is always treated as a positional argument
            url,
        ]

        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        if result.returncode != 0:
            log(f"yt-dlp error: {result.stderr}", file=sys.stderr)
            return None, [], [], []

        # Only the first output line is parsed as the main JSON object.
        lines = result.stdout.strip().split('\n')
        if not lines or not lines[0]:
            log("yt-dlp returned empty output", file=sys.stderr)
            return None, [], [], []

        try:
            data = json_module.loads(lines[0])
        except json_module.JSONDecodeError as e:
            log(f"Failed to parse yt-dlp JSON: {e}", file=sys.stderr)
            return None, [], [], []

        # Extract title - use the main title
        title = data.get('title', 'Unknown')

        tags: List[str] = []
        playlist_items: List[Dict[str, Any]] = []

        # IMPORTANT: Extract album/playlist-level tags FIRST (before processing entries)
        # This ensures we get metadata about the collection, not just individual tracks
        if extract_ytdlp_tags:
            tags.extend(extract_ytdlp_tags(data))

        # Single-value namespaces that should not be duplicated from entries
        single_value_namespaces = {
            'title', 'artist', 'album', 'creator', 'channel',
            'release_date', 'upload_date', 'license', 'location',
        }

        # Case 1: Entries are nested in the main object (standard playlist structure)
        if 'entries' in data and isinstance(data.get('entries'), list):
            for idx, entry in enumerate(data['entries'], 1):
                if not isinstance(entry, dict):
                    continue
                playlist_items.append(_playlist_item_from_entry(entry, idx))

                if not extract_ytdlp_tags:
                    continue

                # Merge entry tags, but keep the album-level value for
                # single-value namespaces and avoid exact duplicates.
                for tag in extract_ytdlp_tags(entry):
                    tag_namespace = tag.split(':', 1)[0].lower() if ':' in tag else None
                    if tag_namespace and tag_namespace in single_value_namespaces:
                        already_has_namespace = any(
                            t.split(':', 1)[0].lower() == tag_namespace
                            for t in tags if ':' in t
                        )
                        if already_has_namespace:
                            continue  # Skip this tag, keep the album-level one
                    if tag not in tags:
                        tags.append(tag)

        # Case 2: Playlist detected by playlist_count field (BandCamp albums, etc.)
        # These need a separate call with --flat-playlist to get the actual entries
        elif (data.get('playlist_count') or 0) > 0 and 'entries' not in data:
            try:
                flat_cmd = [
                    "yt-dlp",
                    "-j",
                    "--no-warnings",
                    "--flat-playlist",
                    "-f", "best",
                    "--",
                    url,
                ]
                flat_result = subprocess.run(flat_cmd, capture_output=True, text=True, timeout=30)
                if flat_result.returncode == 0:
                    # With --flat-playlist, each line is a separate track JSON object
                    # (not nested in a playlist container), so process ALL lines
                    for idx, line in enumerate(flat_result.stdout.strip().split('\n'), 1):
                        if not line.strip().startswith('{'):
                            continue
                        try:
                            entry = json_module.loads(line)
                            playlist_items.append(_playlist_item_from_entry(entry, idx))
                        except json_module.JSONDecodeError:
                            pass
            except Exception:
                # Best effort: the track listing is optional, so a failure
                # here must not discard the metadata already collected.
                pass

        # Fallback: if still no tags detected, get from first item
        if not tags and extract_ytdlp_tags:
            tags = extract_ytdlp_tags(data)

        # Extract formats from the main data object
        formats = []
        if 'formats' in data:
            formats = _extract_url_formats(data.get('formats', []))

        # Deduplicate tags by namespace to prevent duplicate title:, artist:, etc.
        try:
            from metadata import dedup_tags_by_namespace as _dedup
            if _dedup:
                tags = _dedup(tags, keep_first=True)
        except Exception:
            pass  # If dedup fails, return tags as-is

        return title, tags, formats, playlist_items

    except subprocess.TimeoutExpired:
        log("yt-dlp timeout (>30s)", file=sys.stderr)
        return None, [], [], []
    except Exception as e:
        log(f"URL scraping error: {e}", file=sys.stderr)
        return None, [], [], []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
|
|
|
|
"""Extract best formats from yt-dlp formats list.
|
|
|
|
|
|
|
|
|
|
Returns list of (display_label, format_id) tuples.
|
|
|
|
|
"""
|
|
|
|
|
try:
|
|
|
|
|
video_formats = {} # {resolution: format_data}
|
|
|
|
|
audio_formats = {} # {quality_label: format_data}
|
|
|
|
|
|
|
|
|
|
for fmt in formats:
|
|
|
|
|
vcodec = fmt.get('vcodec', 'none')
|
|
|
|
|
acodec = fmt.get('acodec', 'none')
|
|
|
|
|
height = fmt.get('height')
|
|
|
|
|
ext = fmt.get('ext', 'unknown')
|
|
|
|
|
format_id = fmt.get('format_id', '')
|
|
|
|
|
tbr = fmt.get('tbr', 0)
|
|
|
|
|
abr = fmt.get('abr', 0)
|
|
|
|
|
|
|
|
|
|
# Video format
|
|
|
|
|
if vcodec and vcodec != 'none' and height:
|
|
|
|
|
if height < 480:
|
|
|
|
|
continue
|
|
|
|
|
res_key = f"{height}p"
|
|
|
|
|
if res_key not in video_formats or tbr > video_formats[res_key].get('tbr', 0):
|
|
|
|
|
video_formats[res_key] = {
|
|
|
|
|
'label': f"{height}p ({ext})",
|
|
|
|
|
'format_id': format_id,
|
|
|
|
|
'tbr': tbr,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Audio-only format
|
|
|
|
|
elif acodec and acodec != 'none' and (not vcodec or vcodec == 'none'):
|
|
|
|
|
audio_key = f"audio_{abr}"
|
|
|
|
|
if audio_key not in audio_formats or abr > audio_formats[audio_key].get('abr', 0):
|
|
|
|
|
audio_formats[audio_key] = {
|
|
|
|
|
'label': f"audio ({ext})",
|
|
|
|
|
'format_id': format_id,
|
|
|
|
|
'abr': abr,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
result = []
|
|
|
|
|
|
|
|
|
|
# Add video formats in descending resolution order
|
|
|
|
|
for res in sorted(video_formats.keys(), key=lambda x: int(x.replace('p', '')), reverse=True):
|
|
|
|
|
fmt = video_formats[res]
|
|
|
|
|
result.append((fmt['label'], fmt['format_id']))
|
|
|
|
|
|
|
|
|
|
# Add best audio format
|
|
|
|
|
if audio_formats:
|
|
|
|
|
best_audio = max(audio_formats.values(), key=lambda x: x.get('abr', 0))
|
|
|
|
|
result.append((best_audio['label'], best_audio['format_id']))
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
log(f"Error extracting formats: {e}", file=sys.stderr)
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _isbn_book_tags(book_data: Dict[str, Any]) -> List[str]:
    """Convert one Open Library book record into ``namespace:value`` tags."""
    tags: List[str] = []

    if 'title' in book_data:
        tags.append(f"title:{book_data['title']}")

    authors = book_data.get('authors')
    if isinstance(authors, list):
        for author in authors[:3]:
            if isinstance(author, dict) and 'name' in author:
                tags.append(f"author:{author['name']}")
            elif isinstance(author, str):
                tags.append(f"author:{author}")

    if 'publish_date' in book_data:
        tags.append(f"publish_date:{book_data['publish_date']}")

    publishers = book_data.get('publishers')
    if isinstance(publishers, list):
        # Only the first publisher is recorded.
        for pub in publishers[:1]:
            if isinstance(pub, dict) and 'name' in pub:
                tags.append(f"publisher:{pub['name']}")
            elif isinstance(pub, str):
                tags.append(f"publisher:{pub}")

    if 'description' in book_data:
        desc = book_data['description']
        if isinstance(desc, dict) and 'value' in desc:
            desc = desc['value']
        if desc:
            desc_str = str(desc).strip()
            # Limit to 200 chars to keep the tag manageable.
            if desc_str:
                tags.append(f"description:{desc_str[:200]}")

    page_count = book_data.get('number_of_pages')
    if page_count and isinstance(page_count, int) and page_count > 0:
        tags.append(f"pages:{page_count}")

    identifiers = book_data.get('identifiers')
    if isinstance(identifiers, dict):
        # Each identifier is either a list (first element used) or a string.
        for key in ('openlibrary', 'lccn', 'oclc', 'goodreads',
                    'librarything', 'doi', 'internet_archive'):
            value = identifiers.get(key)
            if isinstance(value, list):
                value = value[0] if value else None
            elif not isinstance(value, str):
                value = None
            if value not in (None, ''):
                tags.append(f"{key}:{value}")

    return tags


def _scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata for an ISBN using Open Library API.

    Queries ``https://openlibrary.org/api/books`` for the hyphen-stripped
    ISBN and converts the first returned record into tags: title, up to
    three authors, publish date, first publisher, description (first 200
    characters), page count and known external identifiers.

    Returns:
        The list of tags, or an empty list when nothing was found or any
        step failed (failures are logged to stderr).
    """
    try:
        import json as json_module
        from urllib.parse import quote

        # The rest of this file imports ``helper`` absolutely; fall back to
        # that when the relative form cannot be resolved from this package.
        try:
            from ..helper.http_client import HTTPClient
        except ImportError:
            from helper.http_client import HTTPClient

        isbn_clean = isbn.replace('-', '').strip()
        # Quote the value so stray characters cannot alter the query string.
        url = (
            "https://openlibrary.org/api/books"
            f"?bibkeys=ISBN:{quote(isbn_clean, safe='')}&jscmd=data&format=json"
        )

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        # The response maps "ISBN:<n>" to the record; use the first one.
        book_data = next(iter(data.values()), None)
        if not book_data or not isinstance(book_data, dict):
            return []

        new_tags = _isbn_book_tags(book_data)
        log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags
    except Exception as e:
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Fetch an OpenLibrary edition record and turn it into a list of tags.

    The record is requested from https://openlibrary.org/books/{OLID}.json.
    Tags are appended in this order, each only when the field is present:

    - ``title:``, up to three ``author:`` tags, ``publish_date:`` and the
      first ``publisher:``
    - ``description:`` (stripped, truncated to 200 characters)
    - ``pages:`` (positive integer page counts only)
    - up to ten subjects as freeform tags (no namespace prefix)
    - ``isbn_10:``, ``isbn_13:``, ``lccn:``, ``oclc:`` and ``goodreads:``,
      read from the record's ``identifiers`` mapping

    Args:
        olid: OpenLibrary ID such as ``OL9674499M`` or just ``9674499M``.

    Returns:
        The collected tags. An empty list is returned when the request fails,
        when the record is empty, or when any other error occurs; failures
        are logged to stderr.
    """
    new_tags = []

    def _namespaced(entries, namespace):
        # Entries may be {"name": ...} dicts or plain strings; anything else
        # is ignored.
        for entry in entries:
            if isinstance(entry, dict) and 'name' in entry:
                yield f"{namespace}:{entry['name']}"
            elif isinstance(entry, str):
                yield f"{namespace}:{entry}"

    # (key inside the "identifiers" mapping, tag namespace), in output order.
    identifier_namespaces = (
        ('isbn_10', 'isbn_10'),
        ('isbn_13', 'isbn_13'),
        ('lccn', 'lccn'),
        ('oclc_numbers', 'oclc'),
        ('goodreads', 'goodreads'),
    )

    try:
        # Imported inside the try so an import failure is reported through the
        # generic error handler below instead of propagating.
        from ..helper.http_client import HTTPClient
        import json as json_module

        # Accept both "OL9674499M" and "9674499M": strip the wrapper and keep
        # the stripped form only if what remains is purely numeric.
        bare_id = olid.replace('OL', '').replace('M', '')
        if not bare_id.isdigit():
            bare_id = olid

        # The URL always needs the full OL...M form.
        if olid.startswith('OL'):
            url = f"https://openlibrary.org/books/{olid}.json"
        else:
            url = f"https://openlibrary.org/books/OL{bare_id}M.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No OpenLibrary metadata found for: {olid}")
            return []

        # Title
        if 'title' in data:
            new_tags.append(f"title:{data['title']}")

        # Authors (first three)
        authors = data['authors'] if 'authors' in data else None
        if isinstance(authors, list):
            new_tags.extend(_namespaced(authors[:3], "author"))

        # Publish date
        if 'publish_date' in data:
            new_tags.append(f"publish_date:{data['publish_date']}")

        # Publishers (first one only)
        publishers = data['publishers'] if 'publishers' in data else None
        if isinstance(publishers, list):
            new_tags.extend(_namespaced(publishers[:1], "publisher"))

        # Description: either a plain value or a {"value": ...} wrapper.
        if 'description' in data:
            description = data['description']
            if isinstance(description, dict) and 'value' in description:
                description = description['value']
            description_text = str(description).strip() if description else ""
            if description_text:
                new_tags.append(f"description:{description_text[:200]}")

        # Number of pages
        if 'number_of_pages' in data:
            pages = data['number_of_pages']
            if isinstance(pages, int) and pages > 0:
                new_tags.append(f"pages:{pages}")

        # Subjects become FREEFORM tags (no namespace prefix), skipping
        # anything already collected.
        subjects = data['subjects'] if 'subjects' in data else None
        if isinstance(subjects, list):
            for subject in subjects[:10]:
                if not (subject and isinstance(subject, str)):
                    continue
                freeform = subject.strip()
                if freeform and freeform not in new_tags:
                    new_tags.append(freeform)

        # Identifiers: take the first element of a non-empty list, or the
        # value itself when it is a string.
        identifiers = data['identifiers'] if 'identifiers' in data else None
        if isinstance(identifiers, dict):
            for key, namespace in identifier_namespaces:
                if key not in identifiers:
                    continue
                value = identifiers[key]
                if isinstance(value, list) and value:
                    new_tags.append(f"{namespace}:{value[0]}")
                elif isinstance(value, str):
                    new_tags.append(f"{namespace}:{value}")

        log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags
    except Exception as e:
        log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _perform_scraping(tags_list: List[str]) -> List[str]:
    """Scrape additional tags using identifiers already present in the tags.

    Priority order:
    1. openlibrary: (preferred - more complete metadata)
    2. isbn_13, then isbn_10, then isbn - consulted only when no openlibrary
       identifier key was extracted

    Args:
        tags_list: Existing tags; identifiers are extracted from these.

    Returns:
        Scraped tags that are not already in ``tags_list``, compared
        case-insensitively, with duplicates removed and order preserved.
        Empty when no scrapable identifier is found.
    """
    identifiers = _extract_scrapable_identifiers(tags_list)
    if not identifiers:
        log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
        return []

    log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")

    scraped = []

    # Prefer OpenLibrary over ISBN (more complete metadata)
    if 'openlibrary' in identifiers:
        olid = identifiers['openlibrary']
        if olid:
            log(f"Scraping OpenLibrary: {olid}")
            scraped.extend(_scrape_openlibrary_metadata(olid))
    elif any(key in identifiers for key in ('isbn_13', 'isbn_10', 'isbn')):
        isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
        if isbn:
            log(f"Scraping ISBN: {isbn}")
            scraped.extend(_scrape_isbn_metadata(isbn))

    # One lower-cased set tracks both the caller's tags and the tags accepted
    # so far, so a scraped tag is kept only the first time it is seen.
    known_lower = {tag.lower() for tag in tags_list}
    scraped_unique = []
    for tag in scraped:
        folded = tag.lower()
        if folded in known_lower:
            continue
        known_lower.add(folded)
        scraped_unique.append(tag)

    if scraped_unique:
        log(f"Added {len(scraped_unique)} new tag(s) from scraping")

    return scraped_unique
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Get tags for a result from its storage backend, or scrape metadata.

    Usage:
        get-tag [-hash <sha256>] [--store <key>] [--emit]
        get-tag -scrape <url|provider>

    Options:
        -hash <sha256>: Override hash to use instead of result's hash_hex
        --store <key>: Store result to this key for pipeline
        --emit: Emit result without interactive prompt (quiet mode)
        -scrape <url|provider>: Scrape metadata from URL or provider name (itunes, openlibrary, googlebooks)

    Args:
        result: The selected/piped item (dict or object); a list is reduced to
            its first element before tags are looked up.
        args: Raw command-line tokens for the cmdlet.
        config: Application configuration passed through to providers and
            storage backends.

    Returns:
        0 on success, 1 on any reported error.
    """
    args_list = [str(arg) for arg in (args or [])]
    raw_args = list(args_list)

    # Support numeric selection tokens (e.g., "@1" leading to argument "1") without treating
    # them as hash overrides. This lets users pick from the most recent table overlay/results.
    if len(args_list) == 1:
        token = args_list[0]
        if not token.startswith("-") and token.isdigit():
            try:
                idx = int(token) - 1
                items_pool = ctx.get_last_result_items()
                if 0 <= idx < len(items_pool):
                    result = items_pool[idx]
                    args_list = []
                    debug(f"[get_tag] Resolved numeric selection arg {token} -> last_result_items[{idx}]")
                else:
                    debug(f"[get_tag] Numeric selection arg {token} out of range (items={len(items_pool)})")
            except Exception as exc:
                debug(f"[get_tag] Failed to resolve numeric selection arg {token}: {exc}")

    # Helper to get field from both dict and object.
    # NOTE: this shadows the get_field imported from ._shared for the rest of
    # this function.
    def get_field(obj: Any, field: str, default: Any = None) -> Any:
        if isinstance(obj, dict):
            return obj.get(field, default)
        else:
            return getattr(obj, field, default)

    # Parse arguments using shared parser
    parsed_args = parse_cmdlet_args(args_list, CMDLET)

    # Detect if -scrape flag was provided without a value (parse_cmdlet_args skips missing values)
    scrape_flag_present = any(str(arg).lower() in {"-scrape", "--scrape"} for arg in args_list)

    # Extract values
    hash_override_raw = parsed_args.get("hash")
    hash_override = normalize_hash(hash_override_raw)
    store_key = parsed_args.get("store")
    emit_requested = parsed_args.get("emit", False)
    scrape_url = parsed_args.get("scrape")
    scrape_requested = scrape_flag_present or scrape_url is not None

    # An invalid hash is fatal only when the user typed -hash explicitly;
    # otherwise the override is silently dropped.
    explicit_hash_flag = any(str(arg).lower() in {"-hash", "--hash"} for arg in raw_args)
    if hash_override_raw is not None:
        if not hash_override or not looks_like_hash(hash_override):
            debug(f"[get_tag] Ignoring invalid hash override '{hash_override_raw}' (explicit_flag={explicit_hash_flag})")
            if explicit_hash_flag:
                log("Invalid hash format: expected 64 hex characters", file=sys.stderr)
                return 1
            hash_override = None

    # -scrape with a missing or blank value is an error. This is the only
    # place that case needs handling: past this point scrape_requested implies
    # a non-blank scrape_url.
    if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
        log("-scrape requires a URL or provider name", file=sys.stderr)
        return 1

    # Handle URL or provider scraping mode
    if scrape_requested and scrape_url:
        import json as json_module

        if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
            # URL scraping (existing behavior): print the result as JSON.
            title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
            if not tags:
                log("No tags extracted from URL", file=sys.stderr)
                return 1
            output = {
                "title": title,
                "tags": tags,
                "formats": [(label, fmt_id) for label, fmt_id in formats],
                "playlist_items": playlist_items,
            }
            print(json_module.dumps(output, ensure_ascii=False))
            return 0

        # Provider scraping (e.g., itunes)
        provider = get_metadata_provider(scrape_url, config)
        if provider is None:
            log(f"Unknown metadata provider: {scrape_url}", file=sys.stderr)
            return 1

        # Prefer identifier tags (ISBN/OLID/etc.) when available; fallback to title/filename
        identifier_tags: List[str] = []
        result_tags = get_field(result, "tags", None)
        if isinstance(result_tags, list):
            identifier_tags = [str(t) for t in result_tags if isinstance(t, (str, bytes))]

        # Try local sidecar if no tags present on result
        if not identifier_tags:
            file_path = get_field(result, "target", None) or get_field(result, "path", None) or get_field(result, "filename", None)
            if isinstance(file_path, str) and file_path and not file_path.lower().startswith(("http://", "https://")):
                try:
                    media_path = Path(str(file_path))
                    if media_path.exists():
                        tags_from_sidecar = read_sidecar(media_path)
                        if isinstance(tags_from_sidecar, list):
                            identifier_tags = [str(t) for t in tags_from_sidecar if isinstance(t, (str, bytes))]
                except Exception:
                    # Best effort: an unreadable sidecar just means no
                    # identifier tags are available.
                    pass

        title_from_tags = _extract_tag_value(identifier_tags, "title")
        artist_from_tags = _extract_tag_value(identifier_tags, "artist")

        identifiers = _extract_scrapable_identifiers(identifier_tags)
        identifier_query: Optional[str] = None
        if identifiers:
            if provider.name in {"openlibrary", "googlebooks", "google"}:
                identifier_query = identifiers.get("isbn_13") or identifiers.get("isbn_10") or identifiers.get("isbn") or identifiers.get("openlibrary")
            elif provider.name == "itunes":
                identifier_query = identifiers.get("musicbrainz") or identifiers.get("musicbrainzalbum")

        # Determine query from identifier first, else title on the result or filename
        title_hint = title_from_tags or get_field(result, "title", None) or get_field(result, "name", None)
        if not title_hint:
            file_path = get_field(result, "path", None) or get_field(result, "filename", None)
            if file_path:
                title_hint = Path(str(file_path)).stem
        artist_hint = artist_from_tags or get_field(result, "artist", None) or get_field(result, "uploader", None)
        if not artist_hint:
            meta_field = get_field(result, "metadata", None)
            if isinstance(meta_field, dict):
                meta_artist = meta_field.get("artist") or meta_field.get("uploader")
                if meta_artist:
                    artist_hint = str(meta_artist)

        # For music providers, combine title and artist when no identifier
        # is available.
        combined_query: Optional[str] = None
        if not identifier_query and title_hint and artist_hint and provider.name in {"itunes", "musicbrainz"}:
            if provider.name == "musicbrainz":
                combined_query = f'recording:"{title_hint}" AND artist:"{artist_hint}"'
            else:
                combined_query = f"{title_hint} {artist_hint}"

        query_hint = identifier_query or combined_query or title_hint
        if not query_hint:
            log("No title or identifier available to search for metadata", file=sys.stderr)
            return 1

        if identifier_query:
            log(f"Using identifier for metadata search: {identifier_query}")
        elif combined_query:
            log(f"Using title+artist for metadata search: {title_hint} - {artist_hint}")
        else:
            log(f"Using title for metadata search: {query_hint}")

        items = provider.search(query_hint, limit=10)
        if not items:
            log("No metadata results found", file=sys.stderr)
            return 1

        from result_table import ResultTable
        table = ResultTable(f"Metadata: {provider.name}")
        table.set_source_command("get-tag", [])
        selection_payload = []
        hash_for_payload = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash_hex", None))
        for idx, item in enumerate(items):
            tags = provider.to_tags(item)
            row = table.add_row()
            row.add_column("Title", item.get("title", ""))
            row.add_column("Artist", item.get("artist", ""))
            row.add_column("Album", item.get("album", ""))
            row.add_column("Year", item.get("year", ""))
            payload = {
                "tags": tags,
                "provider": provider.name,
                "title": item.get("title"),
                "artist": item.get("artist"),
                "album": item.get("album"),
                "year": item.get("year"),
                "extra": {
                    "tags": tags,
                    "provider": provider.name,
                    "hydrus_hash": hash_for_payload,
                    "storage_source": get_field(result, "source", None) or get_field(result, "origin", None),
                },
                "file_hash": hash_for_payload,
            }
            selection_payload.append(payload)
            # Selecting row N re-invokes get-tag with the 1-based index,
            # which the numeric-selection branch at the top resolves.
            table.set_row_selection_args(idx, [str(idx + 1)])

        ctx.set_last_result_table_overlay(table, selection_payload)
        ctx.set_current_stage_table(table)
        # Preserve items for @ selection and downstream pipes without emitting duplicates
        ctx.set_last_result_items_only(selection_payload)
        print(table)
        return 0

    # Handle @N selection which creates a list - extract the first item
    if isinstance(result, list) and len(result) > 0:
        result = result[0]

    hash_from_result = normalize_hash(get_field(result, "hash_hex", None))
    hash_hex = hash_override or hash_from_result
    # Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
    # This allows interactive REPL to work even in pipelines
    emit_mode = emit_requested or bool(store_key)
    store_label = (store_key.strip() if store_key and store_key.strip() else None)

    # Get hash and store from result
    file_hash = hash_hex
    storage_source = get_field(result, "store") or get_field(result, "storage") or get_field(result, "origin")

    if not file_hash:
        log("No hash available in result", file=sys.stderr)
        return 1

    if not storage_source:
        log("No storage backend specified in result", file=sys.stderr)
        return 1

    # Get tags using storage backend
    try:
        from helper.store import FileStorage
        storage = FileStorage(config)
        backend = storage[storage_source]
        current, source = backend.get_tag(file_hash, config=config)

        if not current:
            log("No tags found", file=sys.stderr)
            return 1

        service_name = ""
    except KeyError:
        log(f"Storage backend '{storage_source}' not found", file=sys.stderr)
        return 1
    except Exception as exc:
        log(f"Failed to get tags: {exc}", file=sys.stderr)
        return 1

    # Always output to ResultTable (pipeline mode only)
    # Extract title for table header
    item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

    # Resolve a local file path for the subject. Nothing earlier in this
    # function binds local_path, so it is derived here from the same keys the
    # subject payload exposes; URLs are not treated as local paths.
    local_path: Optional[str] = None
    for path_field in ("file_path", "path", "target"):
        candidate = get_field(result, path_field, None)
        if not candidate or not isinstance(candidate, (str, Path)):
            continue
        candidate_text = str(candidate)
        if not candidate_text.lower().startswith(("http://", "https://")):
            local_path = candidate_text
            break

    # Build a subject payload representing the file whose tags are being shown
    subject_origin = get_field(result, "origin", None) or get_field(result, "source", None) or source
    subject_payload: Dict[str, Any] = {
        "tags": list(current),
        "title": item_title,
        "name": item_title,
        "origin": subject_origin,
        "source": subject_origin,
        "storage_source": subject_origin,
        "service_name": service_name,
        "extra": {
            "tags": list(current),
            "storage_source": subject_origin,
            "hydrus_hash": hash_hex,
        },
    }
    if hash_hex:
        subject_payload.update({
            "hash": hash_hex,
            "hash_hex": hash_hex,
            "file_hash": hash_hex,
            "hydrus_hash": hash_hex,
        })
    if local_path:
        subject_payload.update({
            "file_path": local_path,
            "path": local_path,
            "target": local_path,
        })
        subject_payload["extra"]["file_path"] = local_path

    if source == "hydrus":
        _emit_tags_as_table(current, hash_hex=hash_hex, source="hydrus", service_name=service_name, config=config, item_title=item_title, subject=subject_payload)
    else:
        _emit_tags_as_table(current, hash_hex=hash_hex, source="local", service_name=None, config=config, item_title=item_title, file_path=str(local_path) if local_path else None, subject=subject_payload)

    # If emit requested or store key provided, emit payload
    if emit_mode:
        _emit_tag_payload(source, current, hash_value=hash_hex, store_label=store_label)

    return 0
|
|
|
|
|
|
|
|
|
|
|
2025-12-07 00:21:30 -08:00
|
|
|
_SCRAPE_CHOICES = []
|
|
|
|
|
try:
|
|
|
|
|
_SCRAPE_CHOICES = sorted(list_metadata_providers().keys())
|
|
|
|
|
except Exception:
|
|
|
|
|
_SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]
|
|
|
|
|
|
|
|
|
|
|
2025-12-11 12:47:30 -08:00
|
|
|
class Get_Tag(Cmdlet):
    """Self-registering, class-based implementation of the get-tag cmdlet."""

    def __init__(self) -> None:
        """Declare the cmdlet's name, arguments and help text, then register it."""
        store_arg = CmdletArg(
            name="-store",
            type="string",
            description="Store result to this key for pipeline",
            alias="store"
        )
        emit_arg = CmdletArg(
            name="-emit",
            type="flag",
            description="Emit result without interactive prompt (quiet mode)",
            alias="emit-only"
        )
        scrape_arg = CmdletArg(
            name="-scrape",
            type="string",
            description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
            required=False,
            choices=_SCRAPE_CHOICES,
        )
        help_lines = [
            "- Retrieves tags for a file from:",
            " Hydrus: Using file hash if available",
            " Local: From sidecar files or local library database",
            "- Options:",
            " -hash: Override hash to look up in Hydrus",
            " -store: Store result to key for downstream pipeline",
            " -emit: Quiet mode (no interactive selection)",
            " -scrape: Scrape metadata from URL or metadata provider",
        ]
        super().__init__(
            name="get-tag",
            summary="Get tags from Hydrus or local sidecar metadata",
            usage="get-tag [-hash <sha256>] [--store <key>] [--emit] [-scrape <url|provider>]",
            alias=["tags"],
            arg=[SharedArgs.HASH, store_arg, emit_arg, scrape_arg],
            detail=help_lines,
            exec=self.run,
        )
        self.register()

    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        """Look up the tags of ``result`` in its storage backend and show them.

        Args:
            result: The selected/piped item whose tags are requested.
            args: Raw command-line tokens for the cmdlet.
            config: Application configuration, passed to the storage layer.

        Returns:
            0 when tags were found and emitted as a table, 1 otherwise.
        """
        parsed = parse_cmdlet_args(args, self)

        def _first_truthy(initial: Any, *fields: str) -> Any:
            # Return ``initial`` if truthy, otherwise the first truthy field
            # of ``result``; fields are only read until a value is found.
            value = initial
            for field in fields:
                if value:
                    break
                value = get_field(result, field)
            return value

        # Parsed arguments win over values carried by the result.
        file_hash = _first_truthy(parsed.get("hash"), "hash", "file_hash", "hash_hex")
        storage_source = _first_truthy(parsed.get("store"), "store", "storage", "origin")

        if not file_hash:
            log("No hash available in result", file=sys.stderr)
            return 1

        if not storage_source:
            log("No storage backend specified in result", file=sys.stderr)
            return 1

        # Everything from backend lookup through table emission shares one
        # error boundary.
        try:
            from helper.store import FileStorage

            backend = FileStorage(config)[storage_source]
            current, source = backend.get_tag(file_hash, config=config)

            if not current:
                log("No tags found", file=sys.stderr)
                return 1

            # Fall back to a hash prefix when the result has no title.
            table_title = get_field(result, "title") or file_hash[:16]
            _emit_tags_as_table(
                tags_list=current,
                hash_hex=file_hash,
                source=source,
                service_name="",
                config=config,
                item_title=table_title,
                file_path=None,
                subject=result,
            )
            return 0
        except KeyError:
            log(f"Storage backend '{storage_source}' not found", file=sys.stderr)
            return 1
        except Exception as exc:
            log(f"Failed to get tags: {exc}", file=sys.stderr)
            import traceback
            traceback.print_exc(file=sys.stderr)
            return 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Create the cmdlet instance at import time. Get_Tag.__init__ calls
# self.register(), so constructing it is what registers the cmdlet; the
# module-level _run() also passes CMDLET to parse_cmdlet_args.
CMDLET = Get_Tag()
|
2025-11-25 20:09:33 -08:00
|
|
|
|
|
|
|
|
|