Medios-Macina/cmdlets/get_tag.py

"""Get tags from Hydrus or local sidecar metadata.

This cmdlet retrieves tags for a selected result, supporting both:
- Hydrus Network (for files with hash_hex)
- Local sidecar files (.tags)

In interactive mode: navigate with numbers, add/delete tags
In pipeline mode: display tags as read-only table, emit as structured JSON
"""

from __future__ import annotations

import sys

from helper.logger import log
from helper.metadata_search import get_metadata_provider
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import pipeline as ctx
from helper import hydrus
from helper.local_library import read_sidecar, write_sidecar, find_sidecar, LocalLibraryDB
from ._shared import normalize_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args
from config import get_local_storage_path


try:
	from metadata import extract_title
except ImportError:
	extract_title = None


# Tag item for ResultTable display and piping
from dataclasses import dataclass

@dataclass
class TagItem:
	"""Tag item for display in ResultTable and piping to other cmdlets.
	
	Allows tags to be selected and piped like:
	- delete-tag @{3,4,9}  (delete tags at indices 3, 4, 9)
	- add-tag @"namespace:value"  (add this tag)
	"""
	tag_name: str
	tag_index: int  # 1-based index for user reference
	hash_hex: Optional[str] = None
	source: str = "hydrus"
	service_name: Optional[str] = None
	file_path: Optional[str] = None
	
	def __post_init__(self):
		# Make ResultTable happy by adding standard fields
		# NOTE: Don't set 'title' - we want only the tag column in ResultTable
		self.origin = self.source
		self.detail = f"Tag #{self.tag_index}"
		self.target = self.tag_name
		self.media_kind = "tag"
	
	def to_dict(self) -> Dict[str, Any]:
		"""Convert to dict for JSON serialization."""
		return {
			"tag_name": self.tag_name,
			"tag_index": self.tag_index,
			"hash_hex": self.hash_hex,
			"source": self.source,
			"service_name": self.service_name,
		}


def _extract_my_tags_from_hydrus_meta(meta: Dict[str, Any], service_key: Optional[str], service_name: str) -> List[str]:
	"""Extract current tags from Hydrus metadata dict.
	
	Prefers display_tags (includes siblings/parents, excludes deleted).
	Falls back to storage_tags status '0' (current).
	"""
	tags_payload = meta.get("tags")
	if not isinstance(tags_payload, dict):
		return []
	svc_data = None
	if service_key:
		svc_data = tags_payload.get(service_key)
	if not isinstance(svc_data, dict):
		return []
	# Prefer display_tags (Hydrus computes siblings/parents)
	display = svc_data.get("display_tags")
	if isinstance(display, list) and display:
		return [str(t) for t in display if isinstance(t, (str, bytes)) and str(t).strip()]
	# Fallback to storage_tags status '0' (current)
	storage = svc_data.get("storage_tags")
	if isinstance(storage, dict):
		current_list = storage.get("0") or storage.get(0)
		if isinstance(current_list, list):
			return [str(t) for t in current_list if isinstance(t, (str, bytes)) and str(t).strip()]
	return []


def _emit_tags_as_table(
	tags_list: List[str],
	hash_hex: Optional[str],
	source: str = "hydrus",
	service_name: Optional[str] = None,
	config: Dict[str, Any] = None,
	item_title: Optional[str] = None,
	file_path: Optional[str] = None
) -> None:
	"""Emit tags as TagItem objects and display via ResultTable.
	
	This replaces _print_tag_list to make tags pipe-able.
	Stores the table in ctx._LAST_RESULT_TABLE for downstream @ selection.
	"""
	from result_table import ResultTable
	
	# Create ResultTable with just tag column (no title)
	table_title = "Tags"
	if item_title:
		table_title = f"Tags: {item_title}"
		if hash_hex:
			table_title += f" [{hash_hex[:8]}]"
			
	table = ResultTable(table_title, max_columns=1)
	table.set_source_command("get-tag", [])
	
	# Create TagItem for each tag
	tag_items = []
	for idx, tag_name in enumerate(tags_list, start=1):
		tag_item = TagItem(
			tag_name=tag_name,
			tag_index=idx,
			hash_hex=hash_hex,
			source=source,
			service_name=service_name,
			file_path=file_path,
		)
		tag_items.append(tag_item)
		table.add_result(tag_item)
		# Also emit to pipeline for downstream processing
		ctx.emit(tag_item)

	# Store the table and items in history so @.. works to go back
	# Use overlay mode so it doesn't push the previous search to history stack
	# This makes get-tag behave like a transient view
	try:
		ctx.set_last_result_table_overlay(table, tag_items)
	except AttributeError:
		ctx.set_last_result_table(table, tag_items)
	# Note: CLI will handle displaying the table via ResultTable formatting
def _summarize_tags(tags_list: List[str], limit: int = 8) -> str:
	"""Create a summary of tags for display."""
	shown = [t for t in tags_list[:limit] if t]
	summary = ", ".join(shown)
	remaining = max(0, len(tags_list) - len(shown))
	if remaining > 0:
		summary = f"{summary} (+{remaining} more)" if summary else f"(+{remaining} more)"
	if len(summary) > 200:
		summary = summary[:197] + "..."
	return summary


def _extract_title_from(tags_list: List[str]) -> Optional[str]:
	"""Extract title from tags list."""
	if extract_title:
		try:
			return extract_title(tags_list)
		except Exception:
			pass
	for t in tags_list:
		if isinstance(t, str) and t.lower().startswith("title:"):
			val = t.split(":", 1)[1].strip()
			if val:
				return val
	return None


def _rename_file_if_title_tag(media: Optional[Path], tags_added: List[str]) -> bool:
	"""Rename a local file if title: tag was added.
	
	Returns True if file was renamed, False otherwise.
	"""
	if not media or not tags_added:
		return False
	
	# Check if any of the added tags is a title: tag
	title_value = None
	for tag in tags_added:
		if isinstance(tag, str):
			lower_tag = tag.lower()
			if lower_tag.startswith("title:"):
				title_value = tag.split(":", 1)[1].strip()
				break
	
	if not title_value:
		return False
	
	try:
		# Get current file path
		file_path = media
		if not file_path.exists():
			return False
		
		# Parse file path
		dir_path = file_path.parent
		old_name = file_path.name
		
		# Get file extension
		suffix = file_path.suffix or ''
		
		# Sanitize title for use as filename
		import re
		safe_title = re.sub(r'[<>:"/\\|?*]', '', title_value).strip()
		if not safe_title:
			return False
		
		new_name = safe_title + suffix
		new_file_path = dir_path / new_name
		
		if new_file_path == file_path:
			return False
		
		# Build sidecar paths BEFORE renaming the file
		old_sidecar = Path(str(file_path) + '.tags')
		new_sidecar = Path(str(new_file_path) + '.tags')
		
		# Rename file
		try:
			file_path.rename(new_file_path)
			log(f"Renamed file: {old_name} → {new_name}")
			
			# Rename .tags sidecar if it exists
			if old_sidecar.exists():
				try:
					old_sidecar.rename(new_sidecar)
					log(f"Renamed sidecar: {old_name}.tags → {new_name}.tags")
				except Exception as e:
					log(f"Failed to rename sidecar: {e}", file=sys.stderr)
			
			return True
		except Exception as e:
			log(f"Failed to rename file: {e}", file=sys.stderr)
			return False
	except Exception as e:
		log(f"Error during file rename: {e}", file=sys.stderr)
		return False


def _apply_result_updates_from_tags(result: Any, tag_list: List[str]) -> None:
	"""Update result object with title and tag summary from tags."""
	try:
		new_title = _extract_title_from(tag_list)
		if new_title:
			setattr(result, "title", new_title)
		setattr(result, "tag_summary", _summarize_tags(tag_list))
	except Exception:
		pass


def _handle_title_rename(old_path: Path, tags_list: List[str]) -> Optional[Path]:
	"""If a title: tag is present, rename the file and its .tags sidecar to match.
	
	Returns the new path if renamed, otherwise returns None.
	"""
	# Extract title from tags
	new_title = None
	for tag in tags_list:
		if isinstance(tag, str) and tag.lower().startswith('title:'):
			new_title = tag.split(':', 1)[1].strip()
			break
	
	if not new_title or not old_path.exists():
		return None
	
	try:
		# Build new filename with same extension
		old_name = old_path.name
		old_suffix = old_path.suffix
		
		# Create new filename: title + extension
		new_name = f"{new_title}{old_suffix}"
		new_path = old_path.parent / new_name
		
		# Don't rename if already the same name
		if new_path == old_path:
			return None
		
		# Rename the main file
		if new_path.exists():
			log(f"Warning: Target filename already exists: {new_name}", file=sys.stderr)
			return None
		
		old_path.rename(new_path)
		log(f"Renamed file: {old_name} → {new_name}", file=sys.stderr)
		
		# Rename the .tags sidecar if it exists
		old_tags_path = old_path.parent / (old_name + '.tags')
		if old_tags_path.exists():
			new_tags_path = old_path.parent / (new_name + '.tags')
			if new_tags_path.exists():
				log(f"Warning: Target sidecar already exists: {new_tags_path.name}", file=sys.stderr)
			else:
				old_tags_path.rename(new_tags_path)
				log(f"Renamed sidecar: {old_tags_path.name} → {new_tags_path.name}", file=sys.stderr)
		
		return new_path
	except Exception as exc:
		log(f"Warning: Failed to rename file: {exc}", file=sys.stderr)
		return None


def _read_sidecar_fallback(p: Path) -> tuple[Optional[str], List[str], List[str]]:
	"""Fallback sidecar reader if metadata module unavailable.
	
	Format:
	- Lines with "hash:" prefix: file hash
	- Lines with "known_url:" or "url:" prefix: URLs
	- Lines with "relationship:" prefix: ignored (internal relationships)
	- Lines with "key:", "namespace:value" format: treated as namespace tags
	- Plain lines without colons: freeform tags
	
	Excluded namespaces (treated as metadata, not tags): hash, known_url, url, relationship
	"""
	try:
		raw = p.read_text(encoding="utf-8", errors="ignore")
	except OSError:
		return None, [], []
	t: List[str] = []
	u: List[str] = []
	h: Optional[str] = None
	
	# Namespaces to exclude from tags
	excluded_namespaces = {"hash", "known_url", "url", "relationship"}
	
	for line in raw.splitlines():
		s = line.strip()
		if not s:
			continue
		low = s.lower()
		
		# Check if this is a hash line
		if low.startswith("hash:"):
			h = s.split(":", 1)[1].strip() if ":" in s else h
		# Check if this is a URL line
		elif low.startswith("known_url:") or low.startswith("url:"):
			val = s.split(":", 1)[1].strip() if ":" in s else ""
			if val:
				u.append(val)
		# Check if this is an excluded namespace
		elif ":" in s:
			namespace = s.split(":", 1)[0].strip().lower()
			if namespace not in excluded_namespaces:
				# Include as namespace tag (e.g., "title: The Freemasons")
				t.append(s)
		else:
			# Plain text without colon = freeform tag
			t.append(s)
	
	return h, t, u


def _write_sidecar(p: Path, media: Path, tag_list: List[str], known_urls: List[str], hash_in_sidecar: Optional[str]) -> Path:
	"""Write tags to sidecar file and handle title-based renaming.
	
	Returns the new media path if renamed, otherwise returns the original media path.
	"""
	success = write_sidecar(media, tag_list, known_urls, hash_in_sidecar)
	if success:
		_apply_result_updates_from_tags(None, tag_list)
		# Check if we should rename the file based on title tag
		new_media = _handle_title_rename(media, tag_list)
		if new_media:
			return new_media
		return media
	
	# Fallback writer
	ordered = [s for s in tag_list if s and s.strip()]
	lines = []
	if hash_in_sidecar:
		lines.append(f"hash:{hash_in_sidecar}")
	lines.extend(ordered)
	for u in known_urls:
		lines.append(f"known_url:{u}")
	try:
		p.write_text("\n".join(lines) + "\n", encoding="utf-8")
		# Check if we should rename the file based on title tag
		new_media = _handle_title_rename(media, tag_list)
		if new_media:
			return new_media
		return media
	except OSError as exc:
		log(f"Failed to write sidecar: {exc}", file=sys.stderr)
		return media


def _emit_tag_payload(source: str, tags_list: List[str], *, hash_value: Optional[str], extra: Optional[Dict[str, Any]] = None, store_label: Optional[str] = None) -> int:
	"""Emit tags as structured payload to pipeline.
	
	Also emits individual tag objects to _PIPELINE_LAST_ITEMS so they can be selected by index.
	"""
	payload: Dict[str, Any] = {
		"source": source,
		"tags": list(tags_list),
		"count": len(tags_list),
	}
	if hash_value:
		payload["hash"] = hash_value
	if extra:
		for key, value in extra.items():
			if value is not None:
				payload[key] = value
	label = None
	if store_label:
		label = store_label
	elif ctx._PIPE_ACTIVE:
		label = "tags"
	if label:
		ctx.store_value(label, payload)
		if ctx._PIPE_ACTIVE and label.lower() != "tags":
			ctx.store_value("tags", payload)
	
	# Emit individual TagItem objects so they can be selected by bare index
	# When in pipeline, emit individual TagItem objects
	if ctx._PIPE_ACTIVE:
		for idx, tag_name in enumerate(tags_list, start=1):
			tag_item = TagItem(
				tag_name=tag_name,
				tag_index=idx,
				hash_hex=hash_value,
				source=source,
				service_name=None
			)
			ctx.emit(tag_item)
	else:
		# When not in pipeline, just emit the payload
		ctx.emit(payload)
	
	return 0


def _extract_scrapable_identifiers(tags_list: List[str]) -> Dict[str, str]:
	"""Extract scrapable identifiers from tags."""
	identifiers = {}
	scrapable_prefixes = {'openlibrary', 'isbn_10', 'isbn', 'musicbrainz', 'musicbrainzalbum', 'imdb', 'tmdb', 'tvdb'}
	
	for tag in tags_list:
		if not isinstance(tag, str) or ':' not in tag:
			continue
		
		parts = tag.split(':', 1)
		if len(parts) != 2:
			continue
		
		key = parts[0].strip().lower()
		value = parts[1].strip()
		
		if key in scrapable_prefixes and value:
			identifiers[key] = value
	
	return identifiers


def _scrape_url_metadata(url: str) -> Tuple[Optional[str], List[str], List[Tuple[str, str]], List[Dict[str, Any]]]:
	"""Scrape metadata from a URL using yt-dlp.
	
	Returns:
		(title, tags, formats, playlist_items) tuple where:
		- title: Video/content title
		- tags: List of extracted tags (both namespaced and freeform)
		- formats: List of (display_label, format_id) tuples
		- playlist_items: List of playlist entry dicts (empty if not a playlist)
	"""
	try:
		import json as json_module
		
		try:
			from metadata import extract_ytdlp_tags
		except ImportError:
			extract_ytdlp_tags = None
		
		# Build yt-dlp command with playlist support
		# IMPORTANT: Do NOT use --flat-playlist! It strips metadata like artist, album, uploader, genre
		# Without it, yt-dlp gives us full metadata in an 'entries' array within a single JSON object
		# This ensures we get album-level metadata from sources like BandCamp, YouTube Music, etc.
		cmd = [
			"yt-dlp",
			"-j",  # Output JSON
			"--no-warnings",
			"--playlist-items", "1-10",  # Get first 10 items if it's a playlist (provides entries)
			"-f", "best",
			url
		]
		
		result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
		
		if result.returncode != 0:
			log(f"yt-dlp error: {result.stderr}", file=sys.stderr)
			return None, [], [], []
		
		# Parse JSON output - WITHOUT --flat-playlist, we get ONE JSON object with 'entries' array
		# This gives us full metadata instead of flat format
		lines = result.stdout.strip().split('\n')
		if not lines or not lines[0]:
			log("yt-dlp returned empty output", file=sys.stderr)
			return None, [], [], []
		
		# Parse the single JSON object
		try:
			data = json_module.loads(lines[0])
		except json_module.JSONDecodeError as e:
			log(f"Failed to parse yt-dlp JSON: {e}", file=sys.stderr)
			return None, [], [], []
		
		# Extract title - use the main title
		title = data.get('title', 'Unknown')
		
		# Determine if this is a playlist/album (has entries array)
		# is_playlist = 'entries' in data and isinstance(data.get('entries'), list)
		
		# Extract tags and playlist items
		tags = []
		playlist_items = []
		
		# IMPORTANT: Extract album/playlist-level tags FIRST (before processing entries)
		# This ensures we get metadata about the collection, not just individual tracks
		if extract_ytdlp_tags:
			album_tags = extract_ytdlp_tags(data)
			tags.extend(album_tags)
		
		# Case 1: Entries are nested in the main object (standard playlist structure)
		if 'entries' in data and isinstance(data.get('entries'), list):
			entries = data['entries']
			# Build playlist items with title and duration
			for idx, entry in enumerate(entries, 1):
				if isinstance(entry, dict):
					item_title = entry.get('title', entry.get('id', f'Track {idx}'))
					item_duration = entry.get('duration', 0)
					playlist_items.append({
						'index': idx,
						'id': entry.get('id', f'track_{idx}'),
						'title': item_title,
						'duration': item_duration,
						'url': entry.get('url') or entry.get('webpage_url', ''),
					})
					
					# Extract tags from each entry and merge (but don't duplicate album-level tags)
					# Only merge entry tags that are multi-value prefixes (not single-value like title:, artist:, etc.)
					if extract_ytdlp_tags:
						entry_tags = extract_ytdlp_tags(entry)
						
						# Single-value namespaces that should not be duplicated from entries
						single_value_namespaces = {'title', 'artist', 'album', 'creator', 'channel', 'release_date', 'upload_date', 'license', 'location'}
						
						for tag in entry_tags:
							# Extract the namespace (part before the colon)
							tag_namespace = tag.split(':', 1)[0].lower() if ':' in tag else None
							
							# Skip if this namespace already exists in tags (from album level)
							if tag_namespace and tag_namespace in single_value_namespaces:
								# Check if any tag with this namespace already exists in tags
								already_has_namespace = any(
									t.split(':', 1)[0].lower() == tag_namespace 
									for t in tags if ':' in t
								)
								if already_has_namespace:
									continue  # Skip this tag, keep the album-level one
							
							if tag not in tags:  # Avoid exact duplicates
								tags.append(tag)
		
		# Case 2: Playlist detected by playlist_count field (BandCamp albums, etc.)
		# These need a separate call with --flat-playlist to get the actual entries
		elif (data.get('playlist_count') or 0) > 0 and 'entries' not in data:
			try:
				# Make a second call with --flat-playlist to get the actual tracks
				flat_cmd = [
					"yt-dlp",
					"-j",
					"--no-warnings",
					"--flat-playlist",
					"-f", "best",
					url
				]
				flat_result = subprocess.run(flat_cmd, capture_output=True, text=True, timeout=30)
				if flat_result.returncode == 0:
					flat_lines = flat_result.stdout.strip().split('\n')
					# With --flat-playlist, each line is a separate track JSON object
					# (not nested in a playlist container), so process ALL lines
					for idx, line in enumerate(flat_lines, 1):
						if line.strip().startswith('{'):
							try:
								entry = json_module.loads(line)
								item_title = entry.get('title', entry.get('id', f'Track {idx}'))
								item_duration = entry.get('duration', 0)
								playlist_items.append({
									'index': idx,
									'id': entry.get('id', f'track_{idx}'),
									'title': item_title,
									'duration': item_duration,
									'url': entry.get('url') or entry.get('webpage_url', ''),
								})
							except json_module.JSONDecodeError:
								pass
			except Exception as e:
				pass  # Silently ignore if we can't get playlist entries
		

		# Fallback: if still no tags detected, get from first item
		if not tags and extract_ytdlp_tags:
			tags = extract_ytdlp_tags(data)
		
		# Extract formats from the main data object
		formats = []
		if 'formats' in data:
			formats = _extract_url_formats(data.get('formats', []))
		
		# Deduplicate tags by namespace to prevent duplicate title:, artist:, etc.
		try:
			from metadata import dedup_tags_by_namespace as _dedup
			if _dedup:
				tags = _dedup(tags, keep_first=True)
		except Exception:
			pass  # If dedup fails, return tags as-is
		
		return title, tags, formats, playlist_items

	except subprocess.TimeoutExpired:
		log("yt-dlp timeout (>30s)", file=sys.stderr)
		return None, [], [], []
	except Exception as e:
		log(f"URL scraping error: {e}", file=sys.stderr)
		return None, [], [], []


def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
	"""Extract best formats from yt-dlp formats list.
	
	Returns list of (display_label, format_id) tuples.
	"""
	try:
		video_formats = {}  # {resolution: format_data}
		audio_formats = {}  # {quality_label: format_data}
		
		for fmt in formats:
			vcodec = fmt.get('vcodec', 'none')
			acodec = fmt.get('acodec', 'none')
			height = fmt.get('height')
			ext = fmt.get('ext', 'unknown')
			format_id = fmt.get('format_id', '')
			tbr = fmt.get('tbr', 0)
			abr = fmt.get('abr', 0)
			
			# Video format
			if vcodec and vcodec != 'none' and height:
				if height < 480:
					continue
				res_key = f"{height}p"
				if res_key not in video_formats or tbr > video_formats[res_key].get('tbr', 0):
					video_formats[res_key] = {
						'label': f"{height}p ({ext})",
						'format_id': format_id,
						'tbr': tbr,
					}
			
			# Audio-only format
			elif acodec and acodec != 'none' and (not vcodec or vcodec == 'none'):
				audio_key = f"audio_{abr}"
				if audio_key not in audio_formats or abr > audio_formats[audio_key].get('abr', 0):
					audio_formats[audio_key] = {
						'label': f"audio ({ext})",
						'format_id': format_id,
						'abr': abr,
					}
		
		result = []
		
		# Add video formats in descending resolution order
		for res in sorted(video_formats.keys(), key=lambda x: int(x.replace('p', '')), reverse=True):
			fmt = video_formats[res]
			result.append((fmt['label'], fmt['format_id']))
		
		# Add best audio format
		if audio_formats:
			best_audio = max(audio_formats.values(), key=lambda x: x.get('abr', 0))
			result.append((best_audio['label'], best_audio['format_id']))
		
		return result
	
	except Exception as e:
		log(f"Error extracting formats: {e}", file=sys.stderr)
		return []


def _scrape_isbn_metadata(isbn: str) -> List[str]:
	"""Scrape metadata for an ISBN using Open Library API."""
	new_tags = []
	try:
		from ..helper.http_client import HTTPClient
		import json as json_module
		
		isbn_clean = isbn.replace('-', '').strip()
		url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
		
		try:
			with HTTPClient() as client:
				response = client.get(url)
				response.raise_for_status()
				data = json_module.loads(response.content.decode('utf-8'))
		except Exception as e:
			log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
			return []
		
		if not data:
			log(f"No ISBN metadata found for: {isbn}")
			return []
		
		book_data = next(iter(data.values()), None)
		if not book_data:
			return []
		
		if 'title' in book_data:
			new_tags.append(f"title:{book_data['title']}")
		
		if 'authors' in book_data and isinstance(book_data['authors'], list):
			for author in book_data['authors'][:3]:
				if 'name' in author:
					new_tags.append(f"author:{author['name']}")
		
		if 'publish_date' in book_data:
			new_tags.append(f"publish_date:{book_data['publish_date']}")
		
		if 'publishers' in book_data and isinstance(book_data['publishers'], list):
			for pub in book_data['publishers'][:1]:
				if 'name' in pub:
					new_tags.append(f"publisher:{pub['name']}")
		
		if 'description' in book_data:
			desc = book_data['description']
			if isinstance(desc, dict) and 'value' in desc:
				desc = desc['value']
			if desc:
				desc_str = str(desc).strip()
				# Include description if available (limit to 200 chars to keep it manageable)
				if len(desc_str) > 0:
					new_tags.append(f"description:{desc_str[:200]}")
		
		if 'number_of_pages' in book_data:
			page_count = book_data['number_of_pages']
			if page_count and isinstance(page_count, int) and page_count > 0:
				new_tags.append(f"pages:{page_count}")
		
		if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
			identifiers = book_data['identifiers']
			
			if 'openlibrary' in identifiers:
				ol_ids = identifiers['openlibrary']
				if isinstance(ol_ids, list) and ol_ids:
					new_tags.append(f"openlibrary:{ol_ids[0]}")
				elif isinstance(ol_ids, str):
					new_tags.append(f"openlibrary:{ol_ids}")
			
			if 'lccn' in identifiers:
				lccn_list = identifiers['lccn']
				if isinstance(lccn_list, list) and lccn_list:
					new_tags.append(f"lccn:{lccn_list[0]}")
				elif isinstance(lccn_list, str):
					new_tags.append(f"lccn:{lccn_list}")
			
			if 'oclc' in identifiers:
				oclc_list = identifiers['oclc']
				if isinstance(oclc_list, list) and oclc_list:
					new_tags.append(f"oclc:{oclc_list[0]}")
				elif isinstance(oclc_list, str):
					new_tags.append(f"oclc:{oclc_list}")
			
			if 'goodreads' in identifiers:
				goodreads_list = identifiers['goodreads']
				if isinstance(goodreads_list, list) and goodreads_list:
					new_tags.append(f"goodreads:{goodreads_list[0]}")
				elif isinstance(goodreads_list, str):
					new_tags.append(f"goodreads:{goodreads_list}")
			
			if 'librarything' in identifiers:
				lt_list = identifiers['librarything']
				if isinstance(lt_list, list) and lt_list:
					new_tags.append(f"librarything:{lt_list[0]}")
				elif isinstance(lt_list, str):
					new_tags.append(f"librarything:{lt_list}")
			
			if 'doi' in identifiers:
				doi_list = identifiers['doi']
				if isinstance(doi_list, list) and doi_list:
					new_tags.append(f"doi:{doi_list[0]}")
				elif isinstance(doi_list, str):
					new_tags.append(f"doi:{doi_list}")
			
			if 'internet_archive' in identifiers:
				ia_list = identifiers['internet_archive']
				if isinstance(ia_list, list) and ia_list:
					new_tags.append(f"internet_archive:{ia_list[0]}")
				elif isinstance(ia_list, str):
					new_tags.append(f"internet_archive:{ia_list}")
		
		log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
		return new_tags
	except Exception as e:
		log(f"ISBN scraping error: {e}", file=sys.stderr)
		return []


def _scrape_openlibrary_metadata(olid: str) -> List[str]:
	"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
	
	Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
	- Title, authors, publish date, publishers
	- Description
	- Subjects as freeform tags (without namespace prefix)
	- Identifiers (ISBN, LCCN, OCLC, etc.)
	"""
	new_tags = []
	try:
		from ..helper.http_client import HTTPClient
		import json as json_module
		
		# Format: OL9674499M or just 9674499M
		olid_clean = olid.replace('OL', '').replace('M', '')
		if not olid_clean.isdigit():
			olid_clean = olid
		
		# Ensure we have the full OLID format for the URL
		if not olid.startswith('OL'):
			url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
		else:
			url = f"https://openlibrary.org/books/{olid}.json"
		
		try:
			with HTTPClient() as client:
				response = client.get(url)
				response.raise_for_status()
				data = json_module.loads(response.content.decode('utf-8'))
		except Exception as e:
			log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
			return []
		
		if not data:
			log(f"No OpenLibrary metadata found for: {olid}")
			return []
		
		# Add title
		if 'title' in data:
			new_tags.append(f"title:{data['title']}")
		
		# Add authors
		if 'authors' in data and isinstance(data['authors'], list):
			for author in data['authors'][:3]:
				if isinstance(author, dict) and 'name' in author:
					new_tags.append(f"author:{author['name']}")
				elif isinstance(author, str):
					new_tags.append(f"author:{author}")
		
		# Add publish date
		if 'publish_date' in data:
			new_tags.append(f"publish_date:{data['publish_date']}")
		
		# Add publishers
		if 'publishers' in data and isinstance(data['publishers'], list):
			for pub in data['publishers'][:1]:
				if isinstance(pub, dict) and 'name' in pub:
					new_tags.append(f"publisher:{pub['name']}")
				elif isinstance(pub, str):
					new_tags.append(f"publisher:{pub}")
		
		# Add description
		if 'description' in data:
			desc = data['description']
			if isinstance(desc, dict) and 'value' in desc:
				desc = desc['value']
			if desc:
				desc_str = str(desc).strip()
				if len(desc_str) > 0:
					new_tags.append(f"description:{desc_str[:200]}")
		
		# Add number of pages
		if 'number_of_pages' in data:
			page_count = data['number_of_pages']
			if page_count and isinstance(page_count, int) and page_count > 0:
				new_tags.append(f"pages:{page_count}")
		
		# Add subjects as FREEFORM tags (no namespace prefix)
		if 'subjects' in data and isinstance(data['subjects'], list):
			for subject in data['subjects'][:10]:
				if subject and isinstance(subject, str):
					subject_clean = str(subject).strip()
					if subject_clean and subject_clean not in new_tags:
						new_tags.append(subject_clean)
		
		# Add identifiers
		if 'identifiers' in data and isinstance(data['identifiers'], dict):
			identifiers = data['identifiers']
			
			if 'isbn_10' in identifiers:
				isbn_10_list = identifiers['isbn_10']
				if isinstance(isbn_10_list, list) and isbn_10_list:
					new_tags.append(f"isbn_10:{isbn_10_list[0]}")
				elif isinstance(isbn_10_list, str):
					new_tags.append(f"isbn_10:{isbn_10_list}")
			
			if 'isbn_13' in identifiers:
				isbn_13_list = identifiers['isbn_13']
				if isinstance(isbn_13_list, list) and isbn_13_list:
					new_tags.append(f"isbn_13:{isbn_13_list[0]}")
				elif isinstance(isbn_13_list, str):
					new_tags.append(f"isbn_13:{isbn_13_list}")
			
			if 'lccn' in identifiers:
				lccn_list = identifiers['lccn']
				if isinstance(lccn_list, list) and lccn_list:
					new_tags.append(f"lccn:{lccn_list[0]}")
				elif isinstance(lccn_list, str):
					new_tags.append(f"lccn:{lccn_list}")
			
			if 'oclc_numbers' in identifiers:
				oclc_list = identifiers['oclc_numbers']
				if isinstance(oclc_list, list) and oclc_list:
					new_tags.append(f"oclc:{oclc_list[0]}")
				elif isinstance(oclc_list, str):
					new_tags.append(f"oclc:{oclc_list}")
			
			if 'goodreads' in identifiers:
				goodreads_list = identifiers['goodreads']
				if isinstance(goodreads_list, list) and goodreads_list:
					new_tags.append(f"goodreads:{goodreads_list[0]}")
				elif isinstance(goodreads_list, str):
					new_tags.append(f"goodreads:{goodreads_list}")
		
		log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
		return new_tags
	except Exception as e:
		log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
		return []


def _perform_scraping(tags_list: List[str]) -> List[str]:
	"""Perform scraping based on identifiers in tags.
	
	Priority order:
	1. openlibrary: (preferred - more complete metadata)
	2. isbn_10 or isbn (fallback)
	"""
	identifiers = _extract_scrapable_identifiers(tags_list)
	
	if not identifiers:
		log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
		return []
	
	log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")
	
	new_tags = []
	
	# Prefer OpenLibrary over ISBN (more complete metadata)
	if 'openlibrary' in identifiers:
		olid = identifiers['openlibrary']
		if olid:
			log(f"Scraping OpenLibrary: {olid}")
			new_tags.extend(_scrape_openlibrary_metadata(olid))
	elif 'isbn_10' in identifiers or 'isbn' in identifiers:
		isbn = identifiers.get('isbn_10') or identifiers.get('isbn')
		if isbn:
			log(f"Scraping ISBN: {isbn}")
			new_tags.extend(_scrape_isbn_metadata(isbn))
	
	existing_tags_lower = {tag.lower() for tag in tags_list}
	scraped_unique = []
	seen = set()
	for tag in new_tags:
		tag_lower = tag.lower()
		if tag_lower not in existing_tags_lower and tag_lower not in seen:
			scraped_unique.append(tag)
			seen.add(tag_lower)
	
	if scraped_unique:
		log(f"Added {len(scraped_unique)} new tag(s) from scraping")
	
	return scraped_unique


def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
	"""Get tags from Hydrus, local sidecar, or URL metadata.
	
	Usage: 
		get-tag [-hash <sha256>] [--store <key>] [--emit]
		get-tag -scrape <url>
	
	Options:
		-hash <sha256>: Override hash to use instead of result's hash_hex
		--store <key>: Store result to this key for pipeline
		--emit: Emit result without interactive prompt (quiet mode)
		-scrape <url>: Scrape metadata from URL (returns tags as JSON)
	"""
	# Helper to get field from both dict and object
	def get_field(obj: Any, field: str, default: Any = None) -> Any:
		if isinstance(obj, dict):
			return obj.get(field, default)
		else:
			return getattr(obj, field, default)
	
	# Parse arguments using shared parser
	parsed_args = parse_cmdlet_args(args, CMDLET)
	
	# Extract values
	hash_override = normalize_hash(parsed_args.get("hash"))
	store_key = parsed_args.get("store")
	emit_requested = parsed_args.get("emit", False)
	scrape_url = parsed_args.get("scrape")
	scrape_requested = scrape_url is not None
	
	# Handle URL or provider scraping mode
	if scrape_requested and scrape_url:
		import json as json_module

		if scrape_url.startswith("http://") or scrape_url.startswith("https://"):
			# URL scraping (existing behavior)
			title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
			if not tags:
				log("No tags extracted from URL", file=sys.stderr)
				return 1
			output = {
				"title": title,
				"tags": tags,
				"formats": [(label, fmt_id) for label, fmt_id in formats],
				"playlist_items": playlist_items,
			}
			print(json_module.dumps(output, ensure_ascii=False))
			return 0
		
		# Provider scraping (e.g., itunes)
		provider = get_metadata_provider(scrape_url, config)
		if provider is None:
			log(f"Unknown metadata provider: {scrape_url}", file=sys.stderr)
			return 1
		
		# Determine query from title on the result or filename
		title_hint = get_field(result, "title", None) or get_field(result, "name", None)
		if not title_hint:
			file_path = get_field(result, "path", None) or get_field(result, "filename", None)
			if file_path:
				title_hint = Path(str(file_path)).stem
		
		if not title_hint:
			log("No title available to search for metadata", file=sys.stderr)
			return 1
		
		items = provider.search(title_hint, limit=10)
		if not items:
			log("No metadata results found", file=sys.stderr)
			return 1
		
		from result_table import ResultTable
		table = ResultTable(f"Metadata: {provider.name}")
		table.set_source_command("get-tag", [])
		selection_payload = []
		hash_for_payload = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash_hex", None))
		for idx, item in enumerate(items):
			tags = provider.to_tags(item)
			row = table.add_row()
			row.add_column("Title", item.get("title", ""))
			row.add_column("Artist", item.get("artist", ""))
			row.add_column("Album", item.get("album", ""))
			row.add_column("Year", item.get("year", ""))
			payload = {
				"tags": tags,
				"provider": provider.name,
				"title": item.get("title"),
				"artist": item.get("artist"),
				"album": item.get("album"),
				"year": item.get("year"),
				"extra": {
					"tags": tags,
					"provider": provider.name,
					"hydrus_hash": hash_for_payload,
					"storage_source": get_field(result, "source", None) or get_field(result, "origin", None),
				},
				"file_hash": hash_for_payload,
			}
			selection_payload.append(payload)
			table.set_row_selection_args(idx, [str(idx + 1)])

		ctx.set_last_result_table_overlay(table, selection_payload)
		ctx.set_current_stage_table(table)
		# Preserve items for @ selection and downstream pipes without emitting duplicates
		ctx.set_last_result_items_only(selection_payload)
		print(table)
		return 0
	
	# If -scrape was requested but no URL, that's an error
	if scrape_requested and not scrape_url:
		log("-scrape requires a URL argument", file=sys.stderr)
		return 1
	
	# Handle @N selection which creates a list - extract the first item
	if isinstance(result, list) and len(result) > 0:
		result = result[0]
	
	hash_from_result = normalize_hash(get_field(result, "hash_hex", None))
	hash_hex = hash_override or hash_from_result
	# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
	# This allows interactive REPL to work even in pipelines
	emit_mode = emit_requested or bool(store_key)
	store_label = (store_key.strip() if store_key and store_key.strip() else None)
	
	# Check Hydrus availability
	hydrus_available, _ = hydrus.is_available(config)
	
	# Try to find path in result object
	local_path = get_field(result, "target", None) or get_field(result, "path", None) or get_field(result, "file_path", None)
	
	# Determine if local file
	is_local_file = False
	media: Optional[Path] = None
	if local_path and isinstance(local_path, str) and not local_path.startswith(("http://", "https://")):
		is_local_file = True
		try:
			media = Path(str(local_path))
		except Exception:
			media = None
	
	# Try Hydrus first (always prioritize if available and has hash)
	use_hydrus = False
	hydrus_meta = None  # Cache the metadata from first fetch
	client = None
	if hash_hex and hydrus_available:
		try:
			client = hydrus.get_client(config)
			payload = client.fetch_file_metadata(hashes=[str(hash_hex)], include_service_keys_to_tags=True, include_file_urls=False)
			items = payload.get("metadata") if isinstance(payload, dict) else None
			if isinstance(items, list) and items:
				meta = items[0] if isinstance(items[0], dict) else None
				# Only accept file if it has a valid file_id (not None)
				if isinstance(meta, dict) and meta.get("file_id") is not None:
					use_hydrus = True
					hydrus_meta = meta  # Cache for tag extraction
		except Exception:
			pass
	
	# Get tags - try Hydrus first, fallback to sidecar
	current = []
	service_name = ""
	service_key = None
	source = "unknown"
	
	if use_hydrus and hash_hex and hydrus_meta:
		try:
			# Use cached metadata from above, don't fetch again
			service_name = hydrus.get_tag_service_name(config)
			if client is None:
				client = hydrus.get_client(config)
			service_key = hydrus.get_tag_service_key(client, service_name)
			current = _extract_my_tags_from_hydrus_meta(hydrus_meta, service_key, service_name)
			source = "hydrus"
		except Exception as exc:
			log(f"Warning: Failed to extract tags from Hydrus: {exc}", file=sys.stderr)
	
	# Fallback to local sidecar or local DB if no tags
	if not current and is_local_file and media and media.exists():
		try:
			# First try local library DB
			library_root = get_local_storage_path(config)
			if library_root:
				try:
					with LocalLibraryDB(library_root) as db:
						db_tags = db.get_tags(media)
						if db_tags:
							current = db_tags
							source = "local_db"
				except Exception as exc:
					log(f"[get_tag] DB lookup failed, trying sidecar: {exc}", file=sys.stderr)
			
			# Fall back to sidecar if DB didn't have tags
			if not current:
				sidecar_path = find_sidecar(media)
				if sidecar_path and sidecar_path.exists():
					try:
						_, current, _ = read_sidecar(sidecar_path)
					except Exception:
						_, current, _ = _read_sidecar_fallback(sidecar_path)
					if current:
						source = "sidecar"
		except Exception as exc:
			log(f"Warning: Failed to load tags from local storage: {exc}", file=sys.stderr)
	
	# Fallback to tags in the result object if Hydrus/local lookup returned nothing
	if not current:
		# Check if result has 'tags' attribute (PipeObject)
		if hasattr(result, 'tags') and getattr(result, 'tags', None):
			current = getattr(result, 'tags')
			source = "pipeline_result"
		# Check if result is a dict with 'tags' key
		elif isinstance(result, dict) and 'tags' in result:
			tags_val = result['tags']
			if isinstance(tags_val, list):
				current = tags_val
				source = "pipeline_result"
				source = "pipeline_result"
	
	# Error if no tags found
	if not current:
		log("No tags found", file=sys.stderr)
		return 1
	
	# Always output to ResultTable (pipeline mode only)
	# Extract title for table header
	item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)
	
	if source == "hydrus":
		_emit_tags_as_table(current, hash_hex=hash_hex, source="hydrus", service_name=service_name, config=config, item_title=item_title)
	else:
		_emit_tags_as_table(current, hash_hex=hash_hex, source="local", service_name=None, config=config, item_title=item_title, file_path=str(local_path) if local_path else None)
	
	# If emit requested or store key provided, emit payload
	if emit_mode:
		_emit_tag_payload(source, current, hash_value=hash_hex, store_label=store_label)

	return 0


CMDLET = Cmdlet(
    name="get-tag",
    summary="Get tags from Hydrus or local sidecar metadata",
	usage="get-tag [-hash <sha256>] [--store <key>] [--emit] [-scrape <url|provider>]",
    aliases=["tags"],
    args=[
        SharedArgs.HASH,
        CmdletArg(
            name="-store",
            type="string",
            description="Store result to this key for pipeline",
            alias="store"
        ),
        CmdletArg(
            name="-emit",
            type="flag",
            description="Emit result without interactive prompt (quiet mode)",
            alias="emit-only"
        ),
        CmdletArg(
            name="-scrape",
            type="string",
			description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
            required=False
        )
    ]
)