# Medios-Macina/cmdlets/get_tag.py

"""Get tags from Hydrus or local sidecar metadata.

This cmdlet retrieves tags for a selected result, supporting both:
- Hydrus Network (for files with hash_hex)
- Local sidecar files (.tags)

In interactive mode: navigate with numbers, add/delete tags
In pipeline mode: display tags as read-only table, emit as structured JSON
"""

from __future__ import annotations

import sys

from helper.logger import log, debug
from helper.metadata_search import get_metadata_provider, list_metadata_providers
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple

import pipeline as ctx
from helper import hydrus
from helper.folder_store import read_sidecar, write_sidecar, find_sidecar, FolderDB
from ._shared import normalize_hash, looks_like_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field
from config import get_local_storage_path


try:
	from metadata import extract_title
except ImportError:
	extract_title = None


# Tag item for ResultTable display and piping
from dataclasses import dataclass

@dataclass
class TagItem:
	"""Tag item for display in ResultTable and piping to other cmdlets.

	Allows tags to be selected and piped like:
	- delete-tag @{3,4,9}  (delete tags at indices 3, 4, 9)
	- add-tag @"namespace:value"  (add this tag)
	"""
	tag_name: str
	tag_index: int  # 1-based index for user reference
	hash_hex: Optional[str] = None
	source: str = "hydrus"
	service_name: Optional[str] = None
	file_path: Optional[str] = None

	def __post_init__(self):
		# Extra attributes (not dataclass fields) that ResultTable reads.
		# NOTE: Don't set 'title' - we want only the tag column in ResultTable
		self.origin = self.source
		self.detail = f"Tag #{self.tag_index}"
		self.target = self.tag_name
		self.media_kind = "tag"

	def to_dict(self) -> Dict[str, Any]:
		"""Return every dataclass field as a plain dict for JSON serialization.

		``file_path`` is included so a tag that came from a local file can
		still be traced back to that file after serialization.
		"""
		return {
			"tag_name": self.tag_name,
			"tag_index": self.tag_index,
			"hash_hex": self.hash_hex,
			"source": self.source,
			"service_name": self.service_name,
			"file_path": self.file_path,
		}


def _emit_tags_as_table(
	tags_list: List[str],
	hash_hex: Optional[str],
	source: str = "hydrus",
	service_name: Optional[str] = None,
	config: Dict[str, Any] = None,
	item_title: Optional[str] = None,
	file_path: Optional[str] = None,
	subject: Optional[Any] = None,
) -> None:
	"""Show tags in a single-column ResultTable and emit each one to the pipeline.

	Every tag becomes a TagItem numbered from 1. Each item is added to the
	table and passed to ctx.emit, and the table plus items are then registered
	with ctx as the last result table together with ``subject``.
	``config`` is accepted but not used by this function.
	"""
	from result_table import ResultTable

	# "Tags", or "Tags: <title>" with a short hash suffix when a hash is known
	heading = "Tags"
	if item_title:
		hash_suffix = f" [{hash_hex[:8]}]" if hash_hex else ""
		heading = f"Tags: {item_title}" + hash_suffix

	table = ResultTable(heading, max_columns=1)
	table.set_source_command("get-tag", [])

	# Fields shared by every row of this table
	common = {
		"hash_hex": hash_hex,
		"source": source,
		"service_name": service_name,
		"file_path": file_path,
	}

	rows = []
	for position, name in enumerate(tags_list, start=1):
		row = TagItem(tag_name=name, tag_index=position, **common)
		rows.append(row)
		table.add_result(row)
		# Downstream stages receive each tag individually
		ctx.emit(row)

	# Prefer the overlay variant so get-tag acts as a transient view; fall back
	# to the plain setter when the overlay call raises AttributeError.
	try:
		ctx.set_last_result_table_overlay(table, rows, subject)
	except AttributeError:
		ctx.set_last_result_table(table, rows, subject)
	# Rendering the table is left to the CLI
def _summarize_tags(tags_list: List[str], limit: int = 8) -> str:
	"""Create a summary of tags for display."""
	shown = [t for t in tags_list[:limit] if t]
	summary = ", ".join(shown)
	remaining = max(0, len(tags_list) - len(shown))
	if remaining > 0:
		summary = f"{summary} (+{remaining} more)" if summary else f"(+{remaining} more)"
	if len(summary) > 200:
		summary = summary[:197] + "..."
	return summary


def _extract_title_from(tags_list: List[str]) -> Optional[str]:
	"""Return the title carried by a tag list, or None.

	When the optional ``extract_title`` helper is available its result is
	returned as-is. If it is unavailable or raises, the first ``title:`` tag
	with a non-empty value is used instead.
	"""
	if extract_title:
		try:
			return extract_title(tags_list)
		except Exception:
			# Fall through to the manual scan below
			pass

	for candidate in tags_list:
		if not isinstance(candidate, str):
			continue
		if not candidate.lower().startswith("title:"):
			continue
		title = candidate.split(":", 1)[1].strip()
		if title:
			return title
	return None


def _rename_file_if_title_tag(media: Optional[Path], tags_added: List[str]) -> bool:
	"""Rename a local file if title: tag was added.
	
	Returns True if file was renamed, False otherwise.
	"""
	if not media or not tags_added:
		return False
	
	# Check if any of the added tags is a title: tag
	title_value = None
	for tag in tags_added:
		if isinstance(tag, str):
			lower_tag = tag.lower()
			if lower_tag.startswith("title:"):
				title_value = tag.split(":", 1)[1].strip()
				break
	
	if not title_value:
		return False
	
	try:
		# Get current file path
		file_path = media
		if not file_path.exists():
			return False
		
		# Parse file path
		dir_path = file_path.parent
		old_name = file_path.name
		
		# Get file extension
		suffix = file_path.suffix or ''
		
		# Sanitize title for use as filename
		import re
		safe_title = re.sub(r'[<>:"/\\|?*]', '', title_value).strip()
		if not safe_title:
			return False
		
		new_name = safe_title + suffix
		new_file_path = dir_path / new_name
		
		if new_file_path == file_path:
			return False
		
		# Build sidecar paths BEFORE renaming the file
		old_sidecar = Path(str(file_path) + '.tags')
		new_sidecar = Path(str(new_file_path) + '.tags')
		
		# Rename file
		try:
			file_path.rename(new_file_path)
			log(f"Renamed file: {old_name} → {new_name}")
			
			# Rename .tags sidecar if it exists
			if old_sidecar.exists():
				try:
					old_sidecar.rename(new_sidecar)
					log(f"Renamed sidecar: {old_name}.tags → {new_name}.tags")
				except Exception as e:
					log(f"Failed to rename sidecar: {e}", file=sys.stderr)
			
			return True
		except Exception as e:
			log(f"Failed to rename file: {e}", file=sys.stderr)
			return False
	except Exception as e:
		log(f"Error during file rename: {e}", file=sys.stderr)
		return False


def _apply_result_updates_from_tags(result: Any, tag_list: List[str]) -> None:
	"""Best-effort refresh of ``result.title`` and ``result.tag_summary``.

	The title is only overwritten when the tags yield one; the summary is
	always recomputed. Any failure (for example a result object that rejects
	attribute assignment) is swallowed.
	"""
	try:
		title = _extract_title_from(tag_list)
		if title:
			result.title = title
		result.tag_summary = _summarize_tags(tag_list)
	except Exception:
		pass


def _handle_title_rename(old_path: Path, tags_list: List[str]) -> Optional[Path]:
	"""If a title: tag is present, rename the file and its .tags sidecar to match.
	
	Returns the new path if renamed, otherwise returns None.
	"""
	# Extract title from tags
	new_title = None
	for tag in tags_list:
		if isinstance(tag, str) and tag.lower().startswith('title:'):
			new_title = tag.split(':', 1)[1].strip()
			break
	
	if not new_title or not old_path.exists():
		return None
	
	try:
		# Build new filename with same extension
		old_name = old_path.name
		old_suffix = old_path.suffix
		
		# Create new filename: title + extension
		new_name = f"{new_title}{old_suffix}"
		new_path = old_path.parent / new_name
		
		# Don't rename if already the same name
		if new_path == old_path:
			return None
		
		# Rename the main file
		if new_path.exists():
			log(f"Warning: Target filename already exists: {new_name}", file=sys.stderr)
			return None
		
		old_path.rename(new_path)
		log(f"Renamed file: {old_name} → {new_name}", file=sys.stderr)
		
		# Rename the .tags sidecar if it exists
		old_tags_path = old_path.parent / (old_name + '.tags')
		if old_tags_path.exists():
			new_tags_path = old_path.parent / (new_name + '.tags')
			if new_tags_path.exists():
				log(f"Warning: Target sidecar already exists: {new_tags_path.name}", file=sys.stderr)
			else:
				old_tags_path.rename(new_tags_path)
				log(f"Renamed sidecar: {old_tags_path.name} → {new_tags_path.name}", file=sys.stderr)
		
		return new_path
	except Exception as exc:
		log(f"Warning: Failed to rename file: {exc}", file=sys.stderr)
		return None


def _read_sidecar_fallback(p: Path) -> tuple[Optional[str], List[str], List[str]]:
	"""Fallback sidecar reader if metadata module unavailable.
	
	Format:
	- Lines with "hash:" prefix: file hash
	- Lines with "url:" or "url:" prefix: url
	- Lines with "relationship:" prefix: ignored (internal relationships)
	- Lines with "key:", "namespace:value" format: treated as namespace tags
	- Plain lines without colons: freeform tags
	
	Excluded namespaces (treated as metadata, not tags): hash, url, url, relationship
	"""
	try:
		raw = p.read_text(encoding="utf-8", errors="ignore")
	except OSError:
		return None, [], []
	t: List[str] = []
	u: List[str] = []
	h: Optional[str] = None
	
	# Namespaces to exclude from tags
	excluded_namespaces = {"hash", "url", "url", "relationship"}
	
	for line in raw.splitlines():
		s = line.strip()
		if not s:
			continue
		low = s.lower()
		
		# Check if this is a hash line
		if low.startswith("hash:"):
			h = s.split(":", 1)[1].strip() if ":" in s else h
		# Check if this is a URL line
		elif low.startswith("url:") or low.startswith("url:"):
			val = s.split(":", 1)[1].strip() if ":" in s else ""
			if val:
				u.append(val)
		# Check if this is an excluded namespace
		elif ":" in s:
			namespace = s.split(":", 1)[0].strip().lower()
			if namespace not in excluded_namespaces:
				# Include as namespace tag (e.g., "title: The Freemasons")
				t.append(s)
		else:
			# Plain text without colon = freeform tag
			t.append(s)
	
	return h, t, u


def _write_sidecar(p: Path, media: Path, tag_list: List[str], url: List[str], hash_in_sidecar: Optional[str]) -> Path:
	"""Persist tags to the sidecar, then apply any title-based rename.

	``write_sidecar`` is tried first. If it reports failure, the sidecar at
	``p`` is written directly as: an optional ``hash:`` line, the non-blank
	tags, then one ``url:`` line per URL.

	Returns the renamed media path when a rename happened, otherwise ``media``
	(also when the fallback write fails).
	"""
	if write_sidecar(media, tag_list, url, hash_in_sidecar):
		_apply_result_updates_from_tags(None, tag_list)
		# A title: tag may require the media file to be renamed
		return _handle_title_rename(media, tag_list) or media

	# Fallback writer
	body = [f"hash:{hash_in_sidecar}"] if hash_in_sidecar else []
	body += [tag for tag in tag_list if tag and tag.strip()]
	body += [f"url:{link}" for link in url]
	try:
		p.write_text("\n".join(body) + "\n", encoding="utf-8")
		# A title: tag may require the media file to be renamed
		return _handle_title_rename(media, tag_list) or media
	except OSError as exc:
		log(f"Failed to write sidecar: {exc}", file=sys.stderr)
		return media


def _emit_tag_payload(source: str, tags_list: List[str], *, hash_value: Optional[str], extra: Optional[Dict[str, Any]] = None, store_label: Optional[str] = None) -> int:
	"""Store and emit a structured tag payload.

	The payload holds ``source``, ``tags`` and ``count``, plus ``hash`` when
	given and every non-None entry of ``extra``. It is stored under
	``store_label`` (or "tags" when a stage context exists), and additionally
	under "tags" when a custom label is used inside a stage.

	Inside a stage context each tag is emitted as its own TagItem so it can be
	selected by index; otherwise the payload dict itself is emitted.

	Returns 0.
	"""
	payload: Dict[str, Any] = {
		"source": source,
		"tags": list(tags_list),
		"count": len(tags_list),
	}
	if hash_value:
		payload["hash"] = hash_value
	if extra:
		payload.update({key: value for key, value in extra.items() if value is not None})

	label = store_label or ("tags" if ctx.get_stage_context() is not None else None)
	if label:
		ctx.store_value(label, payload)
		if ctx.get_stage_context() is not None and label.lower() != "tags":
			ctx.store_value("tags", payload)

	if ctx.get_stage_context() is None:
		# Not in a pipeline: the payload is the only thing emitted
		ctx.emit(payload)
		return 0

	# In a pipeline: one TagItem per tag, numbered from 1
	for number, name in enumerate(tags_list, start=1):
		ctx.emit(
			TagItem(
				tag_name=name,
				tag_index=number,
				hash_hex=hash_value,
				source=source,
				service_name=None,
			)
		)
	return 0


def _extract_scrapable_identifiers(tags_list: List[str]) -> Dict[str, str]:
	"""Extract scrapable identifiers from tags."""
	identifiers = {}
	scrapable_prefixes = {
		'openlibrary', 'isbn', 'isbn_10', 'isbn_13',
		'musicbrainz', 'musicbrainzalbum', 'imdb', 'tmdb', 'tvdb'
	}
	
	for tag in tags_list:
		if not isinstance(tag, str) or ':' not in tag:
			continue
		
		parts = tag.split(':', 1)
		if len(parts) != 2:
			continue
		
		key_raw = parts[0].strip().lower()
		key = key_raw.replace('-', '_')
		if key == 'isbn10':
			key = 'isbn_10'
		elif key == 'isbn13':
			key = 'isbn_13'
		value = parts[1].strip()
		
		# Normalize ISBN values by removing hyphens for API friendliness
		if key.startswith('isbn'):
			value = value.replace('-', '')
		
		if key in scrapable_prefixes and value:
			identifiers[key] = value
	
	return identifiers


def _extract_tag_value(tags_list: List[str], namespace: str) -> Optional[str]:
	"""Get first tag value for a namespace (e.g., artist:, title:)."""
	ns = namespace.lower()
	for tag in tags_list:
		if not isinstance(tag, str) or ':' not in tag:
			continue
		prefix, _, value = tag.partition(':')
		if prefix.strip().lower() != ns:
			continue
		candidate = value.strip()
		if candidate:
			return candidate
	return None


def _scrape_url_metadata(url: str) -> Tuple[Optional[str], List[str], List[Tuple[str, str]], List[Dict[str, Any]]]:
	"""Scrape metadata from a URL by running the ``yt-dlp`` command-line tool.

	Runs ``yt-dlp -j`` on ``url`` (30 second timeout), parses the first line of
	its stdout as JSON and derives tags, downloadable formats and playlist
	entries from it. Tag extraction relies on ``metadata.extract_ytdlp_tags``;
	when that import fails, no tags are produced.

	Args:
		url: The URL handed to yt-dlp.

	Returns:
		(title, tags, formats, playlist_items) tuple where:
		- title: Video/content title ('Unknown' when the JSON has no title)
		- tags: List of extracted tags (both namespaced and freeform)
		- formats: List of (display_label, format_id) tuples
		- playlist_items: List of playlist entry dicts (empty if not a playlist)

		``(None, [], [], [])`` is returned when yt-dlp exits non-zero, prints
		nothing, prints invalid JSON, times out, or any other exception occurs.
	"""
	try:
		import json as json_module
		
		# Optional project helper; tags are simply skipped when it is missing
		try:
			from metadata import extract_ytdlp_tags
		except ImportError:
			extract_ytdlp_tags = None
		
		# Build yt-dlp command with playlist support
		# IMPORTANT: Do NOT use --flat-playlist! It strips metadata like artist, album, uploader, genre
		# NOTE(review): the original author expected a single JSON object with an
		# 'entries' array from this invocation - confirm against the yt-dlp docs
		# for -j versus -J, since only the first stdout line is parsed below.
		cmd = [
			"yt-dlp",
			"-j",  # Output JSON
			"--no-warnings",
			"--playlist-items", "1-10",  # Get first 10 items if it's a playlist (provides entries)
			"-f", "best",
			url
		]
		
		result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
		
		if result.returncode != 0:
			log(f"yt-dlp error: {result.stderr}", file=sys.stderr)
			return None, [], [], []
		
		# Only the first stdout line is parsed; any further lines are ignored.
		# NOTE(review): this assumes yt-dlp prints one JSON object describing the
		# whole URL on the first line - verify for playlist URLs.
		lines = result.stdout.strip().split('\n')
		if not lines or not lines[0]:
			log("yt-dlp returned empty output", file=sys.stderr)
			return None, [], [], []
		
		# Parse the single JSON object
		try:
			data = json_module.loads(lines[0])
		except json_module.JSONDecodeError as e:
			log(f"Failed to parse yt-dlp JSON: {e}", file=sys.stderr)
			return None, [], [], []
		
		# Extract title - use the main title
		title = data.get('title', 'Unknown')
		
		# Extract tags and playlist items
		tags = []
		playlist_items = []
		
		# Tags from the top-level object go in FIRST so that, for the
		# single-value namespaces below, they take precedence over entry tags.
		if extract_ytdlp_tags:
			album_tags = extract_ytdlp_tags(data)
			tags.extend(album_tags)
		
		# Case 1: Entries are nested in the main object (standard playlist structure)
		if 'entries' in data and isinstance(data.get('entries'), list):
			entries = data['entries']
			# Build playlist items with title and duration
			for idx, entry in enumerate(entries, 1):
				if isinstance(entry, dict):
					item_title = entry.get('title', entry.get('id', f'Track {idx}'))
					item_duration = entry.get('duration', 0)
					playlist_items.append({
						'index': idx,
						'id': entry.get('id', f'track_{idx}'),
						'title': item_title,
						'duration': item_duration,
						'url': entry.get('url') or entry.get('webpage_url', ''),
					})
					
					# Merge entry tags into the top-level tags. Tags in a single-value
					# namespace (title:, artist:, ...) are skipped when that namespace
					# is already present; exact duplicates are always skipped.
					if extract_ytdlp_tags:
						entry_tags = extract_ytdlp_tags(entry)
						
						# Single-value namespaces that should not be duplicated from entries
						single_value_namespaces = {'title', 'artist', 'album', 'creator', 'channel', 'release_date', 'upload_date', 'license', 'location'}
						
						for tag in entry_tags:
							# Extract the namespace (part before the colon)
							tag_namespace = tag.split(':', 1)[0].lower() if ':' in tag else None
							
							# Skip if this namespace already exists in tags (from album level)
							if tag_namespace and tag_namespace in single_value_namespaces:
								# Check if any tag with this namespace already exists in tags
								already_has_namespace = any(
									t.split(':', 1)[0].lower() == tag_namespace 
									for t in tags if ':' in t
								)
								if already_has_namespace:
									continue  # Skip this tag, keep the album-level one
							
							if tag not in tags:  # Avoid exact duplicates
								tags.append(tag)
		
		# Case 2: no 'entries' but a positive playlist_count field.
		# A second yt-dlp call with --flat-playlist is used to list the entries;
		# it only fills playlist_items and does not add tags.
		elif (data.get('playlist_count') or 0) > 0 and 'entries' not in data:
			try:
				# Make a second call with --flat-playlist to get the actual tracks
				flat_cmd = [
					"yt-dlp",
					"-j",
					"--no-warnings",
					"--flat-playlist",
					"-f", "best",
					url
				]
				flat_result = subprocess.run(flat_cmd, capture_output=True, text=True, timeout=30)
				if flat_result.returncode == 0:
					flat_lines = flat_result.stdout.strip().split('\n')
					# Every stdout line that starts with '{' is parsed as one track;
					# lines that are not valid JSON are skipped.
					for idx, line in enumerate(flat_lines, 1):
						if line.strip().startswith('{'):
							try:
								entry = json_module.loads(line)
								item_title = entry.get('title', entry.get('id', f'Track {idx}'))
								item_duration = entry.get('duration', 0)
								playlist_items.append({
									'index': idx,
									'id': entry.get('id', f'track_{idx}'),
									'title': item_title,
									'duration': item_duration,
									'url': entry.get('url') or entry.get('webpage_url', ''),
								})
							except json_module.JSONDecodeError:
								pass
			except Exception as e:
				# Best effort: this also swallows a timeout of the second call
				pass  # Silently ignore if we can't get playlist entries
		

		# Fallback: if still no tags detected, retry extraction on the main object
		if not tags and extract_ytdlp_tags:
			tags = extract_ytdlp_tags(data)
		
		# Extract formats from the main data object
		formats = []
		if 'formats' in data:
			formats = _extract_url_formats(data.get('formats', []))
		
		# Deduplicate tags by namespace to prevent duplicate title:, artist:, etc.
		try:
			from metadata import dedup_tags_by_namespace as _dedup
			if _dedup:
				tags = _dedup(tags, keep_first=True)
		except Exception:
			pass  # If dedup fails, return tags as-is
		
		return title, tags, formats, playlist_items

	except subprocess.TimeoutExpired:
		# Raised by the first subprocess.run call when it exceeds timeout=30
		log("yt-dlp timeout (>30s)", file=sys.stderr)
		return None, [], [], []
	except Exception as e:
		log(f"URL scraping error: {e}", file=sys.stderr)
		return None, [], [], []


def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
	"""Extract best formats from yt-dlp formats list.
	
	Returns list of (display_label, format_id) tuples.
	"""
	try:
		video_formats = {}  # {resolution: format_data}
		audio_formats = {}  # {quality_label: format_data}
		
		for fmt in formats:
			vcodec = fmt.get('vcodec', 'none')
			acodec = fmt.get('acodec', 'none')
			height = fmt.get('height')
			ext = fmt.get('ext', 'unknown')
			format_id = fmt.get('format_id', '')
			tbr = fmt.get('tbr', 0)
			abr = fmt.get('abr', 0)
			
			# Video format
			if vcodec and vcodec != 'none' and height:
				if height < 480:
					continue
				res_key = f"{height}p"
				if res_key not in video_formats or tbr > video_formats[res_key].get('tbr', 0):
					video_formats[res_key] = {
						'label': f"{height}p ({ext})",
						'format_id': format_id,
						'tbr': tbr,
					}
			
			# Audio-only format
			elif acodec and acodec != 'none' and (not vcodec or vcodec == 'none'):
				audio_key = f"audio_{abr}"
				if audio_key not in audio_formats or abr > audio_formats[audio_key].get('abr', 0):
					audio_formats[audio_key] = {
						'label': f"audio ({ext})",
						'format_id': format_id,
						'abr': abr,
					}
		
		result = []
		
		# Add video formats in descending resolution order
		for res in sorted(video_formats.keys(), key=lambda x: int(x.replace('p', '')), reverse=True):
			fmt = video_formats[res]
			result.append((fmt['label'], fmt['format_id']))
		
		# Add best audio format
		if audio_formats:
			best_audio = max(audio_formats.values(), key=lambda x: x.get('abr', 0))
			result.append((best_audio['label'], best_audio['format_id']))
		
		return result
	
	except Exception as e:
		log(f"Error extracting formats: {e}", file=sys.stderr)
		return []


def _scrape_isbn_metadata(isbn: str) -> List[str]:
	"""Scrape metadata for an ISBN using the Open Library books API.

	Requests ``https://openlibrary.org/api/books`` with ``jscmd=data`` and
	turns the first returned record into tags: title, up to three authors,
	publish date, first publisher, description (first 200 characters), page
	count, and the first value of each known identifier.

	Args:
		isbn: ISBN-10 or ISBN-13; hyphens are removed before the lookup.

	Returns:
		The list of tags, or an empty list when the lookup fails or yields
		no data.
	"""
	new_tags: List[str] = []
	try:
		# The rest of this module imports ``helper`` as a top-level package, so
		# try that first; the package-relative form is kept as a fallback.
		try:
			from helper.http_client import HTTPClient
		except ImportError:
			from ..helper.http_client import HTTPClient
		import json as json_module

		isbn_clean = isbn.replace('-', '').strip()
		url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"

		try:
			with HTTPClient() as client:
				response = client.get(url)
				response.raise_for_status()
				data = json_module.loads(response.content.decode('utf-8'))
		except Exception as e:
			log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
			return []

		if not data:
			log(f"No ISBN metadata found for: {isbn}")
			return []

		# The response is keyed by bibkey; only one key was requested
		book_data = next(iter(data.values()), None)
		if not book_data or not isinstance(book_data, dict):
			return []

		if 'title' in book_data:
			new_tags.append(f"title:{book_data['title']}")

		# Authors/publishers are expected as {"name": ...} dicts; plain strings
		# are accepted as well, matching _scrape_openlibrary_metadata.
		if isinstance(book_data.get('authors'), list):
			for author in book_data['authors'][:3]:
				if isinstance(author, dict) and 'name' in author:
					new_tags.append(f"author:{author['name']}")
				elif isinstance(author, str):
					new_tags.append(f"author:{author}")

		if 'publish_date' in book_data:
			new_tags.append(f"publish_date:{book_data['publish_date']}")

		if isinstance(book_data.get('publishers'), list):
			for pub in book_data['publishers'][:1]:
				if isinstance(pub, dict) and 'name' in pub:
					new_tags.append(f"publisher:{pub['name']}")
				elif isinstance(pub, str):
					new_tags.append(f"publisher:{pub}")

		if 'description' in book_data:
			desc = book_data['description']
			if isinstance(desc, dict) and 'value' in desc:
				desc = desc['value']
			if desc:
				desc_str = str(desc).strip()
				# Limit to 200 chars to keep the tag manageable
				if desc_str:
					new_tags.append(f"description:{desc_str[:200]}")

		page_count = book_data.get('number_of_pages')
		if page_count and isinstance(page_count, int) and page_count > 0:
			new_tags.append(f"pages:{page_count}")

		identifiers = book_data.get('identifiers')
		if isinstance(identifiers, dict):
			# Each identifier is either a list (first value is used) or a string
			for key in ('openlibrary', 'lccn', 'oclc', 'goodreads', 'librarything', 'doi', 'internet_archive'):
				raw = identifiers.get(key)
				if isinstance(raw, list) and raw:
					new_tags.append(f"{key}:{raw[0]}")
				elif isinstance(raw, str):
					new_tags.append(f"{key}:{raw}")

		log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
		return new_tags
	except Exception as e:
		log(f"ISBN scraping error: {e}", file=sys.stderr)
		return []


def _scrape_openlibrary_metadata(olid: str) -> List[str]:
	"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.

	Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
	- Title, authors, publish date, publishers
	- Description (first 200 characters) and page count
	- Up to ten subjects as freeform tags (without namespace prefix)
	- Identifiers (ISBN, LCCN, OCLC, Goodreads)

	Args:
		olid: The book ID in any of the forms "OL9674499M", "9674499M" or
			"9674499", in any letter case.

	Returns:
		The list of tags, or an empty list when the lookup fails or yields
		no data.
	"""
	new_tags: List[str] = []
	try:
		# The rest of this module imports ``helper`` as a top-level package, so
		# try that first; the package-relative form is kept as a fallback.
		try:
			from helper.http_client import HTTPClient
		except ImportError:
			from ..helper.http_client import HTTPClient
		import json as json_module

		# Normalize to the canonical "OL<digits>M" form. Tag values are often
		# stored lower-case, which previously produced URLs like "OLol123mM".
		olid_norm = olid.strip().upper()
		core = olid_norm[2:] if olid_norm.startswith('OL') else olid_norm
		if core.endswith('M'):
			core = core[:-1]
		if core.isdigit():
			book_id = f"OL{core}M"
		elif olid_norm.startswith('OL'):
			# Not a plain edition ID; pass it through unchanged
			book_id = olid_norm
		else:
			book_id = f"OL{olid_norm}M"
		url = f"https://openlibrary.org/books/{book_id}.json"

		try:
			with HTTPClient() as client:
				response = client.get(url)
				response.raise_for_status()
				data = json_module.loads(response.content.decode('utf-8'))
		except Exception as e:
			log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
			return []

		if not data:
			log(f"No OpenLibrary metadata found for: {olid}")
			return []

		# Add title
		if 'title' in data:
			new_tags.append(f"title:{data['title']}")

		# Add authors (dicts with a name, or plain strings)
		if isinstance(data.get('authors'), list):
			for author in data['authors'][:3]:
				if isinstance(author, dict) and 'name' in author:
					new_tags.append(f"author:{author['name']}")
				elif isinstance(author, str):
					new_tags.append(f"author:{author}")

		# Add publish date
		if 'publish_date' in data:
			new_tags.append(f"publish_date:{data['publish_date']}")

		# Add the first publisher
		if isinstance(data.get('publishers'), list):
			for pub in data['publishers'][:1]:
				if isinstance(pub, dict) and 'name' in pub:
					new_tags.append(f"publisher:{pub['name']}")
				elif isinstance(pub, str):
					new_tags.append(f"publisher:{pub}")

		# Add description
		if 'description' in data:
			desc = data['description']
			if isinstance(desc, dict) and 'value' in desc:
				desc = desc['value']
			if desc:
				desc_str = str(desc).strip()
				if desc_str:
					new_tags.append(f"description:{desc_str[:200]}")

		# Add number of pages
		page_count = data.get('number_of_pages')
		if page_count and isinstance(page_count, int) and page_count > 0:
			new_tags.append(f"pages:{page_count}")

		# Add subjects as FREEFORM tags (no namespace prefix)
		if isinstance(data.get('subjects'), list):
			for subject in data['subjects'][:10]:
				if subject and isinstance(subject, str):
					subject_clean = subject.strip()
					if subject_clean and subject_clean not in new_tags:
						new_tags.append(subject_clean)

		# Add identifiers: (field name in the response, tag namespace).
		# Each value is either a list (first value is used) or a string.
		identifiers = data.get('identifiers')
		if not isinstance(identifiers, dict):
			identifiers = {}
		identifier_fields = (
			('isbn_10', 'isbn_10'),
			('isbn_13', 'isbn_13'),
			('lccn', 'lccn'),
			('oclc_numbers', 'oclc'),
			('goodreads', 'goodreads'),
		)
		for field_name, namespace in identifier_fields:
			raw = identifiers.get(field_name)
			if raw is None:
				# NOTE(review): edition records appear to carry isbn_10, isbn_13,
				# lccn and oclc_numbers as top-level fields rather than under
				# "identifiers" - confirm against the Open Library API docs.
				raw = data.get(field_name)
			if isinstance(raw, list) and raw:
				new_tags.append(f"{namespace}:{raw[0]}")
			elif isinstance(raw, str):
				new_tags.append(f"{namespace}:{raw}")

		log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
		return new_tags
	except Exception as e:
		log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
		return []


def _perform_scraping(tags_list: List[str]) -> List[str]:
	"""Scrape extra tags based on identifiers found in the existing tags.

	Priority order:
	1. openlibrary: (preferred - more complete metadata)
	2. isbn_13, then isbn_10, then isbn - used when there is no openlibrary
	   identifier or when the OpenLibrary lookup produced no tags

	Other recognised identifiers (musicbrainz, imdb, ...) are detected but
	not scraped here.

	Args:
		tags_list: The tags currently attached to the item.

	Returns:
		Scraped tags that are not already present (compared
		case-insensitively), without duplicates, in scrape order.
	"""
	identifiers = _extract_scrapable_identifiers(tags_list)

	if not identifiers:
		log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
		return []

	log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")

	new_tags: List[str] = []

	# Prefer OpenLibrary over ISBN (more complete metadata)
	olid = identifiers.get('openlibrary')
	if olid:
		log(f"Scraping OpenLibrary: {olid}")
		new_tags.extend(_scrape_openlibrary_metadata(olid))

	# Fall back to ISBN when OpenLibrary was unavailable or returned nothing
	if not new_tags:
		isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
		if isbn:
			log(f"Scraping ISBN: {isbn}")
			new_tags.extend(_scrape_isbn_metadata(isbn))

	# Non-string entries are tolerated by the identifier extraction above, so
	# they must not crash the comparison here either.
	existing_tags_lower = {tag.lower() for tag in tags_list if isinstance(tag, str)}
	scraped_unique: List[str] = []
	seen = set()
	for tag in new_tags:
		tag_lower = tag.lower()
		if tag_lower not in existing_tags_lower and tag_lower not in seen:
			scraped_unique.append(tag)
			seen.add(tag_lower)

	if scraped_unique:
		log(f"Added {len(scraped_unique)} new tag(s) from scraping")

	return scraped_unique


def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
	"""Get tags from Hydrus, local sidecar, or URL metadata.
	
	Usage: 
		get-tag [-hash <sha256>] [--store <key>] [--emit]
		get-tag -scrape <url|provider>
	
	Options:
		-hash <sha256>: Override hash to use instead of result's hash_hex
		--store <key>: Store result to this key for pipeline
		--emit: Emit result without interactive prompt (quiet mode)
		-scrape <url|provider>: Scrape metadata from URL or provider name (itunes, openlibrary, googlebooks)

	Args:
		result: Piped item (dict or object), or a list of items of which only
			the first is used.
		args: Raw argument tokens for the cmdlet.
		config: Configuration passed through to metadata providers and storage.

	Returns:
		0 on success, 1 on failure (the reason is logged to stderr).
	"""
	args_list = [str(arg) for arg in (args or [])]
	raw_args = list(args_list)

	# Support numeric selection tokens (e.g., "@1" leading to argument "1") without treating
	# them as hash overrides. This lets users pick from the most recent table overlay/results.
	if len(args_list) == 1:
		token = args_list[0]
		if not token.startswith("-") and token.isdigit():
			try:
				idx = int(token) - 1
				items_pool = ctx.get_last_result_items()
				if 0 <= idx < len(items_pool):
					result = items_pool[idx]
					args_list = []
					debug(f"[get_tag] Resolved numeric selection arg {token} -> last_result_items[{idx}]")
				else:
					debug(f"[get_tag] Numeric selection arg {token} out of range (items={len(items_pool)})")
			except Exception as exc:
				debug(f"[get_tag] Failed to resolve numeric selection arg {token}: {exc}")

	# Handle @N selection which creates a list - extract the first item.
	# This happens before scrape mode so provider scraping can read the item's
	# title/tags/hash instead of looking them up on the list object itself.
	if isinstance(result, list) and len(result) > 0:
		result = result[0]

	# Helper to get field from both dict and object
	def get_field(obj: Any, field: str, default: Any = None) -> Any:
		if isinstance(obj, dict):
			return obj.get(field, default)
		else:
			return getattr(obj, field, default)

	# Helper to turn a str/bytes tag into text; str(b"x") would give "b'x'"
	def _tag_text(value: Any) -> str:
		if isinstance(value, bytes):
			return value.decode("utf-8", errors="replace")
		return str(value)
	
	# Parse arguments using shared parser
	parsed_args = parse_cmdlet_args(args_list, CMDLET)

	# Detect if -scrape flag was provided without a value (parse_cmdlet_args skips missing values)
	scrape_flag_present = any(str(arg).lower() in {"-scrape", "--scrape"} for arg in args_list)

	# Extract values
	hash_override_raw = parsed_args.get("hash")
	hash_override = normalize_hash(hash_override_raw)
	store_key = parsed_args.get("store")
	emit_requested = parsed_args.get("emit", False)
	scrape_url = parsed_args.get("scrape")
	scrape_requested = scrape_flag_present or scrape_url is not None

	explicit_hash_flag = any(str(arg).lower() in {"-hash", "--hash"} for arg in raw_args)
	if hash_override_raw is not None:
		if not hash_override or not looks_like_hash(hash_override):
			debug(f"[get_tag] Ignoring invalid hash override '{hash_override_raw}' (explicit_flag={explicit_hash_flag})")
			if explicit_hash_flag:
				log("Invalid hash format: expected 64 hex characters", file=sys.stderr)
				return 1
			hash_override = None

	# From here on, scrape_requested implies a non-blank scrape_url.
	if scrape_requested and (not scrape_url or str(scrape_url).strip() == ""):
		log("-scrape requires a URL or provider name", file=sys.stderr)
		return 1
	
	# Handle URL or provider scraping mode
	if scrape_requested and scrape_url:
		import json as json_module

		if scrape_url.startswith(("http://", "https://")):
			# URL scraping (existing behavior)
			title, tags, formats, playlist_items = _scrape_url_metadata(scrape_url)
			if not tags:
				log("No tags extracted from URL", file=sys.stderr)
				return 1
			output = {
				"title": title,
				"tags": tags,
				"formats": [(label, fmt_id) for label, fmt_id in formats],
				"playlist_items": playlist_items,
			}
			print(json_module.dumps(output, ensure_ascii=False))
			return 0
		
		# Provider scraping (e.g., itunes)
		provider = get_metadata_provider(scrape_url, config)
		if provider is None:
			log(f"Unknown metadata provider: {scrape_url}", file=sys.stderr)
			return 1
		
		# Prefer identifier tags (ISBN/OLID/etc.) when available; fallback to title/filename
		identifier_tags: List[str] = []
		result_tags = get_field(result, "tags", None)
		if isinstance(result_tags, list):
			identifier_tags = [_tag_text(t) for t in result_tags if isinstance(t, (str, bytes))]
		
		# Try local sidecar if no tags present on result
		if not identifier_tags:
			file_path = get_field(result, "target", None) or get_field(result, "path", None) or get_field(result, "filename", None)
			if isinstance(file_path, str) and file_path and not file_path.lower().startswith(("http://", "https://")):
				# Best effort: an unreadable sidecar just means no identifier tags
				try:
					media_path = Path(str(file_path))
					if media_path.exists():
						tags_from_sidecar = read_sidecar(media_path)
						if isinstance(tags_from_sidecar, list):
							identifier_tags = [_tag_text(t) for t in tags_from_sidecar if isinstance(t, (str, bytes))]
				except Exception as exc:
					debug(f"[get_tag] Failed to read sidecar tags for {file_path}: {exc}")

		title_from_tags = _extract_tag_value(identifier_tags, "title")
		artist_from_tags = _extract_tag_value(identifier_tags, "artist")
		
		identifiers = _extract_scrapable_identifiers(identifier_tags)
		identifier_query: Optional[str] = None
		if identifiers:
			if provider.name in {"openlibrary", "googlebooks", "google"}:
				identifier_query = identifiers.get("isbn_13") or identifiers.get("isbn_10") or identifiers.get("isbn") or identifiers.get("openlibrary")
			elif provider.name == "itunes":
				identifier_query = identifiers.get("musicbrainz") or identifiers.get("musicbrainzalbum")
		
		# Determine query from identifier first, else title on the result or filename
		title_hint = title_from_tags or get_field(result, "title", None) or get_field(result, "name", None)
		if not title_hint:
			file_path = get_field(result, "path", None) or get_field(result, "filename", None)
			if file_path:
				title_hint = Path(str(file_path)).stem
		artist_hint = artist_from_tags or get_field(result, "artist", None) or get_field(result, "uploader", None)
		if not artist_hint:
			meta_field = get_field(result, "metadata", None)
			if isinstance(meta_field, dict):
				meta_artist = meta_field.get("artist") or meta_field.get("uploader")
				if meta_artist:
					artist_hint = str(meta_artist)

		# Title+artist query is only built for the music providers
		combined_query: Optional[str] = None
		if not identifier_query and title_hint and artist_hint and provider.name in {"itunes", "musicbrainz"}:
			if provider.name == "musicbrainz":
				combined_query = f'recording:"{title_hint}" AND artist:"{artist_hint}"'
			else:
				combined_query = f"{title_hint} {artist_hint}"
		
		query_hint = identifier_query or combined_query or title_hint
		if not query_hint:
			log("No title or identifier available to search for metadata", file=sys.stderr)
			return 1
		
		if identifier_query:
			log(f"Using identifier for metadata search: {identifier_query}")
		elif combined_query:
			log(f"Using title+artist for metadata search: {title_hint} - {artist_hint}")
		else:
			log(f"Using title for metadata search: {query_hint}")

		items = provider.search(query_hint, limit=10)
		if not items:
			log("No metadata results found", file=sys.stderr)
			return 1
		
		from result_table import ResultTable
		table = ResultTable(f"Metadata: {provider.name}")
		table.set_source_command("get-tag", [])
		selection_payload = []
		hash_for_payload = normalize_hash(hash_override) or normalize_hash(get_field(result, "hash_hex", None))
		for idx, item in enumerate(items):
			tags = provider.to_tags(item)
			row = table.add_row()
			row.add_column("Title", item.get("title", ""))
			row.add_column("Artist", item.get("artist", ""))
			row.add_column("Album", item.get("album", ""))
			row.add_column("Year", item.get("year", ""))
			payload = {
				"tags": tags,
				"provider": provider.name,
				"title": item.get("title"),
				"artist": item.get("artist"),
				"album": item.get("album"),
				"year": item.get("year"),
				"extra": {
					"tags": tags,
					"provider": provider.name,
					"hydrus_hash": hash_for_payload,
					"storage_source": get_field(result, "source", None) or get_field(result, "origin", None),
				},
				"file_hash": hash_for_payload,
			}
			selection_payload.append(payload)
			# Row selection args are 1-based to match what the user types
			table.set_row_selection_args(idx, [str(idx + 1)])

		ctx.set_last_result_table_overlay(table, selection_payload)
		ctx.set_current_stage_table(table)
		# Preserve items for @ selection and downstream pipes without emitting duplicates
		ctx.set_last_result_items_only(selection_payload)
		print(table)
		return 0
	
	hash_from_result = normalize_hash(get_field(result, "hash_hex", None))
	hash_hex = hash_override or hash_from_result
	# Only use emit mode if explicitly requested with --emit flag, not just because we're in a pipeline
	# This allows interactive REPL to work even in pipelines
	emit_mode = emit_requested or bool(store_key)
	store_label = (store_key.strip() if store_key and store_key.strip() else None)
	
	# Get hash and store from result
	file_hash = hash_hex
	storage_source = get_field(result, "store") or get_field(result, "storage") or get_field(result, "origin")
	
	if not file_hash:
		log("No hash available in result", file=sys.stderr)
		return 1
	
	if not storage_source:
		log("No storage backend specified in result", file=sys.stderr)
		return 1
	
	# Resolve the storage backend. Kept separate from the tag fetch so that only
	# a failed backend lookup is reported as "not found".
	try:
		from helper.store import FileStorage
		storage = FileStorage(config)
		backend = storage[storage_source]
	except KeyError:
		log(f"Storage backend '{storage_source}' not found", file=sys.stderr)
		return 1
	except Exception as exc:
		log(f"Failed to get tags: {exc}", file=sys.stderr)
		return 1

	# Get tags using storage backend
	try:
		current, source = backend.get_tag(file_hash, config=config)
	except Exception as exc:
		log(f"Failed to get tags: {exc}", file=sys.stderr)
		return 1

	if not current:
		log("No tags found", file=sys.stderr)
		return 1

	service_name = ""
	
	# Local file path of the subject, when the result carries one. This name was
	# previously read below without ever being assigned (NameError).
	# NOTE(review): taken from the result's "path"/"file_path" fields - confirm
	# these are the fields upstream cmdlets populate.
	local_path: Optional[str] = None
	path_candidate = get_field(result, "path", None) or get_field(result, "file_path", None)
	if path_candidate and not str(path_candidate).lower().startswith(("http://", "https://")):
		local_path = str(path_candidate)

	# Always output to ResultTable (pipeline mode only)
	# Extract title for table header
	item_title = get_field(result, "title", None) or get_field(result, "name", None) or get_field(result, "filename", None)

	# Build a subject payload representing the file whose tags are being shown
	subject_origin = get_field(result, "origin", None) or get_field(result, "source", None) or source
	subject_payload: Dict[str, Any] = {
		"tags": list(current),
		"title": item_title,
		"name": item_title,
		"origin": subject_origin,
		"source": subject_origin,
		"storage_source": subject_origin,
		"service_name": service_name,
		"extra": {
			"tags": list(current),
			"storage_source": subject_origin,
			"hydrus_hash": hash_hex,
		},
	}
	if hash_hex:
		subject_payload.update({
			"hash": hash_hex,
			"hash_hex": hash_hex,
			"file_hash": hash_hex,
			"hydrus_hash": hash_hex,
		})
	if local_path:
		subject_payload.update({
			"file_path": local_path,
			"path": local_path,
			"target": local_path,
		})
		subject_payload["extra"]["file_path"] = local_path
	
	if source == "hydrus":
		_emit_tags_as_table(current, hash_hex=hash_hex, source="hydrus", service_name=service_name, config=config, item_title=item_title, subject=subject_payload)
	else:
		_emit_tags_as_table(current, hash_hex=hash_hex, source="local", service_name=None, config=config, item_title=item_title, file_path=local_path, subject=subject_payload)
	
	# If emit requested or store key provided, emit payload
	if emit_mode:
		_emit_tag_payload(source, current, hash_value=hash_hex, store_label=store_label)

	return 0


# Provider names offered as choices for the -scrape argument. The names come
# from list_metadata_providers(); if that call raises, a fixed list is used.
try:
	_SCRAPE_CHOICES = sorted(list_metadata_providers().keys())
except Exception:
	_SCRAPE_CHOICES = ["itunes", "openlibrary", "googlebooks", "google", "musicbrainz"]


class Get_Tag(Cmdlet):
	"""Class-based get-tag cmdlet with self-registration.

	Constructing an instance declares the cmdlet's name, arguments and help
	text, then calls ``self.register()``.
	"""

	def __init__(self) -> None:
		"""Initialize get-tag cmdlet."""
		super().__init__(
			name="get-tag",
			summary="Get tags from Hydrus or local sidecar metadata",
			usage="get-tag [-hash <sha256>] [--store <key>] [--emit] [-scrape <url|provider>]",
			alias=["tags"],
			arg=[
				SharedArgs.HASH,
				CmdletArg(
					name="-store",
					type="string",
					description="Store result to this key for pipeline",
					alias="store"
				),
				CmdletArg(
					name="-emit",
					type="flag",
					description="Emit result without interactive prompt (quiet mode)",
					alias="emit-only"
				),
				CmdletArg(
					name="-scrape",
					type="string",
					description="Scrape metadata from URL or provider name (returns tags as JSON or table)",
					required=False,
					choices=_SCRAPE_CHOICES,
				)
			],
			detail=[
				"- Retrieves tags for a file from:",
				"    Hydrus: Using file hash if available",
				"    Local: From sidecar files or local library database",
				"- Options:",
				"    -hash: Override hash to look up in Hydrus",
				"    -store: Store result to key for downstream pipeline",
				"    -emit: Quiet mode (no interactive selection)",
				"    -scrape: Scrape metadata from URL or metadata provider",
			],
			exec=self.run,
		)
		self.register()

	def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
		"""Execute get-tag cmdlet.

		Looks up the tags for a file hash in a storage backend and emits them
		as a table.

		Args:
			result: Piped item (dict or object) providing hash/store fields.
			args: Raw argument tokens for the cmdlet.
			config: Configuration passed to the storage layer.

		Returns:
			0 on success, 1 on failure (the reason is logged to stderr).

		NOTE(review): only the parsed "hash" and "store" values are read here;
		the declared -emit and -scrape arguments are not consulted by this
		method - confirm that is intended.
		"""
		# Parse arguments
		parsed = parse_cmdlet_args(args, self)
		
		# Get hash and store from parsed args or result
		hash_override = parsed.get("hash")
		file_hash = hash_override or get_field(result, "hash") or get_field(result, "file_hash") or get_field(result, "hash_hex")
		storage_source = parsed.get("store") or get_field(result, "store") or get_field(result, "storage") or get_field(result, "origin")
		
		if not file_hash:
			log("No hash available in result", file=sys.stderr)
			return 1
		
		if not storage_source:
			log("No storage backend specified in result", file=sys.stderr)
			return 1
		
		# Resolve the storage backend on its own, so that only a failed backend
		# lookup is reported as "not found" (a KeyError raised later while
		# fetching or rendering tags is a different failure).
		try:
			from helper.store import FileStorage
			storage_obj = FileStorage(config)
			backend = storage_obj[storage_source]
		except KeyError:
			log(f"Storage backend '{storage_source}' not found", file=sys.stderr)
			return 1
		except Exception as exc:
			log(f"Failed to get tags: {exc}", file=sys.stderr)
			import traceback
			traceback.print_exc(file=sys.stderr)
			return 1

		# Get tags using storage backend, then build table and emit
		try:
			current, source = backend.get_tag(file_hash, config=config)
			
			if not current:
				log("No tags found", file=sys.stderr)
				return 1
			
			# Fall back to a hash prefix when the result has no title
			item_title = get_field(result, "title") or file_hash[:16]
			_emit_tags_as_table(
				tags_list=current,
				hash_hex=file_hash,
				source=source,
				service_name="",
				config=config,
				item_title=item_title,
				file_path=None,
				subject=result,
			)
			return 0
		except Exception as exc:
			log(f"Failed to get tags: {exc}", file=sys.stderr)
			import traceback
			traceback.print_exc(file=sys.stderr)
			return 1


# Create and register the cmdlet at import time: Get_Tag.__init__ ends by
# calling self.register(). This instance is also what _run() passes to
# parse_cmdlet_args() as the argument specification.
CMDLET = Get_Tag()