dfdfsdd
metadata.py (726 changed lines)
@@ -3,14 +3,12 @@ import re
import subprocess
import sys
import shutil
import sqlite3
import requests
from SYS.logger import log, debug
from urllib.parse import urlsplit, urlunsplit, unquote
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from models import PipeObject, FileRelationshipTracker, _get_file_hash
from models import FileRelationshipTracker
try:
    import musicbrainzngs # type: ignore
except ImportError: # pragma: no cover
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
    return variants


def normalize_urls(value: Any) -> List[str]:
    """Normalize a URL field into a stable, deduplicated list.

    Accepts:
    - None
    - a single URL string (optionally containing multiple URLs)
    - a list/tuple/set of URL strings

    This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
    """

    def _iter_raw_urls(raw: Any) -> Iterable[str]:
        if raw is None:
            return

        if isinstance(raw, str):
            text = raw.strip()
            if not text:
                return
            # Support legacy prefixes like "url:https://...".
            if text.lower().startswith("url:"):
                text = text.split(":", 1)[1].strip()

            # Prefer extracting obvious URLs to avoid splitting inside query strings.
            matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
            if matches:
                for m in matches:
                    yield m
                return

            # Fallback: split on commas/whitespace.
            for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
                if token:
                    yield token
            return

        if isinstance(raw, (list, tuple, set)):
            for item in raw:
                if item is None:
                    continue
                if isinstance(item, str):
                    if item.strip():
                        yield item
                else:
                    text = str(item).strip()
                    if text:
                        yield text
            return

        # Last resort: string-coerce.
        text = str(raw).strip()
        if text:
            yield text

    def _canonicalize(url_text: str) -> Optional[str]:
        u = str(url_text or "").strip()
        if not u:
            return None

        # Trim common wrappers and trailing punctuation.
        u = u.strip("<>\"' ")
        u = u.rstrip(")].,;\"")
        if not u:
            return None

        lower = u.lower()
        if not (lower.startswith("http://") or lower.startswith("https://")):
            return u

        try:
            parsed = urlsplit(u)
        except Exception:
            return u

        scheme = (parsed.scheme or "").lower()
        netloc = (parsed.netloc or "").lower()
        path = unquote(parsed.path or "")
        query = parsed.query or ""

        # Normalize default ports.
        if scheme == "http" and netloc.endswith(":80"):
            netloc = netloc[:-3]
        elif scheme == "https" and netloc.endswith(":443"):
            netloc = netloc[:-4]

        # Prefer no trailing slash except root.
        if path and path != "/":
            path = path.rstrip("/")

        # Fragments are not part of the resource.
        return urlunsplit((scheme, netloc, path, query, ""))

    seen: Set[str] = set()
    out: List[str] = []
    for raw_url in _iter_raw_urls(value):
        canonical = _canonicalize(raw_url)
        if not canonical:
            continue
        if canonical in seen:
            continue
        seen.add(canonical)
        out.append(canonical)

    return out
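A quick usage sketch (not part of the commit) of how normalize_urls behaves per the rules above; the input string is hypothetical:

# Hypothetical input mixing a legacy "url:" prefix, a default port, a fragment and a duplicate.
raw = "url:https://Example.com:443/path/?q=1#frag, https://example.com/path?q=1"
print(normalize_urls(raw))
# should yield ['https://example.com/path?q=1'] - host lowercased, default port,
# trailing slash and fragment dropped, duplicates removed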


def value_normalize(value: str) -> str:
    """Normalize whitespace: collapse internal spaces, strip, remove newlines."""
    value = value.replace("\n", " ").replace("\r", " ")
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
            continue

        # Ensure file entry exists
        file_id: Optional[int] = None
        try:
            cursor = db.connection.cursor() if db.connection else None
            if cursor:
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
        try:
            cursor = db.connection.cursor() if db.connection else None
            if cursor:
                file_hash_value: Optional[str] = None
                if hasattr(db, 'get_file_hash'):
                    try:
                        file_hash_value = db.get_file_hash(file_id)
                    except Exception:
                        file_hash_value = None
                for tag in tags:
                    cursor.execute(
                        'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
                        (file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
                        (file_hash_value, tag)
                    )
                db.connection.commit()
        except Exception:
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
    return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}


def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
    """Fetch metadata tags from OpenLibrary.

    Args:
        ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)

    Returns:
        Dictionary with 'tag' key containing list of extracted tags
    """
    import urllib.request

    # Normalize OL ID
    ol_id = ol_id.strip().upper()
    if not ol_id.startswith('OL'):
        ol_id = f'OL{ol_id}'

    # Fetch from OpenLibrary API
    url = f"https://openlibrary.org/books/{ol_id}.json"
    tags: List[str] = []

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read().decode('utf-8'))
    except Exception as e:
        raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")

    # Add OpenLibrary ID tag
    _add_tag(tags, "openlibrary", ol_id)

    # Extract title
    _add_tag(tags, "title", data.get("title"))

    # Extract subtitle if present
    if data.get("subtitle"):
        _add_tag(tags, "subtitle", data["subtitle"])

    # Extract authors
    authors = data.get("authors", [])
    author_names: List[str] = []
    for author in authors:
        if isinstance(author, dict):
            name = author.get("name")
        else:
            name = str(author)
        if name:
            author_names.append(name)
    if author_names:
        _extend_tags(tags, "author", author_names)

    # Extract publication details
    if data.get("publish_date"):
        _add_tag(tags, "publish_date", data["publish_date"])
        # Extract year if present
        year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
        if year_match:
            _add_tag(tags, "year", year_match.group(1))

    # Extract publishers
    publishers = data.get("publishers", [])
    if publishers:
        publisher_names = []
        for pub in publishers:
            if isinstance(pub, dict):
                name = pub.get("name")
            else:
                name = str(pub)
            if name:
                publisher_names.append(name)
        if publisher_names:
            _extend_tags(tags, "publisher", publisher_names)

    # Extract languages
    languages = data.get("languages", [])
    if languages:
        lang_codes = []
        for lang in languages:
            if isinstance(lang, dict):
                code = lang.get("key", "").split("/")[-1]
            else:
                code = str(lang).split("/")[-1]
            if code and code != "":
                lang_codes.append(code)
        if lang_codes:
            _extend_tags(tags, "language", lang_codes)

    # Extract ISBN
    isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
    if isbns:
        for isbn in isbns[:1]: # Just take first one
            if len(str(isbn)) == 10:
                _add_tag(tags, "isbn_10", isbn)
            elif len(str(isbn)) == 13:
                _add_tag(tags, "isbn_13", isbn)

    # Extract page count
    _add_tag(tags, "pages", data.get("number_of_pages"))

    # Extract genres/subjects (OpenLibrary calls them subjects)
    # Subjects are added as plain freeform tags (no namespace prefix)
    subjects = data.get("subjects", [])
    if subjects:
        for subject in subjects[:10]: # Limit to 10 subjects
            if isinstance(subject, dict):
                name = subject.get("name")
            else:
                name = str(subject)
            if name:
                # Add subject as plain tag without "subject:" prefix
                normalized = value_normalize(str(name))
                if normalized:
                    tags.append(normalized)

    # Extract OpenLibrary description
    description = data.get("description")
    if description:
        if isinstance(description, dict):
            description = description.get("value")
        _add_tag(tags, "summary", description)

    return {"source": "openlibrary", "id": ol_id, "tag": tags}
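For reference, a brief usage sketch (not part of the commit) of the fetch_openlibrary_tags helper shown in this hunk; the OLID is hypothetical and a live network call is made:

# Hypothetical OpenLibrary edition id.
result = fetch_openlibrary_tags("OL7353617M")
print(result["source"], result["id"])
for t in result["tag"]:
    print(t)  # namespaced tags such as title/author/publisher plus plain subject tags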


def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
    """Append a single value if not already in seen set (deduplication)."""
    if value is None:
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
    return preferred


def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
    """Read hash, tags, and url from sidecar file.

    Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]

    hash_value: Optional[str] = None
    tags: List[str] = []
    url: List[str] = []
    urls: List[str] = []

    for raw_line in raw.splitlines():
        line = raw_line.strip()
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
            url_part = line.split(':', 1)[1].strip() if ':' in line else ''
            if url_part:
                for url_segment in url_part.split(','):
                    for url in url_segment.split():
                        url_clean = url.strip()
                        if url_clean and url_clean not in url:
                            url.append(url_clean)
                    for url_token in url_segment.split():
                        url_clean = url_token.strip()
                        if url_clean and url_clean not in urls:
                            urls.append(url_clean)
        else:
            # Everything else is a tag (including relationship: lines)
            tags.append(line)

    return hash_value, tags, url
    return hash_value, tags, urls
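A sketch of the sidecar layout this parser expects (contents below are hypothetical; the hash: line is handled above this hunk): url: lines are collected into the urls list, every other non-hash line is kept as a tag, and with the rename above the function returns (hash_value, tags, urls).

# Hypothetical sidecar contents:
#   hash:0123456789abcdef0123456789abcdef
#   url:https://example.com/a, https://example.com/b
#   title:Example Title
#   creator:Example Creator
# Expected result from _read_sidecar_metadata for such a file:
#   urls == ["https://example.com/a", "https://example.com/b"]
#   tags == ["title:Example Title", "creator:Example Creator"]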
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
    return destination


def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
    roots: List[Path] = []
    for key in ('paths', 'search_paths', 'roots', 'directories'):
        raw = payload.get(key)
        if not raw:
            continue
        entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
        for entry in entries:
            if not entry:
                continue
            try:
                candidate = Path(str(entry)).expanduser()
            except Exception:
                continue
            roots.append(candidate)
    if load_config is not None and resolve_output_dir is not None:
        try:
            config = load_config()
        except Exception:
            config = None
        if isinstance(config, dict) and config:
            try:
                default_root = resolve_output_dir(config)
            except Exception:
                default_root = None
            if default_root is not None:
                roots.append(default_root)
    return roots


def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
    target = f'hash:{hash_value.strip().lower()}'
    for root in roots:
        try:
            root_path = root.expanduser()
        except Exception:
            continue
        if not root_path.exists() or not root_path.is_dir():
            continue
        for pattern in ('*.tag',):
            try:
                iterator = root_path.rglob(pattern)
            except OSError:
                continue
            for candidate in iterator:
                if not candidate.is_file():
                    continue
                try:
                    with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
                        for line in handle:
                            if line.strip().lower() == target:
                                return candidate
                except OSError:
                    continue
    return None


def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
    path_value = payload.get('path')
    if not path_value:
@@ -2506,8 +2438,8 @@ def write_tags_to_file(

    # Add known url if provided - each on separate line to prevent corruption
    if url:
        for url in url:
            content_lines.append(f"url:{url}")
        for url_item in url:
            content_lines.append(f"url:{url_item}")

    # Add tags
    if tags:
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
    tag = payload.get('tag')
    if not isinstance(tag, str):
        return {'tags': []}
        return {'tag': []}
    trimmed = value_normalize(tag)
    if not trimmed:
        return {'tags': []}
        return {'tag': []}
    request = detect_metadata_request(trimmed)
    tags: List[str] = []
    seen: Set[str] = set()
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
        _append_unique(tags, seen, request['base'])
    else:
        _append_unique(tags, seen, trimmed)
        return {'tags': tags}
        return {'tag': tags}
    try:
        if request['source'] == 'imdb':
            data = imdb_tag(request['id'])
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
    except Exception as exc: # pragma: no cover - network/service errors
        return {'tag': tags, 'error': str(exc)}
    # Add tags from fetched data (no namespace, just unique append)
    for tag in (data.get('tag') or []):
        _append_unique(tags, seen, tag)
    raw_tags = data.get('tag') if isinstance(data, dict) else None
    if isinstance(raw_tags, str):
        tag_iter: Iterable[str] = [raw_tags]
    elif isinstance(raw_tags, (list, tuple, set)):
        tag_iter = [t for t in raw_tags if isinstance(t, str)]
    else:
        tag_iter = []
    for tag_value in tag_iter:
        _append_unique(tags, seen, tag_value)
    result = {
        'tag': tags,
        'source': request['source'],
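A hedged usage sketch (not part of the commit) of expand_metadata_tag with the payload shape used above; the identifier is hypothetical and it is assumed detect_metadata_request recognizes an "imdb:" style tag:

payload = {'tag': 'imdb:tt0111161'}  # hypothetical identifier
result = expand_metadata_tag(payload)
# On success the result carries {'tag': [...], 'source': 'imdb', ...}.
# For non-string or empty input the function now returns {'tag': []}
# (the key is 'tag', not 'tags', after this change).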
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
    # Load adjective.json from workspace root
    adjective_path = Path(__file__).parent / "adjective.json"
    if not adjective_path.exists():
        log.debug(f"adjective.json not found at {adjective_path}")
        debug(f"adjective.json not found at {adjective_path}")
        return tags_set

    try:
        with open(adjective_path, 'r') as f:
            adjective_lists = json.load(f)
    except Exception as e:
        log.error(f"Error loading adjective.json: {e}")
        debug(f"Error loading adjective.json: {e}")
        return tags_set

    expanded_tags = set()
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
            if matched_list:
                # Add all tags from the list
                expanded_tags.update(matched_list)
                log.info(f"Expanded {tag} to {len(matched_list)} tags")
                debug(f"Expanded {tag} to {len(matched_list)} tags")
            else:
                # List not found, log warning but don't add the reference
                log.warning(f"Tag list '{list_name}' not found in adjective.json")
                debug(f"Tag list '{list_name}' not found in adjective.json")
        else:
            # Regular tag, keep as is
            expanded_tags.add(tag)
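A minimal sketch of the intended expansion (not part of the commit): adjective.json is assumed to map list names to lists of tags, and the exact reference syntax checked earlier in the function is not shown in this hunk, so the marker below is only a stand-in.

# adjective.json (hypothetical): {"colors": ["red", "green", "blue"]}
# A tag recognized as a reference to the "colors" list would be replaced by its
# members, while ordinary tags pass through unchanged:
#   expand_tag_lists({"<colors list reference>", "title:example"})
#     -> {"red", "green", "blue", "title:example"}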
@@ -3194,98 +3133,6 @@ def build_book_tags(
    return deduped


def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
    """Fetch book metadata from OpenLibrary and return as tags.

    Args:
        isbn: ISBN number (with or without isbn: prefix)
        olid: OpenLibrary ID

    Returns:
        List of tags extracted from OpenLibrary metadata
    """
    metadata_tags = []

    # Try OLID first (preferred), then ISBN
    url = None

    if olid:
        # Clean up OLID format
        olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
        if olid_clean.isdigit():
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid}.json"
    elif isbn:
        # Clean up ISBN
        isbn_clean = str(isbn).replace('isbn:', '').strip()
        url = f"https://openlibrary.org/isbn/{isbn_clean}.json"

    if not url:
        return metadata_tags

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return metadata_tags

        data = response.json()
        if not data:
            return metadata_tags

        # Extract title
        if 'title' in data:
            metadata_tags.append(f"title:{data['title']}")

        # Extract authors
        if 'authors' in data and isinstance(data['authors'], list):
            for author in data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    metadata_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    metadata_tags.append(f"author:{author}")

        # Extract publish date
        if 'publish_date' in data:
            metadata_tags.append(f"publish_date:{data['publish_date']}")

        # Extract publishers
        if 'publishers' in data and isinstance(data['publishers'], list):
            for pub in data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    metadata_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    metadata_tags.append(f"publisher:{pub}")

        # Extract number of pages
        if 'number_of_pages' in data:
            page_count = data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                metadata_tags.append(f"pages:{page_count}")

        # Extract language
        if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
            lang = data['languages'][0]
            if isinstance(lang, dict) and 'key' in lang:
                lang_code = lang['key'].split('/')[-1]
                metadata_tags.append(f"language:{lang_code}")
            elif isinstance(lang, str):
                metadata_tags.append(f"language:{lang}")

        # Extract subjects as freeform tags (limit to 5)
        if 'subjects' in data and isinstance(data['subjects'], list):
            for subject in data['subjects'][:5]:
                if subject and isinstance(subject, str):
                    subject_clean = str(subject).strip()
                    if subject_clean:
                        metadata_tags.append(subject_clean)

    except Exception as e:
        debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")

    return metadata_tags


def enrich_playlist_entries(entries: list, extractor: str) -> list:
    """Enrich playlist entries with full metadata by fetching individual entry info.
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
        if entry_url and is_url_supported_by_ytdlp(entry_url):
            try:
                import yt_dlp
                ydl_opts = {
                ydl_opts: Any = {
                    "quiet": True,
                    "no_warnings": True,
                    "skip_download": True,
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
    return []


def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata for an ISBN using Open Library API."""
    new_tags = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        isbn_clean = isbn.replace('-', '').strip()
        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        book_data = next(iter(data.values()), None)
        if not book_data:
            return []

        if 'title' in book_data:
            new_tags.append(f"title:{book_data['title']}")

        if 'authors' in book_data and isinstance(book_data['authors'], list):
            for author in book_data['authors'][:3]:
                if 'name' in author:
                    new_tags.append(f"author:{author['name']}")

        if 'publish_date' in book_data:
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        if 'publishers' in book_data and isinstance(book_data['publishers'], list):
            for pub in book_data['publishers'][:1]:
                if 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")

        if 'description' in book_data:
            desc = book_data['description']
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                # Include description if available (limit to 200 chars to keep it manageable)
                if len(desc_str) > 0:
                    new_tags.append(f"description:{desc_str[:200]}")

        if 'number_of_pages' in book_data:
            page_count = book_data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
            identifiers = book_data['identifiers']

            if 'openlibrary' in identifiers:
                ol_ids = identifiers['openlibrary']
                if isinstance(ol_ids, list) and ol_ids:
                    new_tags.append(f"openlibrary:{ol_ids[0]}")
                elif isinstance(ol_ids, str):
                    new_tags.append(f"openlibrary:{ol_ids}")

            if 'lccn' in identifiers:
                lccn_list = identifiers['lccn']
                if isinstance(lccn_list, list) and lccn_list:
                    new_tags.append(f"lccn:{lccn_list[0]}")
                elif isinstance(lccn_list, str):
                    new_tags.append(f"lccn:{lccn_list}")

            if 'oclc' in identifiers:
                oclc_list = identifiers['oclc']
                if isinstance(oclc_list, list) and oclc_list:
                    new_tags.append(f"oclc:{oclc_list[0]}")
                elif isinstance(oclc_list, str):
                    new_tags.append(f"oclc:{oclc_list}")

            if 'goodreads' in identifiers:
                goodreads_list = identifiers['goodreads']
                if isinstance(goodreads_list, list) and goodreads_list:
                    new_tags.append(f"goodreads:{goodreads_list[0]}")
                elif isinstance(goodreads_list, str):
                    new_tags.append(f"goodreads:{goodreads_list}")

            if 'librarything' in identifiers:
                lt_list = identifiers['librarything']
                if isinstance(lt_list, list) and lt_list:
                    new_tags.append(f"librarything:{lt_list[0]}")
                elif isinstance(lt_list, str):
                    new_tags.append(f"librarything:{lt_list}")

            if 'doi' in identifiers:
                doi_list = identifiers['doi']
                if isinstance(doi_list, list) and doi_list:
                    new_tags.append(f"doi:{doi_list[0]}")
                elif isinstance(doi_list, str):
                    new_tags.append(f"doi:{doi_list}")

            if 'internet_archive' in identifiers:
                ia_list = identifiers['internet_archive']
                if isinstance(ia_list, list) and ia_list:
                    new_tags.append(f"internet_archive:{ia_list[0]}")
                elif isinstance(ia_list, str):
                    new_tags.append(f"internet_archive:{ia_list}")

        log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags
    except Exception as e:
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []


def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata for an OpenLibrary ID using the .json API endpoint.

    Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
    - Title, authors, publish date, publishers
    - Description
    - Subjects as freeform tags (without namespace prefix)
    - Identifiers (ISBN, LCCN, OCLC, etc.)
    """
    new_tags = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        # Format: OL9674499M or just 9674499M
        olid_clean = olid.replace('OL', '').replace('M', '')
        if not olid_clean.isdigit():
            olid_clean = olid

        # Ensure we have the full OLID format for the URL
        if not olid.startswith('OL'):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No OpenLibrary metadata found for: {olid}")
            return []

        # Add title
        if 'title' in data:
            new_tags.append(f"title:{data['title']}")

        # Add authors
        if 'authors' in data and isinstance(data['authors'], list):
            for author in data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    new_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    new_tags.append(f"author:{author}")

        # Add publish date
        if 'publish_date' in data:
            new_tags.append(f"publish_date:{data['publish_date']}")

        # Add publishers
        if 'publishers' in data and isinstance(data['publishers'], list):
            for pub in data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    new_tags.append(f"publisher:{pub}")

        # Add description
        if 'description' in data:
            desc = data['description']
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                if len(desc_str) > 0:
                    new_tags.append(f"description:{desc_str[:200]}")

        # Add number of pages
        if 'number_of_pages' in data:
            page_count = data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        # Add subjects as FREEFORM tags (no namespace prefix)
        if 'subjects' in data and isinstance(data['subjects'], list):
            for subject in data['subjects'][:10]:
                if subject and isinstance(subject, str):
                    subject_clean = str(subject).strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        # Add identifiers
        if 'identifiers' in data and isinstance(data['identifiers'], dict):
            identifiers = data['identifiers']

            if 'isbn_10' in identifiers:
                isbn_10_list = identifiers['isbn_10']
                if isinstance(isbn_10_list, list) and isbn_10_list:
                    new_tags.append(f"isbn_10:{isbn_10_list[0]}")
                elif isinstance(isbn_10_list, str):
                    new_tags.append(f"isbn_10:{isbn_10_list}")

            if 'isbn_13' in identifiers:
                isbn_13_list = identifiers['isbn_13']
                if isinstance(isbn_13_list, list) and isbn_13_list:
                    new_tags.append(f"isbn_13:{isbn_13_list[0]}")
                elif isinstance(isbn_13_list, str):
                    new_tags.append(f"isbn_13:{isbn_13_list}")

            if 'lccn' in identifiers:
                lccn_list = identifiers['lccn']
                if isinstance(lccn_list, list) and lccn_list:
                    new_tags.append(f"lccn:{lccn_list[0]}")
                elif isinstance(lccn_list, str):
                    new_tags.append(f"lccn:{lccn_list}")

            if 'oclc_numbers' in identifiers:
                oclc_list = identifiers['oclc_numbers']
                if isinstance(oclc_list, list) and oclc_list:
                    new_tags.append(f"oclc:{oclc_list[0]}")
                elif isinstance(oclc_list, str):
                    new_tags.append(f"oclc:{oclc_list}")

            if 'goodreads' in identifiers:
                goodreads_list = identifiers['goodreads']
                if isinstance(goodreads_list, list) and goodreads_list:
                    new_tags.append(f"goodreads:{goodreads_list[0]}")
                elif isinstance(goodreads_list, str):
                    new_tags.append(f"goodreads:{goodreads_list}")

        log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags
    except Exception as e:
        log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
        return []


def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
    """Perform scraping based on identifiers in tags.

    Priority order:
    1. openlibrary: (preferred - more complete metadata)
    2. isbn_10 or isbn (fallback)
    """
    identifiers = extract_scrapable_identifiers(tags_list)

    if not identifiers:
        log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
        return []

    log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")

    new_tags = []

    # Prefer OpenLibrary over ISBN (more complete metadata)
    if 'openlibrary' in identifiers:
        olid = identifiers['openlibrary']
        if olid:
            log(f"Scraping OpenLibrary: {olid}")
            new_tags.extend(scrape_openlibrary_metadata(olid))
    elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers:
        isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
        if isbn:
            log(f"Scraping ISBN: {isbn}")
            new_tags.extend(scrape_isbn_metadata(isbn))

    existing_tags_lower = {tag.lower() for tag in tags_list}
    scraped_unique = []
    seen = set()
    for tag in new_tags:
        tag_lower = tag.lower()
        if tag_lower not in existing_tags_lower and tag_lower not in seen:
            scraped_unique.append(tag)
            seen.add(tag_lower)

    if scraped_unique:
        log(f"Added {len(scraped_unique)} new tag(s) from scraping")

    return scraped_unique
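A usage sketch (not part of the commit) following the priority in the docstring above; the tag values are hypothetical and it is assumed that extract_scrapable_identifiers picks the identifiers out of namespaced tags:

existing = ["title:Some Book", "openlibrary:OL7353617M", "isbn_13:9780140328721"]
new = perform_metadata_scraping(existing)
# The openlibrary: identifier wins over the ISBN route, and any scraped tag already
# present in `existing` (case-insensitive) is dropped before the remainder is returned.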