nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions


@@ -3,14 +3,12 @@ import re
import subprocess
import sys
import shutil
import sqlite3
import requests
from SYS.logger import log, debug
from urllib.parse import urlsplit, urlunsplit, unquote
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from models import PipeObject, FileRelationshipTracker, _get_file_hash
from models import FileRelationshipTracker
try:
import musicbrainzngs # type: ignore
except ImportError: # pragma: no cover
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
return variants
def normalize_urls(value: Any) -> List[str]:
"""Normalize a URL field into a stable, deduplicated list.
Accepts:
- None
- a single URL string (optionally containing multiple URLs)
- a list/tuple/set of URL strings
This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
"""
def _iter_raw_urls(raw: Any) -> Iterable[str]:
if raw is None:
return
if isinstance(raw, str):
text = raw.strip()
if not text:
return
# Support legacy prefixes like "url:https://...".
if text.lower().startswith("url:"):
text = text.split(":", 1)[1].strip()
# Prefer extracting obvious URLs to avoid splitting inside query strings.
matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
if matches:
for m in matches:
yield m
return
# Fallback: split on commas/whitespace.
for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
if token:
yield token
return
if isinstance(raw, (list, tuple, set)):
for item in raw:
if item is None:
continue
if isinstance(item, str):
if item.strip():
yield item
else:
text = str(item).strip()
if text:
yield text
return
# Last resort: string-coerce.
text = str(raw).strip()
if text:
yield text
def _canonicalize(url_text: str) -> Optional[str]:
u = str(url_text or "").strip()
if not u:
return None
# Trim common wrappers and trailing punctuation.
u = u.strip("<>\"' ")
u = u.rstrip(")].,;\"")
if not u:
return None
lower = u.lower()
if not (lower.startswith("http://") or lower.startswith("https://")):
return u
try:
parsed = urlsplit(u)
except Exception:
return u
scheme = (parsed.scheme or "").lower()
netloc = (parsed.netloc or "").lower()
path = unquote(parsed.path or "")
query = parsed.query or ""
# Normalize default ports.
if scheme == "http" and netloc.endswith(":80"):
netloc = netloc[:-3]
elif scheme == "https" and netloc.endswith(":443"):
netloc = netloc[:-4]
# Prefer no trailing slash except root.
if path and path != "/":
path = path.rstrip("/")
# Fragments are not part of the resource.
return urlunsplit((scheme, netloc, path, query, ""))
seen: Set[str] = set()
out: List[str] = []
for raw_url in _iter_raw_urls(value):
canonical = _canonicalize(raw_url)
if not canonical:
continue
if canonical in seen:
continue
seen.add(canonical)
out.append(canonical)
return out
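A minimal usage sketch of the normalization above, using a single string input so that both the "url:" prefix handling and the regex extraction apply (the import path is an assumption, not part of this change):

# Sketch only: expected behaviour of normalize_urls on a mixed string input.
from metadata import normalize_urls  # hypothetical import path

raw = "url:https://example.com:443/path/, https://example.com/path http://example.com:80/a"
print(normalize_urls(raw))
# ['https://example.com/path', 'http://example.com/a']
# The legacy "url:" prefix is stripped, default ports and the trailing slash are
# removed, and the second URL canonicalizes to a duplicate of the first, so it is dropped.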
def value_normalize(value: str) -> str:
"""Normalize whitespace: collapse internal spaces, strip, remove newlines."""
value = value.replace("\n", " ").replace("\r", " ")
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
continue
# Ensure file entry exists
file_id: Optional[int] = None
try:
cursor = db.connection.cursor() if db.connection else None
if cursor:
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
try:
cursor = db.connection.cursor() if db.connection else None
if cursor:
file_hash_value: Optional[str] = None
if hasattr(db, 'get_file_hash'):
try:
file_hash_value = db.get_file_hash(file_id)
except Exception:
file_hash_value = None
for tag in tags:
cursor.execute(
'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
(file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
(file_hash_value, tag)
)
db.connection.commit()
except Exception:
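The insert above keys tag rows by file hash; a self-contained sketch of that INSERT OR IGNORE pattern against a throwaway database (the real table definition is not shown in this hunk, so the UNIQUE constraint here is an assumption):

# Sketch of the hash-keyed tag insert, with an assumed UNIQUE(hash, tag) constraint.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE tags (hash TEXT, tag TEXT, UNIQUE(hash, tag))")
file_hash_value = "deadbeef"  # stands in for db.get_file_hash(file_id)
for tag in ("title:Example", "title:Example", "author:Someone"):
    conn.execute("INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)", (file_hash_value, tag))
conn.commit()
print(conn.execute("SELECT COUNT(*) FROM tags").fetchone()[0])  # 2, the duplicate is ignored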
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}
def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
"""Fetch metadata tags from OpenLibrary.
Args:
ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)
Returns:
Dictionary with 'tag' key containing list of extracted tags
"""
import urllib.request
# Normalize OL ID
ol_id = ol_id.strip().upper()
if not ol_id.startswith('OL'):
ol_id = f'OL{ol_id}'
# Fetch from OpenLibrary API
url = f"https://openlibrary.org/books/{ol_id}.json"
tags: List[str] = []
try:
with urllib.request.urlopen(url, timeout=10) as response:
data = json.loads(response.read().decode('utf-8'))
except Exception as e:
raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")
# Add OpenLibrary ID tag
_add_tag(tags, "openlibrary", ol_id)
# Extract title
_add_tag(tags, "title", data.get("title"))
# Extract subtitle if present
if data.get("subtitle"):
_add_tag(tags, "subtitle", data["subtitle"])
# Extract authors
authors = data.get("authors", [])
author_names: List[str] = []
for author in authors:
if isinstance(author, dict):
name = author.get("name")
else:
name = str(author)
if name:
author_names.append(name)
if author_names:
_extend_tags(tags, "author", author_names)
# Extract publication details
if data.get("publish_date"):
_add_tag(tags, "publish_date", data["publish_date"])
# Extract year if present
year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
if year_match:
_add_tag(tags, "year", year_match.group(1))
# Extract publishers
publishers = data.get("publishers", [])
if publishers:
publisher_names = []
for pub in publishers:
if isinstance(pub, dict):
name = pub.get("name")
else:
name = str(pub)
if name:
publisher_names.append(name)
if publisher_names:
_extend_tags(tags, "publisher", publisher_names)
# Extract languages
languages = data.get("languages", [])
if languages:
lang_codes = []
for lang in languages:
if isinstance(lang, dict):
code = lang.get("key", "").split("/")[-1]
else:
code = str(lang).split("/")[-1]
if code and code != "":
lang_codes.append(code)
if lang_codes:
_extend_tags(tags, "language", lang_codes)
# Extract ISBN
isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
if isbns:
for isbn in isbns[:1]: # Just take first one
if len(str(isbn)) == 10:
_add_tag(tags, "isbn_10", isbn)
elif len(str(isbn)) == 13:
_add_tag(tags, "isbn_13", isbn)
# Extract page count
_add_tag(tags, "pages", data.get("number_of_pages"))
# Extract genres/subjects (OpenLibrary calls them subjects)
# Subjects are added as plain freeform tags (no namespace prefix)
subjects = data.get("subjects", [])
if subjects:
for subject in subjects[:10]: # Limit to 10 subjects
if isinstance(subject, dict):
name = subject.get("name")
else:
name = str(subject)
if name:
# Add subject as plain tag without "subject:" prefix
normalized = value_normalize(str(name))
if normalized:
tags.append(normalized)
# Extract OpenLibrary description
description = data.get("description")
if description:
if isinstance(description, dict):
description = description.get("value")
_add_tag(tags, "summary", description)
return {"source": "openlibrary", "id": ol_id, "tag": tags}
def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
"""Append a single value if not already in seen set (deduplication)."""
if value is None:
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
return preferred
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
"""Read hash, tags, and url from sidecar file.
Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
hash_value: Optional[str] = None
tags: List[str] = []
url: List[str] = []
urls: List[str] = []
for raw_line in raw.splitlines():
line = raw_line.strip()
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
url_part = line.split(':', 1)[1].strip() if ':' in line else ''
if url_part:
for url_segment in url_part.split(','):
for url in url_segment.split():
url_clean = url.strip()
if url_clean and url_clean not in url:
url.append(url_clean)
for url_token in url_segment.split():
url_clean = url_token.strip()
if url_clean and url_clean not in urls:
urls.append(url_clean)
else:
# Everything else is a tag (including relationship: lines)
tags.append(line)
return hash_value, tags, url
return hash_value, tags, urls
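A stand-alone sketch of the three line kinds this parser handles (hash:, url:, and plain tag lines; the sample sidecar content is invented):

# Sketch: sidecar parsing reduced to its essentials.
sample = """hash:0123abcd
url:https://example.com/a, https://example.com/b
title:Example
relationship:alt"""

hash_value, tags, urls = None, [], []
for raw_line in sample.splitlines():
    line = raw_line.strip()
    if line.lower().startswith("hash:"):
        hash_value = line.split(":", 1)[1].strip()
    elif line.lower().startswith("url:"):
        for segment in line.split(":", 1)[1].split(","):
            for token in segment.split():
                if token and token not in urls:
                    urls.append(token)
    else:
        tags.append(line)
print(hash_value, urls, tags)
# 0123abcd ['https://example.com/a', 'https://example.com/b'] ['title:Example', 'relationship:alt']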
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
return destination
def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
roots: List[Path] = []
for key in ('paths', 'search_paths', 'roots', 'directories'):
raw = payload.get(key)
if not raw:
continue
entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
for entry in entries:
if not entry:
continue
try:
candidate = Path(str(entry)).expanduser()
except Exception:
continue
roots.append(candidate)
if load_config is not None and resolve_output_dir is not None:
try:
config = load_config()
except Exception:
config = None
if isinstance(config, dict) and config:
try:
default_root = resolve_output_dir(config)
except Exception:
default_root = None
if default_root is not None:
roots.append(default_root)
return roots
def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
target = f'hash:{hash_value.strip().lower()}'
for root in roots:
try:
root_path = root.expanduser()
except Exception:
continue
if not root_path.exists() or not root_path.is_dir():
continue
for pattern in ('*.tag',):
try:
iterator = root_path.rglob(pattern)
except OSError:
continue
for candidate in iterator:
if not candidate.is_file():
continue
try:
with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
for line in handle:
if line.strip().lower() == target:
return candidate
except OSError:
continue
return None
def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
path_value = payload.get('path')
if not path_value:
@@ -2506,8 +2438,8 @@ def write_tags_to_file(
# Add known url if provided - each on separate line to prevent corruption
if url:
for url in url:
content_lines.append(f"url:{url}")
for url_item in url:
content_lines.append(f"url:{url_item}")
# Add tags
if tags:
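The write side mirrors the reader: one url: line per entry, then the tags (a tiny illustration with invented values):

# Illustration of the per-line url serialization described above.
urls = ["https://example.com/a", "https://example.com/b"]
tags = ["title:Example"]
content_lines = [f"url:{u}" for u in urls] + list(tags)
print("\n".join(content_lines))
# url:https://example.com/a
# url:https://example.com/b
# title:Example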
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
tag = payload.get('tag')
if not isinstance(tag, str):
return {'tags': []}
return {'tag': []}
trimmed = value_normalize(tag)
if not trimmed:
return {'tags': []}
return {'tag': []}
request = detect_metadata_request(trimmed)
tags: List[str] = []
seen: Set[str] = set()
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
_append_unique(tags, seen, request['base'])
else:
_append_unique(tags, seen, trimmed)
return {'tags': tags}
return {'tag': tags}
try:
if request['source'] == 'imdb':
data = imdb_tag(request['id'])
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
except Exception as exc: # pragma: no cover - network/service errors
return {'tag': tags, 'error': str(exc)}
# Add tags from fetched data (no namespace, just unique append)
for tag in (data.get('tag') or []):
_append_unique(tags, seen, tag)
raw_tags = data.get('tag') if isinstance(data, dict) else None
if isinstance(raw_tags, str):
tag_iter: Iterable[str] = [raw_tags]
elif isinstance(raw_tags, (list, tuple, set)):
tag_iter = [t for t in raw_tags if isinstance(t, str)]
else:
tag_iter = []
for tag_value in tag_iter:
_append_unique(tags, seen, tag_value)
result = {
'tag': tags,
'source': request['source'],
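The guard above tolerates a string, a sequence, or anything else in data['tag']; in isolation it behaves like this (values invented):

# Sketch of the tag-field type guard added above.
from typing import Any, Iterable

def _tag_iter(raw_tags: Any) -> Iterable[str]:
    if isinstance(raw_tags, str):
        return [raw_tags]
    if isinstance(raw_tags, (list, tuple, set)):
        return [t for t in raw_tags if isinstance(t, str)]
    return []

print(list(_tag_iter("title:Example")))  # ['title:Example']
print(list(_tag_iter(["a", 1, "b"])))    # ['a', 'b']
print(list(_tag_iter(None)))             # []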
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
# Load adjective.json from workspace root
adjective_path = Path(__file__).parent / "adjective.json"
if not adjective_path.exists():
log.debug(f"adjective.json not found at {adjective_path}")
debug(f"adjective.json not found at {adjective_path}")
return tags_set
try:
with open(adjective_path, 'r') as f:
adjective_lists = json.load(f)
except Exception as e:
log.error(f"Error loading adjective.json: {e}")
debug(f"Error loading adjective.json: {e}")
return tags_set
expanded_tags = set()
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
if matched_list:
# Add all tags from the list
expanded_tags.update(matched_list)
log.info(f"Expanded {tag} to {len(matched_list)} tags")
debug(f"Expanded {tag} to {len(matched_list)} tags")
else:
# List not found, log warning but don't add the reference
log.warning(f"Tag list '{list_name}' not found in adjective.json")
debug(f"Tag list '{list_name}' not found in adjective.json")
else:
# Regular tag, keep as is
expanded_tags.add(tag)
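A sketch of the list expansion described above, with an in-memory stand-in for adjective.json; the "{name}" reference syntax is an assumption, since the detection code sits outside this hunk:

# Sketch only: expand a list reference into its member tags.
import json

adjective_lists = json.loads('{"colours": ["red", "blue", "green"]}')  # stand-in for adjective.json
tags_set = {"{colours}", "title:Example"}  # hypothetical "{name}" reference syntax

expanded_tags = set()
for tag in tags_set:
    if tag.startswith("{") and tag.endswith("}"):
        expanded_tags.update(adjective_lists.get(tag[1:-1], []))
    else:
        expanded_tags.add(tag)
print(sorted(expanded_tags))  # ['blue', 'green', 'red', 'title:Example']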
@@ -3194,98 +3133,6 @@ def build_book_tags(
return deduped
def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
"""Fetch book metadata from OpenLibrary and return as tags.
Args:
isbn: ISBN number (with or without isbn: prefix)
olid: OpenLibrary ID
Returns:
List of tags extracted from OpenLibrary metadata
"""
metadata_tags = []
# Try OLID first (preferred), then ISBN
url = None
if olid:
# Clean up OLID format
olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
if olid_clean.isdigit():
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
elif isbn:
# Clean up ISBN
isbn_clean = str(isbn).replace('isbn:', '').strip()
url = f"https://openlibrary.org/isbn/{isbn_clean}.json"
if not url:
return metadata_tags
try:
response = requests.get(url, timeout=10)
if response.status_code != 200:
return metadata_tags
data = response.json()
if not data:
return metadata_tags
# Extract title
if 'title' in data:
metadata_tags.append(f"title:{data['title']}")
# Extract authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
metadata_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
metadata_tags.append(f"author:{author}")
# Extract publish date
if 'publish_date' in data:
metadata_tags.append(f"publish_date:{data['publish_date']}")
# Extract publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
metadata_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
metadata_tags.append(f"publisher:{pub}")
# Extract number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
metadata_tags.append(f"pages:{page_count}")
# Extract language
if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
lang = data['languages'][0]
if isinstance(lang, dict) and 'key' in lang:
lang_code = lang['key'].split('/')[-1]
metadata_tags.append(f"language:{lang_code}")
elif isinstance(lang, str):
metadata_tags.append(f"language:{lang}")
# Extract subjects as freeform tags (limit to 5)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:5]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean:
metadata_tags.append(subject_clean)
except Exception as e:
debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")
return metadata_tags
def enrich_playlist_entries(entries: list, extractor: str) -> list:
"""Enrich playlist entries with full metadata by fetching individual entry info.
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
if entry_url and is_url_supported_by_ytdlp(entry_url):
try:
import yt_dlp
ydl_opts = {
ydl_opts: Any = {
"quiet": True,
"no_warnings": True,
"skip_download": True,
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
return []
def scrape_isbn_metadata(isbn: str) -> List[str]:
"""Scrape metadata for an ISBN using Open Library API."""
new_tags = []
try:
from API.HTTP import HTTPClient
import json as json_module
isbn_clean = isbn.replace('-', '').strip()
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No ISBN metadata found for: {isbn}")
return []
book_data = next(iter(data.values()), None)
if not book_data:
return []
if 'title' in book_data:
new_tags.append(f"title:{book_data['title']}")
if 'authors' in book_data and isinstance(book_data['authors'], list):
for author in book_data['authors'][:3]:
if 'name' in author:
new_tags.append(f"author:{author['name']}")
if 'publish_date' in book_data:
new_tags.append(f"publish_date:{book_data['publish_date']}")
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
for pub in book_data['publishers'][:1]:
if 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
if 'description' in book_data:
desc = book_data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
# Include description if available (limit to 200 chars to keep it manageable)
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
if 'number_of_pages' in book_data:
page_count = book_data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
identifiers = book_data['identifiers']
if 'openlibrary' in identifiers:
ol_ids = identifiers['openlibrary']
if isinstance(ol_ids, list) and ol_ids:
new_tags.append(f"openlibrary:{ol_ids[0]}")
elif isinstance(ol_ids, str):
new_tags.append(f"openlibrary:{ol_ids}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc' in identifiers:
oclc_list = identifiers['oclc']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
if 'librarything' in identifiers:
lt_list = identifiers['librarything']
if isinstance(lt_list, list) and lt_list:
new_tags.append(f"librarything:{lt_list[0]}")
elif isinstance(lt_list, str):
new_tags.append(f"librarything:{lt_list}")
if 'doi' in identifiers:
doi_list = identifiers['doi']
if isinstance(doi_list, list) and doi_list:
new_tags.append(f"doi:{doi_list[0]}")
elif isinstance(doi_list, str):
new_tags.append(f"doi:{doi_list}")
if 'internet_archive' in identifiers:
ia_list = identifiers['internet_archive']
if isinstance(ia_list, list) and ia_list:
new_tags.append(f"internet_archive:{ia_list[0]}")
elif isinstance(ia_list, str):
new_tags.append(f"internet_archive:{ia_list}")
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
return new_tags
except Exception as e:
log(f"ISBN scraping error: {e}", file=sys.stderr)
return []
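Stripped of the project's HTTPClient wrapper, the removed ISBN lookup is one call to the Open Library bibkeys endpoint (sketch; the ISBN is an arbitrary example):

# Sketch of the bibkeys lookup behind the removed scrape_isbn_metadata.
import requests

isbn = "9780140328721"  # example ISBN
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn}&jscmd=data&format=json"
book = next(iter(requests.get(url, timeout=10).json().values()), {})
print(book.get("title"), [a.get("name") for a in book.get("authors", [])])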
def scrape_openlibrary_metadata(olid: str) -> List[str]:
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
- Title, authors, publish date, publishers
- Description
- Subjects as freeform tags (without namespace prefix)
- Identifiers (ISBN, LCCN, OCLC, etc.)
"""
new_tags = []
try:
from API.HTTP import HTTPClient
import json as json_module
# Format: OL9674499M or just 9674499M
olid_clean = olid.replace('OL', '').replace('M', '')
if not olid_clean.isdigit():
olid_clean = olid
# Ensure we have the full OLID format for the URL
if not olid.startswith('OL'):
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
else:
url = f"https://openlibrary.org/books/{olid}.json"
try:
with HTTPClient() as client:
response = client.get(url)
response.raise_for_status()
data = json_module.loads(response.content.decode('utf-8'))
except Exception as e:
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
return []
if not data:
log(f"No OpenLibrary metadata found for: {olid}")
return []
# Add title
if 'title' in data:
new_tags.append(f"title:{data['title']}")
# Add authors
if 'authors' in data and isinstance(data['authors'], list):
for author in data['authors'][:3]:
if isinstance(author, dict) and 'name' in author:
new_tags.append(f"author:{author['name']}")
elif isinstance(author, str):
new_tags.append(f"author:{author}")
# Add publish date
if 'publish_date' in data:
new_tags.append(f"publish_date:{data['publish_date']}")
# Add publishers
if 'publishers' in data and isinstance(data['publishers'], list):
for pub in data['publishers'][:1]:
if isinstance(pub, dict) and 'name' in pub:
new_tags.append(f"publisher:{pub['name']}")
elif isinstance(pub, str):
new_tags.append(f"publisher:{pub}")
# Add description
if 'description' in data:
desc = data['description']
if isinstance(desc, dict) and 'value' in desc:
desc = desc['value']
if desc:
desc_str = str(desc).strip()
if len(desc_str) > 0:
new_tags.append(f"description:{desc_str[:200]}")
# Add number of pages
if 'number_of_pages' in data:
page_count = data['number_of_pages']
if page_count and isinstance(page_count, int) and page_count > 0:
new_tags.append(f"pages:{page_count}")
# Add subjects as FREEFORM tags (no namespace prefix)
if 'subjects' in data and isinstance(data['subjects'], list):
for subject in data['subjects'][:10]:
if subject and isinstance(subject, str):
subject_clean = str(subject).strip()
if subject_clean and subject_clean not in new_tags:
new_tags.append(subject_clean)
# Add identifiers
if 'identifiers' in data and isinstance(data['identifiers'], dict):
identifiers = data['identifiers']
if 'isbn_10' in identifiers:
isbn_10_list = identifiers['isbn_10']
if isinstance(isbn_10_list, list) and isbn_10_list:
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
elif isinstance(isbn_10_list, str):
new_tags.append(f"isbn_10:{isbn_10_list}")
if 'isbn_13' in identifiers:
isbn_13_list = identifiers['isbn_13']
if isinstance(isbn_13_list, list) and isbn_13_list:
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
elif isinstance(isbn_13_list, str):
new_tags.append(f"isbn_13:{isbn_13_list}")
if 'lccn' in identifiers:
lccn_list = identifiers['lccn']
if isinstance(lccn_list, list) and lccn_list:
new_tags.append(f"lccn:{lccn_list[0]}")
elif isinstance(lccn_list, str):
new_tags.append(f"lccn:{lccn_list}")
if 'oclc_numbers' in identifiers:
oclc_list = identifiers['oclc_numbers']
if isinstance(oclc_list, list) and oclc_list:
new_tags.append(f"oclc:{oclc_list[0]}")
elif isinstance(oclc_list, str):
new_tags.append(f"oclc:{oclc_list}")
if 'goodreads' in identifiers:
goodreads_list = identifiers['goodreads']
if isinstance(goodreads_list, list) and goodreads_list:
new_tags.append(f"goodreads:{goodreads_list[0]}")
elif isinstance(goodreads_list, str):
new_tags.append(f"goodreads:{goodreads_list}")
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
return new_tags
except Exception as e:
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
return []
def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
"""Perform scraping based on identifiers in tags.
Priority order:
1. openlibrary: (preferred - more complete metadata)
2. isbn_10 or isbn (fallback)
"""
identifiers = extract_scrapable_identifiers(tags_list)
if not identifiers:
log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
return []
log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")
new_tags = []
# Prefer OpenLibrary over ISBN (more complete metadata)
if 'openlibrary' in identifiers:
olid = identifiers['openlibrary']
if olid:
log(f"Scraping OpenLibrary: {olid}")
new_tags.extend(scrape_openlibrary_metadata(olid))
elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers:
isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
if isbn:
log(f"Scraping ISBN: {isbn}")
new_tags.extend(scrape_isbn_metadata(isbn))
existing_tags_lower = {tag.lower() for tag in tags_list}
scraped_unique = []
seen = set()
for tag in new_tags:
tag_lower = tag.lower()
if tag_lower not in existing_tags_lower and tag_lower not in seen:
scraped_unique.append(tag)
seen.add(tag_lower)
if scraped_unique:
log(f"Added {len(scraped_unique)} new tag(s) from scraping")
return scraped_unique
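The final dedup keeps only scraped tags that are new, compared case-insensitively; in isolation (invented values):

# Sketch of the case-insensitive dedup at the end of perform_metadata_scraping.
existing = ["Title:Example", "author:Someone"]
scraped = ["title:example", "pages:32", "pages:32"]

existing_lower = {t.lower() for t in existing}
seen, unique = set(), []
for tag in scraped:
    low = tag.lower()
    if low not in existing_lower and low not in seen:
        unique.append(tag)
        seen.add(low)
print(unique)  # ['pages:32']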