dfdfsdd
metadata.py (726 changed lines)
@@ -3,14 +3,12 @@ import re
import subprocess
import sys
import shutil
import sqlite3
import requests
from SYS.logger import log, debug
from urllib.parse import urlsplit, urlunsplit, unquote
from collections import deque
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple
from models import PipeObject, FileRelationshipTracker, _get_file_hash
from models import FileRelationshipTracker
try:
    import musicbrainzngs # type: ignore
except ImportError: # pragma: no cover
@@ -332,6 +330,112 @@ def _generate_hydrus_url_variants(url: str) -> List[str]:
    return variants


def normalize_urls(value: Any) -> List[str]:
    """Normalize a URL field into a stable, deduplicated list.

    Accepts:
    - None
    - a single URL string (optionally containing multiple URLs)
    - a list/tuple/set of URL strings

    This helper is used by cmdlets/stores/pipeline to keep `url` consistent.
    """

    def _iter_raw_urls(raw: Any) -> Iterable[str]:
        if raw is None:
            return

        if isinstance(raw, str):
            text = raw.strip()
            if not text:
                return
            # Support legacy prefixes like "url:https://...".
            if text.lower().startswith("url:"):
                text = text.split(":", 1)[1].strip()

            # Prefer extracting obvious URLs to avoid splitting inside query strings.
            matches = re.findall(r"https?://[^\s,]+", text, flags=re.IGNORECASE)
            if matches:
                for m in matches:
                    yield m
                return

            # Fallback: split on commas/whitespace.
            for token in text.replace("\n", " ").replace("\r", " ").replace(",", " ").split():
                if token:
                    yield token
            return

        if isinstance(raw, (list, tuple, set)):
            for item in raw:
                if item is None:
                    continue
                if isinstance(item, str):
                    if item.strip():
                        yield item
                else:
                    text = str(item).strip()
                    if text:
                        yield text
            return

        # Last resort: string-coerce.
        text = str(raw).strip()
        if text:
            yield text

    def _canonicalize(url_text: str) -> Optional[str]:
        u = str(url_text or "").strip()
        if not u:
            return None

        # Trim common wrappers and trailing punctuation.
        u = u.strip("<>\"' ")
        u = u.rstrip(")].,;\"")
        if not u:
            return None

        lower = u.lower()
        if not (lower.startswith("http://") or lower.startswith("https://")):
            return u

        try:
            parsed = urlsplit(u)
        except Exception:
            return u

        scheme = (parsed.scheme or "").lower()
        netloc = (parsed.netloc or "").lower()
        path = unquote(parsed.path or "")
        query = parsed.query or ""

        # Normalize default ports.
        if scheme == "http" and netloc.endswith(":80"):
            netloc = netloc[:-3]
        elif scheme == "https" and netloc.endswith(":443"):
            netloc = netloc[:-4]

        # Prefer no trailing slash except root.
        if path and path != "/":
            path = path.rstrip("/")

        # Fragments are not part of the resource.
        return urlunsplit((scheme, netloc, path, query, ""))

    seen: Set[str] = set()
    out: List[str] = []
    for raw_url in _iter_raw_urls(value):
        canonical = _canonicalize(raw_url)
        if not canonical:
            continue
        if canonical in seen:
            continue
        seen.add(canonical)
        out.append(canonical)

    return out
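A quick usage sketch (not part of the commit) of how normalize_urls behaves per the rules above; the input string is hypothetical:

# Hypothetical input mixing a legacy "url:" prefix, a default port, a fragment and a duplicate.
raw = "url:https://Example.com:443/path/?q=1#frag, https://example.com/path?q=1"
print(normalize_urls(raw))
# should yield ['https://example.com/path?q=1'] - host lowercased, default port,
# trailing slash and fragment dropped, duplicates removed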


def value_normalize(value: str) -> str:
    """Normalize whitespace: collapse internal spaces, strip, remove newlines."""
    value = value.replace("\n", " ").replace("\r", " ")
@@ -358,6 +462,7 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
            continue

        # Ensure file entry exists
        file_id: Optional[int] = None
        try:
            cursor = db.connection.cursor() if db.connection else None
            if cursor:
@@ -394,10 +499,16 @@ def import_pending_sidecars(db_root: Path, db: Any) -> None:
        try:
            cursor = db.connection.cursor() if db.connection else None
            if cursor:
                file_hash_value: Optional[str] = None
                if hasattr(db, 'get_file_hash'):
                    try:
                        file_hash_value = db.get_file_hash(file_id)
                    except Exception:
                        file_hash_value = None
                for tag in tags:
                    cursor.execute(
                        'INSERT OR IGNORE INTO tags (hash, tag) VALUES (?, ?)',
                        (file_hash_value, tag) if hasattr(db, 'get_file_hash') else (None, tag)
                        (file_hash_value, tag)
                    )
                db.connection.commit()
        except Exception:
@@ -663,128 +774,6 @@ def fetch_musicbrainz_tags(mbid: str, entity: str) -> Dict[str, object]:
    return {"source": "musicbrainz", "id": mbid, "tag": tags, "entity": entity}


def fetch_openlibrary_tags(ol_id: str) -> Dict[str, object]:
    """Fetch metadata tags from OpenLibrary.

    Args:
        ol_id: OpenLibrary ID (e.g., 'OL123456M' for a book)

    Returns:
        Dictionary with 'tag' key containing list of extracted tags
    """
    import urllib.request

    # Normalize OL ID
    ol_id = ol_id.strip().upper()
    if not ol_id.startswith('OL'):
        ol_id = f'OL{ol_id}'

    # Fetch from OpenLibrary API
    url = f"https://openlibrary.org/books/{ol_id}.json"
    tags: List[str] = []

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read().decode('utf-8'))
    except Exception as e:
        raise ValueError(f"Failed to fetch OpenLibrary data for {ol_id}: {e}")

    # Add OpenLibrary ID tag
    _add_tag(tags, "openlibrary", ol_id)

    # Extract title
    _add_tag(tags, "title", data.get("title"))

    # Extract subtitle if present
    if data.get("subtitle"):
        _add_tag(tags, "subtitle", data["subtitle"])

    # Extract authors
    authors = data.get("authors", [])
    author_names: List[str] = []
    for author in authors:
        if isinstance(author, dict):
            name = author.get("name")
        else:
            name = str(author)
        if name:
            author_names.append(name)
    if author_names:
        _extend_tags(tags, "author", author_names)

    # Extract publication details
    if data.get("publish_date"):
        _add_tag(tags, "publish_date", data["publish_date"])
        # Extract year if present
        year_match = re.search(r'\b(\d{4})\b', str(data.get("publish_date", "")))
        if year_match:
            _add_tag(tags, "year", year_match.group(1))

    # Extract publishers
    publishers = data.get("publishers", [])
    if publishers:
        publisher_names = []
        for pub in publishers:
            if isinstance(pub, dict):
                name = pub.get("name")
            else:
                name = str(pub)
            if name:
                publisher_names.append(name)
        if publisher_names:
            _extend_tags(tags, "publisher", publisher_names)

    # Extract languages
    languages = data.get("languages", [])
    if languages:
        lang_codes = []
        for lang in languages:
            if isinstance(lang, dict):
                code = lang.get("key", "").split("/")[-1]
            else:
                code = str(lang).split("/")[-1]
            if code and code != "":
                lang_codes.append(code)
        if lang_codes:
            _extend_tags(tags, "language", lang_codes)

    # Extract ISBN
    isbns = data.get("isbn_10", []) + data.get("isbn_13", [])
    if isbns:
        for isbn in isbns[:1]: # Just take first one
            if len(str(isbn)) == 10:
                _add_tag(tags, "isbn_10", isbn)
            elif len(str(isbn)) == 13:
                _add_tag(tags, "isbn_13", isbn)

    # Extract page count
    _add_tag(tags, "pages", data.get("number_of_pages"))

    # Extract genres/subjects (OpenLibrary calls them subjects)
    # Subjects are added as plain freeform tags (no namespace prefix)
    subjects = data.get("subjects", [])
    if subjects:
        for subject in subjects[:10]: # Limit to 10 subjects
            if isinstance(subject, dict):
                name = subject.get("name")
            else:
                name = str(subject)
            if name:
                # Add subject as plain tag without "subject:" prefix
                normalized = value_normalize(str(name))
                if normalized:
                    tags.append(normalized)

    # Extract OpenLibrary description
    description = data.get("description")
    if description:
        if isinstance(description, dict):
            description = description.get("value")
        _add_tag(tags, "summary", description)

    return {"source": "openlibrary", "id": ol_id, "tag": tags}
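For reference, a brief usage sketch (not part of the commit) of the fetch_openlibrary_tags helper shown in this hunk; the OLID is hypothetical and a live network call is made:

# Hypothetical OpenLibrary edition id.
result = fetch_openlibrary_tags("OL7353617M")
print(result["source"], result["id"])
for t in result["tag"]:
    print(t)  # namespaced tags such as title/author/publisher plus plain subject tags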


def _append_unique(target: List[str], seen: Set[str], value: Optional[str]) -> None:
    """Append a single value if not already in seen set (deduplication)."""
    if value is None:
@@ -1545,7 +1534,7 @@ def _derive_sidecar_path(media_path: Path) -> Path:
    return preferred


def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]:
def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str], List[str]]: # pyright: ignore[reportUnusedFunction]
    """Read hash, tags, and url from sidecar file.

    Consolidated with read_tags_from_file - this extracts extra metadata (hash, url).
@@ -1559,7 +1548,7 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]

    hash_value: Optional[str] = None
    tags: List[str] = []
    url: List[str] = []
    urls: List[str] = []

    for raw_line in raw.splitlines():
        line = raw_line.strip()
@@ -1574,15 +1563,15 @@ def _read_sidecar_metadata(sidecar_path: Path) -> tuple[Optional[str], List[str]
            url_part = line.split(':', 1)[1].strip() if ':' in line else ''
            if url_part:
                for url_segment in url_part.split(','):
                    for url in url_segment.split():
                        url_clean = url.strip()
                        if url_clean and url_clean not in url:
                            url.append(url_clean)
                    for url_token in url_segment.split():
                        url_clean = url_token.strip()
                        if url_clean and url_clean not in urls:
                            urls.append(url_clean)
        else:
            # Everything else is a tag (including relationship: lines)
            tags.append(line)

    return hash_value, tags, url
    return hash_value, tags, urls
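A sketch of the sidecar layout this parser expects (contents below are hypothetical; the hash: line is handled above this hunk): url: lines are collected into the urls list, every other non-hash line is kept as a tag, and with the rename above the function returns (hash_value, tags, urls).

# Hypothetical sidecar contents:
#   hash:0123456789abcdef0123456789abcdef
#   url:https://example.com/a, https://example.com/b
#   title:Example Title
#   creator:Example Creator
# Expected result from _read_sidecar_metadata for such a file:
#   urls == ["https://example.com/a", "https://example.com/b"]
#   tags == ["title:Example Title", "creator:Example Creator"]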
@@ -1827,63 +1816,6 @@ def apply_title_to_path(media_path: Path, tags: Iterable[str]) -> Path:
    return destination


def _collect_search_roots(payload: Dict[str, Any]) -> List[Path]:
    roots: List[Path] = []
    for key in ('paths', 'search_paths', 'roots', 'directories'):
        raw = payload.get(key)
        if not raw:
            continue
        entries = raw if isinstance(raw, (list, tuple, set)) else [raw]
        for entry in entries:
            if not entry:
                continue
            try:
                candidate = Path(str(entry)).expanduser()
            except Exception:
                continue
            roots.append(candidate)
    if load_config is not None and resolve_output_dir is not None:
        try:
            config = load_config()
        except Exception:
            config = None
        if isinstance(config, dict) and config:
            try:
                default_root = resolve_output_dir(config)
            except Exception:
                default_root = None
            if default_root is not None:
                roots.append(default_root)
    return roots


def _locate_sidecar_by_hash(hash_value: str, roots: Iterable[Path]) -> Optional[Path]:
    target = f'hash:{hash_value.strip().lower()}'
    for root in roots:
        try:
            root_path = root.expanduser()
        except Exception:
            continue
        if not root_path.exists() or not root_path.is_dir():
            continue
        for pattern in ('*.tag',):
            try:
                iterator = root_path.rglob(pattern)
            except OSError:
                continue
            for candidate in iterator:
                if not candidate.is_file():
                    continue
                try:
                    with candidate.open('r', encoding='utf-8', errors='ignore') as handle:
                        for line in handle:
                            if line.strip().lower() == target:
                                return candidate
                except OSError:
                    continue
    return None


def sync_sidecar(payload: Dict[str, Any]) -> Dict[str, Any]:
    path_value = payload.get('path')
    if not path_value:
@@ -2506,8 +2438,8 @@ def write_tags_to_file(

    # Add known url if provided - each on separate line to prevent corruption
    if url:
        for url in url:
            content_lines.append(f"url:{url}")
        for url_item in url:
            content_lines.append(f"url:{url_item}")

    # Add tags
    if tags:
@@ -2642,10 +2574,10 @@ def detect_metadata_request(tag: str) -> Optional[Dict[str, str]]:
def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
    tag = payload.get('tag')
    if not isinstance(tag, str):
        return {'tags': []}
        return {'tag': []}
    trimmed = value_normalize(tag)
    if not trimmed:
        return {'tags': []}
        return {'tag': []}
    request = detect_metadata_request(trimmed)
    tags: List[str] = []
    seen: Set[str] = set()
@@ -2653,7 +2585,7 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
        _append_unique(tags, seen, request['base'])
    else:
        _append_unique(tags, seen, trimmed)
        return {'tags': tags}
        return {'tag': tags}
    try:
        if request['source'] == 'imdb':
            data = imdb_tag(request['id'])
@@ -2662,8 +2594,15 @@ def expand_metadata_tag(payload: Dict[str, Any]) -> Dict[str, Any]:
    except Exception as exc: # pragma: no cover - network/service errors
        return {'tag': tags, 'error': str(exc)}
    # Add tags from fetched data (no namespace, just unique append)
    for tag in (data.get('tag') or []):
        _append_unique(tags, seen, tag)
    raw_tags = data.get('tag') if isinstance(data, dict) else None
    if isinstance(raw_tags, str):
        tag_iter: Iterable[str] = [raw_tags]
    elif isinstance(raw_tags, (list, tuple, set)):
        tag_iter = [t for t in raw_tags if isinstance(t, str)]
    else:
        tag_iter = []
    for tag_value in tag_iter:
        _append_unique(tags, seen, tag_value)
    result = {
        'tag': tags,
        'source': request['source'],
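A hedged usage sketch (not part of the commit) of expand_metadata_tag with the payload shape used above; the identifier is hypothetical and it is assumed detect_metadata_request recognizes an "imdb:" style tag:

payload = {'tag': 'imdb:tt0111161'}  # hypothetical identifier
result = expand_metadata_tag(payload)
# On success the result carries {'tag': [...], 'source': 'imdb', ...}.
# For non-string or empty input the function now returns {'tag': []}
# (the key is 'tag', not 'tags', after this change).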
@@ -3082,14 +3021,14 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
    # Load adjective.json from workspace root
    adjective_path = Path(__file__).parent / "adjective.json"
    if not adjective_path.exists():
        log.debug(f"adjective.json not found at {adjective_path}")
        debug(f"adjective.json not found at {adjective_path}")
        return tags_set

    try:
        with open(adjective_path, 'r') as f:
            adjective_lists = json.load(f)
    except Exception as e:
        log.error(f"Error loading adjective.json: {e}")
        debug(f"Error loading adjective.json: {e}")
        return tags_set

    expanded_tags = set()
@@ -3108,10 +3047,10 @@ def expand_tag_lists(tags_set: Set[str]) -> Set[str]:
            if matched_list:
                # Add all tags from the list
                expanded_tags.update(matched_list)
                log.info(f"Expanded {tag} to {len(matched_list)} tags")
                debug(f"Expanded {tag} to {len(matched_list)} tags")
            else:
                # List not found, log warning but don't add the reference
                log.warning(f"Tag list '{list_name}' not found in adjective.json")
                debug(f"Tag list '{list_name}' not found in adjective.json")
        else:
            # Regular tag, keep as is
            expanded_tags.add(tag)
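A minimal sketch of the intended expansion (not part of the commit): adjective.json is assumed to map list names to lists of tags, and the exact reference syntax checked earlier in the function is not shown in this hunk, so the marker below is only a stand-in.

# adjective.json (hypothetical): {"colors": ["red", "green", "blue"]}
# A tag recognized as a reference to the "colors" list would be replaced by its
# members, while ordinary tags pass through unchanged:
#   expand_tag_lists({"<colors list reference>", "title:example"})
#     -> {"red", "green", "blue", "title:example"}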
@@ -3194,98 +3133,6 @@ def build_book_tags(
    return deduped


def fetch_openlibrary_metadata_tags(isbn: Optional[str] = None, olid: Optional[str] = None) -> List[str]:
    """Fetch book metadata from OpenLibrary and return as tags.

    Args:
        isbn: ISBN number (with or without isbn: prefix)
        olid: OpenLibrary ID

    Returns:
        List of tags extracted from OpenLibrary metadata
    """
    metadata_tags = []

    # Try OLID first (preferred), then ISBN
    url = None

    if olid:
        # Clean up OLID format
        olid_clean = str(olid).replace('OL', '').replace('M', '').replace('W', '')
        if olid_clean.isdigit():
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid}.json"
    elif isbn:
        # Clean up ISBN
        isbn_clean = str(isbn).replace('isbn:', '').strip()
        url = f"https://openlibrary.org/isbn/{isbn_clean}.json"

    if not url:
        return metadata_tags

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return metadata_tags

        data = response.json()
        if not data:
            return metadata_tags

        # Extract title
        if 'title' in data:
            metadata_tags.append(f"title:{data['title']}")

        # Extract authors
        if 'authors' in data and isinstance(data['authors'], list):
            for author in data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    metadata_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    metadata_tags.append(f"author:{author}")

        # Extract publish date
        if 'publish_date' in data:
            metadata_tags.append(f"publish_date:{data['publish_date']}")

        # Extract publishers
        if 'publishers' in data and isinstance(data['publishers'], list):
            for pub in data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    metadata_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    metadata_tags.append(f"publisher:{pub}")

        # Extract number of pages
        if 'number_of_pages' in data:
            page_count = data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                metadata_tags.append(f"pages:{page_count}")

        # Extract language
        if 'languages' in data and isinstance(data['languages'], list) and data['languages']:
            lang = data['languages'][0]
            if isinstance(lang, dict) and 'key' in lang:
                lang_code = lang['key'].split('/')[-1]
                metadata_tags.append(f"language:{lang_code}")
            elif isinstance(lang, str):
                metadata_tags.append(f"language:{lang}")

        # Extract subjects as freeform tags (limit to 5)
        if 'subjects' in data and isinstance(data['subjects'], list):
            for subject in data['subjects'][:5]:
                if subject and isinstance(subject, str):
                    subject_clean = str(subject).strip()
                    if subject_clean:
                        metadata_tags.append(subject_clean)

    except Exception as e:
        debug(f"⚠ Failed to fetch OpenLibrary metadata: {e}")

    return metadata_tags


def enrich_playlist_entries(entries: list, extractor: str) -> list:
    """Enrich playlist entries with full metadata by fetching individual entry info.
@@ -3312,7 +3159,7 @@ def enrich_playlist_entries(entries: list, extractor: str) -> list:
        if entry_url and is_url_supported_by_ytdlp(entry_url):
            try:
                import yt_dlp
                ydl_opts = {
                ydl_opts: Any = {
                    "quiet": True,
                    "no_warnings": True,
                    "skip_download": True,
@@ -3690,294 +3537,3 @@ def extract_url_formats(formats: list) -> List[Tuple[str, str]]:
    return []


def scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata for an ISBN using Open Library API."""
    new_tags = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        isbn_clean = isbn.replace('-', '').strip()
        url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No ISBN metadata found for: {isbn}")
            return []

        book_data = next(iter(data.values()), None)
        if not book_data:
            return []

        if 'title' in book_data:
            new_tags.append(f"title:{book_data['title']}")

        if 'authors' in book_data and isinstance(book_data['authors'], list):
            for author in book_data['authors'][:3]:
                if 'name' in author:
                    new_tags.append(f"author:{author['name']}")

        if 'publish_date' in book_data:
            new_tags.append(f"publish_date:{book_data['publish_date']}")

        if 'publishers' in book_data and isinstance(book_data['publishers'], list):
            for pub in book_data['publishers'][:1]:
                if 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")

        if 'description' in book_data:
            desc = book_data['description']
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                # Include description if available (limit to 200 chars to keep it manageable)
                if len(desc_str) > 0:
                    new_tags.append(f"description:{desc_str[:200]}")

        if 'number_of_pages' in book_data:
            page_count = book_data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
            identifiers = book_data['identifiers']

            if 'openlibrary' in identifiers:
                ol_ids = identifiers['openlibrary']
                if isinstance(ol_ids, list) and ol_ids:
                    new_tags.append(f"openlibrary:{ol_ids[0]}")
                elif isinstance(ol_ids, str):
                    new_tags.append(f"openlibrary:{ol_ids}")

            if 'lccn' in identifiers:
                lccn_list = identifiers['lccn']
                if isinstance(lccn_list, list) and lccn_list:
                    new_tags.append(f"lccn:{lccn_list[0]}")
                elif isinstance(lccn_list, str):
                    new_tags.append(f"lccn:{lccn_list}")

            if 'oclc' in identifiers:
                oclc_list = identifiers['oclc']
                if isinstance(oclc_list, list) and oclc_list:
                    new_tags.append(f"oclc:{oclc_list[0]}")
                elif isinstance(oclc_list, str):
                    new_tags.append(f"oclc:{oclc_list}")

            if 'goodreads' in identifiers:
                goodreads_list = identifiers['goodreads']
                if isinstance(goodreads_list, list) and goodreads_list:
                    new_tags.append(f"goodreads:{goodreads_list[0]}")
                elif isinstance(goodreads_list, str):
                    new_tags.append(f"goodreads:{goodreads_list}")

            if 'librarything' in identifiers:
                lt_list = identifiers['librarything']
                if isinstance(lt_list, list) and lt_list:
                    new_tags.append(f"librarything:{lt_list[0]}")
                elif isinstance(lt_list, str):
                    new_tags.append(f"librarything:{lt_list}")

            if 'doi' in identifiers:
                doi_list = identifiers['doi']
                if isinstance(doi_list, list) and doi_list:
                    new_tags.append(f"doi:{doi_list[0]}")
                elif isinstance(doi_list, str):
                    new_tags.append(f"doi:{doi_list}")

            if 'internet_archive' in identifiers:
                ia_list = identifiers['internet_archive']
                if isinstance(ia_list, list) and ia_list:
                    new_tags.append(f"internet_archive:{ia_list[0]}")
                elif isinstance(ia_list, str):
                    new_tags.append(f"internet_archive:{ia_list}")

        log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
        return new_tags
    except Exception as e:
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []


def scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata for an OpenLibrary ID using the .json API endpoint.

    Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
    - Title, authors, publish date, publishers
    - Description
    - Subjects as freeform tags (without namespace prefix)
    - Identifiers (ISBN, LCCN, OCLC, etc.)
    """
    new_tags = []
    try:
        from API.HTTP import HTTPClient
        import json as json_module

        # Format: OL9674499M or just 9674499M
        olid_clean = olid.replace('OL', '').replace('M', '')
        if not olid_clean.isdigit():
            olid_clean = olid

        # Ensure we have the full OLID format for the URL
        if not olid.startswith('OL'):
            url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
        else:
            url = f"https://openlibrary.org/books/{olid}.json"

        try:
            with HTTPClient() as client:
                response = client.get(url)
                response.raise_for_status()
                data = json_module.loads(response.content.decode('utf-8'))
        except Exception as e:
            log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
            return []

        if not data:
            log(f"No OpenLibrary metadata found for: {olid}")
            return []

        # Add title
        if 'title' in data:
            new_tags.append(f"title:{data['title']}")

        # Add authors
        if 'authors' in data and isinstance(data['authors'], list):
            for author in data['authors'][:3]:
                if isinstance(author, dict) and 'name' in author:
                    new_tags.append(f"author:{author['name']}")
                elif isinstance(author, str):
                    new_tags.append(f"author:{author}")

        # Add publish date
        if 'publish_date' in data:
            new_tags.append(f"publish_date:{data['publish_date']}")

        # Add publishers
        if 'publishers' in data and isinstance(data['publishers'], list):
            for pub in data['publishers'][:1]:
                if isinstance(pub, dict) and 'name' in pub:
                    new_tags.append(f"publisher:{pub['name']}")
                elif isinstance(pub, str):
                    new_tags.append(f"publisher:{pub}")

        # Add description
        if 'description' in data:
            desc = data['description']
            if isinstance(desc, dict) and 'value' in desc:
                desc = desc['value']
            if desc:
                desc_str = str(desc).strip()
                if len(desc_str) > 0:
                    new_tags.append(f"description:{desc_str[:200]}")

        # Add number of pages
        if 'number_of_pages' in data:
            page_count = data['number_of_pages']
            if page_count and isinstance(page_count, int) and page_count > 0:
                new_tags.append(f"pages:{page_count}")

        # Add subjects as FREEFORM tags (no namespace prefix)
        if 'subjects' in data and isinstance(data['subjects'], list):
            for subject in data['subjects'][:10]:
                if subject and isinstance(subject, str):
                    subject_clean = str(subject).strip()
                    if subject_clean and subject_clean not in new_tags:
                        new_tags.append(subject_clean)

        # Add identifiers
        if 'identifiers' in data and isinstance(data['identifiers'], dict):
            identifiers = data['identifiers']

            if 'isbn_10' in identifiers:
                isbn_10_list = identifiers['isbn_10']
                if isinstance(isbn_10_list, list) and isbn_10_list:
                    new_tags.append(f"isbn_10:{isbn_10_list[0]}")
                elif isinstance(isbn_10_list, str):
                    new_tags.append(f"isbn_10:{isbn_10_list}")

            if 'isbn_13' in identifiers:
                isbn_13_list = identifiers['isbn_13']
                if isinstance(isbn_13_list, list) and isbn_13_list:
                    new_tags.append(f"isbn_13:{isbn_13_list[0]}")
                elif isinstance(isbn_13_list, str):
                    new_tags.append(f"isbn_13:{isbn_13_list}")

            if 'lccn' in identifiers:
                lccn_list = identifiers['lccn']
                if isinstance(lccn_list, list) and lccn_list:
                    new_tags.append(f"lccn:{lccn_list[0]}")
                elif isinstance(lccn_list, str):
                    new_tags.append(f"lccn:{lccn_list}")

            if 'oclc_numbers' in identifiers:
                oclc_list = identifiers['oclc_numbers']
                if isinstance(oclc_list, list) and oclc_list:
                    new_tags.append(f"oclc:{oclc_list[0]}")
                elif isinstance(oclc_list, str):
                    new_tags.append(f"oclc:{oclc_list}")

            if 'goodreads' in identifiers:
                goodreads_list = identifiers['goodreads']
                if isinstance(goodreads_list, list) and goodreads_list:
                    new_tags.append(f"goodreads:{goodreads_list[0]}")
                elif isinstance(goodreads_list, str):
                    new_tags.append(f"goodreads:{goodreads_list}")

        log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
        return new_tags
    except Exception as e:
        log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
        return []


def perform_metadata_scraping(tags_list: List[str]) -> List[str]:
    """Perform scraping based on identifiers in tags.

    Priority order:
    1. openlibrary: (preferred - more complete metadata)
    2. isbn_10 or isbn (fallback)
    """
    identifiers = extract_scrapable_identifiers(tags_list)

    if not identifiers:
        log("No scrapable identifiers found (openlibrary, ISBN, musicbrainz, imdb)")
        return []

    log(f"Found scrapable identifiers: {', '.join(identifiers.keys())}")

    new_tags = []

    # Prefer OpenLibrary over ISBN (more complete metadata)
    if 'openlibrary' in identifiers:
        olid = identifiers['openlibrary']
        if olid:
            log(f"Scraping OpenLibrary: {olid}")
            new_tags.extend(scrape_openlibrary_metadata(olid))
    elif 'isbn_13' in identifiers or 'isbn_10' in identifiers or 'isbn' in identifiers:
        isbn = identifiers.get('isbn_13') or identifiers.get('isbn_10') or identifiers.get('isbn')
        if isbn:
            log(f"Scraping ISBN: {isbn}")
            new_tags.extend(scrape_isbn_metadata(isbn))

    existing_tags_lower = {tag.lower() for tag in tags_list}
    scraped_unique = []
    seen = set()
    for tag in new_tags:
        tag_lower = tag.lower()
        if tag_lower not in existing_tags_lower and tag_lower not in seen:
            scraped_unique.append(tag)
            seen.add(tag_lower)

    if scraped_unique:
        log(f"Added {len(scraped_unique)} new tag(s) from scraping")

    return scraped_unique
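A usage sketch (not part of the commit) following the priority in the docstring above; the tag values are hypothetical and it is assumed that extract_scrapable_identifiers picks the identifiers out of namespaced tags:

existing = ["title:Some Book", "openlibrary:OL7353617M", "isbn_13:9780140328721"]
new = perform_metadata_scraping(existing)
# The openlibrary: identifier wins over the ISBN route, and any scraped tag already
# present in `existing` (case-insensitive) is dropped before the remainder is returned.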