from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type, cast
import html as html_std
import re
import requests
import sys
import json
import subprocess
try: # Optional dependency for IMDb scraping
from imdbinfo.services import search_title # type: ignore
except ImportError: # pragma: no cover - optional
search_title = None # type: ignore[assignment]
from SYS.logger import log, debug
from SYS.metadata import imdb_tag
try: # Optional dependency
import musicbrainzngs # type: ignore
except ImportError: # pragma: no cover - optional
musicbrainzngs = None
try: # Optional dependency
import yt_dlp # type: ignore
except ImportError: # pragma: no cover - optional
yt_dlp = None
class MetadataProvider(ABC):
    """Abstract interface shared by every metadata source (music, movies, books, ...).

    Concrete providers implement :meth:`search`; the default :meth:`to_tags`
    turns one search result into ``namespace:value`` tag strings.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """Remember *config*, substituting an empty dict when it is falsy."""
        self.config = config or {}

    @property
    def name(self) -> str:
        """Lower-cased class name with every ``"Provider"`` substring removed."""
        class_name = type(self).__name__
        return class_name.replace("Provider", "").lower()

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Look up *query* and return candidate metadata records."""

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Build the tag list for a single result record.

        Emits ``title``, ``artist``, ``album`` and ``year`` tags (in that
        order) for each field holding a truthy value, then always appends a
        ``source:<provider name>`` tag.
        """
        tags: List[str] = [
            f"{field}:{item.get(field)}"
            for field in ("title", "artist", "album", "year")
            if item.get(field)
        ]
        tags.append(f"source:{self.name}")
        return tags
class ITunesProvider(MetadataProvider):
    """Metadata provider using the iTunes Search API."""

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Search iTunes for songs matching *query*.

        Args:
            query: Free-text search term sent as the ``term`` parameter.
            limit: Value sent as the ``limit`` parameter.

        Returns:
            One dict per result with ``title``, ``artist``, ``album``,
            ``year`` (first four characters of ``releaseDate``),
            ``provider`` and the untouched ``raw`` record. Any request or
            decoding failure is logged and yields an empty list.
        """
        params = {
            "term": query,
            "media": "music",
            "entity": "song",
            "limit": limit,
        }
        try:
            resp = requests.get(
                "https://itunes.apple.com/search",
                params=params,
                timeout=10,
            )
            resp.raise_for_status()
            results = resp.json().get("results", [])
        except Exception as exc:
            log(f"iTunes search failed: {exc}", file=sys.stderr)
            return []

        items: List[Dict[str, Any]] = [
            {
                "title": r.get("trackName"),
                "artist": r.get("artistName"),
                "album": r.get("collectionName"),
                # `or ""` keeps a null releaseDate from becoming the
                # literal year "None".
                "year": str(r.get("releaseDate") or "")[:4],
                "provider": self.name,
                "raw": r,
            }
            # A null "results" value or non-dict entries are skipped
            # instead of raising outside the try block above.
            for r in (results or [])
            if isinstance(r, dict)
        ]
        debug(f"iTunes returned {len(items)} items for '{query}'")
        return items
class OpenLibraryMetadataProvider(MetadataProvider):
    """Metadata provider for OpenLibrary book metadata."""

    @property
    def name(self) -> str:  # type: ignore[override]
        """Fixed provider identifier used in ``provider`` and ``source:`` fields."""
        return "openlibrary"

    @staticmethod
    def _build_query(query: str) -> str:
        """Return ``isbn:<digits>`` when *query* looks like an ISBN, else *query*.

        Hyphens are ignored. Thirteen digits, or nine digits followed by a
        digit or ``X``/``x`` (the ISBN-10 check character), count as an ISBN.
        """
        compact = query.replace("-", "")
        is_isbn13 = len(compact) == 13 and compact.isdigit()
        is_isbn10 = (
            len(compact) == 10
            and compact[:9].isdigit()
            and (compact[9].isdigit() or compact[9] in "Xx")
        )
        if is_isbn13 or is_isbn10:
            return f"isbn:{compact.upper()}"
        return query

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Query ``openlibrary.org/search.json`` and normalise the documents.

        Args:
            query: Free text or an ISBN (hyphens allowed).
            limit: Sent as the ``limit`` parameter and also used to cap the
                number of documents processed.

        Returns:
            A list of record dicts (``title``, ``artist``, ``album``,
            ``year``, ``provider``, ``authors``, ``publisher``,
            ``identifiers``, ``description``). Empty when the query is
            blank, the request fails, or the payload has no usable ``docs``.
        """
        query_clean = (query or "").strip()
        if not query_clean:
            return []

        try:
            resp = requests.get(
                "https://openlibrary.org/search.json",
                params={
                    # Prefer ISBN-specific search when the query looks like one
                    "q": self._build_query(query_clean),
                    "limit": limit,
                },
                timeout=10,
            )
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:
            log(f"OpenLibrary search failed: {exc}", file=sys.stderr)
            return []

        # Guard against unexpected JSON shapes rather than raising here,
        # outside the try block.
        docs = data.get("docs") if isinstance(data, dict) else None
        if not isinstance(docs, list):
            return []

        items: List[Dict[str, Any]] = []
        for doc in docs[:limit]:
            if not isinstance(doc, dict):
                continue

            authors = doc.get("author_name") or []

            publisher = ""
            publishers = doc.get("publisher") or []
            if isinstance(publishers, list) and publishers:
                publisher = publishers[0]

            # Prefer 13-digit ISBN when available, otherwise 10-digit
            isbn_list = doc.get("isbn") or []
            isbn_13 = next((i for i in isbn_list if len(str(i)) == 13), None)
            isbn_10 = next((i for i in isbn_list if len(str(i)) == 10), None)

            # Derive OLID from key (last path segment)
            olid = ""
            key = doc.get("key", "")
            if isinstance(key, str) and key:
                olid = key.split("/")[-1]

            items.append(
                {
                    "title": doc.get("title") or "",
                    "artist": ", ".join(authors) if authors else "",
                    "album": publisher,
                    "year": str(doc.get("first_publish_year") or ""),
                    "provider": self.name,
                    "authors": authors,
                    "publisher": publisher,
                    "identifiers": {
                        "isbn_13": isbn_13,
                        "isbn_10": isbn_10,
                        "openlibrary": olid,
                        "oclc": (doc.get("oclc_numbers") or [None])[0],
                        "lccn": (doc.get("lccn") or [None])[0],
                    },
                    "description": None,
                }
            )
        return items

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a book record into tags.

        Order: ``title``, one ``author`` tag per author, ``publisher``,
        ``year``, ``description`` (truncated to 200 characters), one tag per
        non-empty identifier, and finally ``source:<provider name>``.
        """
        tags: List[str] = []
        title = item.get("title")
        authors = item.get("authors") or []
        publisher = item.get("publisher")
        year = item.get("year")
        description = item.get("description") or ""

        if title:
            tags.append(f"title:{title}")
        tags.extend(f"author:{author}" for author in authors if author)
        if publisher:
            tags.append(f"publisher:{publisher}")
        if year:
            tags.append(f"year:{year}")
        if description:
            tags.append(f"description:{description[:200]}")

        identifiers = item.get("identifiers") or {}
        tags.extend(f"{key}:{value}" for key, value in identifiers.items() if value)

        tags.append(f"source:{self.name}")
        return tags
class GoogleBooksMetadataProvider(MetadataProvider):
    """Metadata provider for Google Books volumes API."""

    # The volumes endpoint documents 40 as the largest accepted maxResults.
    _MAX_RESULTS = 40

    @property
    def name(self) -> str:  # type: ignore[override]
        """Fixed provider identifier used in ``provider`` and ``source:`` fields."""
        return "googlebooks"

    @staticmethod
    def _build_query(query: str) -> str:
        """Return ``isbn:<digits>`` when *query* looks like an ISBN, else *query*.

        Hyphens are ignored. Thirteen digits, or nine digits followed by a
        digit or ``X``/``x`` (the ISBN-10 check character), count as an ISBN.
        """
        compact = query.replace("-", "")
        is_isbn13 = len(compact) == 13 and compact.isdigit()
        is_isbn10 = (
            len(compact) == 10
            and compact[:9].isdigit()
            and (compact[9].isdigit() or compact[9] in "Xx")
        )
        if is_isbn13 or is_isbn10:
            return f"isbn:{compact.upper()}"
        return query

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Query the Google Books volumes endpoint and normalise the results.

        Args:
            query: Free text or an ISBN (hyphens allowed).
            limit: Maximum number of records to return; the value sent as
                ``maxResults`` is clamped to ``_MAX_RESULTS``.

        Returns:
            A list of record dicts (``title``, ``artist``, ``album``,
            ``year``, ``provider``, ``authors``, ``publisher``,
            ``identifiers``, ``description``). Empty when the query is
            blank, the request fails, or the payload has no usable ``items``.
        """
        query_clean = (query or "").strip()
        if not query_clean:
            return []

        try:
            resp = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params={
                    # Prefer ISBN queries when possible
                    "q": self._build_query(query_clean),
                    "maxResults": min(limit, self._MAX_RESULTS),
                },
                timeout=10,
            )
            resp.raise_for_status()
            payload = resp.json()
        except Exception as exc:
            log(f"Google Books search failed: {exc}", file=sys.stderr)
            return []

        # Guard against unexpected JSON shapes rather than raising here,
        # outside the try block.
        volumes = payload.get("items") if isinstance(payload, dict) else None
        if not isinstance(volumes, list):
            return []

        items: List[Dict[str, Any]] = []
        for volume in volumes[:limit]:
            if not isinstance(volume, dict):
                continue

            info = volume.get("volumeInfo") or {}
            authors = info.get("authors") or []
            publisher = info.get("publisher", "")
            published_date = info.get("publishedDate", "")
            year = str(published_date)[:4] if published_date else ""

            identifiers: Dict[str, Optional[str]] = {
                "googlebooks": volume.get("id")
            }
            for ident in info.get("industryIdentifiers") or []:
                if not isinstance(ident, dict):
                    continue
                # A null "type" must not raise; an empty type would create
                # an empty-key tag, so such entries are skipped.
                ident_type = str(ident.get("type") or "").lower()
                ident_value = ident.get("identifier")
                if not ident_type or not ident_value:
                    continue
                # First value seen for each identifier type wins.
                identifiers.setdefault(ident_type, ident_value)

            items.append(
                {
                    "title": info.get("title") or "",
                    "artist": ", ".join(authors) if authors else "",
                    "album": publisher,
                    "year": year,
                    "provider": self.name,
                    "authors": authors,
                    "publisher": publisher,
                    "identifiers": identifiers,
                    "description": info.get("description", ""),
                }
            )
        return items

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a book record into tags.

        Order: ``title``, one ``author`` tag per author, ``publisher``,
        ``year``, ``description`` (truncated to 200 characters), one tag per
        non-empty identifier, and finally ``source:<provider name>``.
        """
        tags: List[str] = []
        title = item.get("title")
        authors = item.get("authors") or []
        publisher = item.get("publisher")
        year = item.get("year")
        description = item.get("description") or ""

        if title:
            tags.append(f"title:{title}")
        tags.extend(f"author:{author}" for author in authors if author)
        if publisher:
            tags.append(f"publisher:{publisher}")
        if year:
            tags.append(f"year:{year}")
        if description:
            tags.append(f"description:{description[:200]}")

        identifiers = item.get("identifiers") or {}
        tags.extend(f"{key}:{value}" for key, value in identifiers.items() if value)

        tags.append(f"source:{self.name}")
        return tags
class ISBNsearchMetadataProvider(MetadataProvider):
"""Metadata provider that scrapes isbnsearch.org by ISBN.
This is a best-effort HTML scrape. It expects the query to be an ISBN.
"""
@property
def name(self) -> str: # type: ignore[override]
return "isbnsearch"
@staticmethod
def _strip_html_to_text(raw: str) -> str:
s = html_std.unescape(str(raw or ""))
s = re.sub(r"(?i)
", "\n", s)
s = re.sub(r"<[^>]+>", " ", s)
s = re.sub(r"\s+", " ", s)
return s.strip()
@staticmethod
def _clean_isbn(query: str) -> str:
s = str(query or "").strip()
if not s:
return ""
s = s.replace("isbn:", "").replace("ISBN:", "")
s = re.sub(r"[^0-9Xx]", "", s).upper()
if len(s) in (10, 13):
return s
# Try to locate an ISBN-like token inside the query.
m = re.search(r"\b(?:97[89])?\d{9}[\dXx]\b", s)
return str(m.group(0)).upper() if m else ""
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
_ = limit
isbn = self._clean_isbn(query)
if not isbn:
return []
url = f"https://isbnsearch.org/isbn/{isbn}"
try:
resp = requests.get(url, timeout=10)
resp.raise_for_status()
html = str(resp.text or "")
if not html:
return []
except Exception as exc:
log(f"ISBNsearch scrape failed: {exc}", file=sys.stderr)
return []
title = ""
m_title = re.search(r"(?is)