re
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled

This commit is contained in:
nose
2025-12-25 04:49:22 -08:00
parent 2542a68479
commit 43afa4e3fa
19 changed files with 2766 additions and 234 deletions

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type, cast
import html as html_std
import re
import requests
import sys
import json
@@ -279,6 +281,199 @@ class GoogleBooksMetadataProvider(MetadataProvider):
return tags
class ISBNsearchMetadataProvider(MetadataProvider):
    """Metadata provider that scrapes isbnsearch.org by ISBN.

    This is a best-effort HTML scrape: any fetch or parse failure yields an
    empty result list rather than an exception. The query is expected to be
    (or to contain) an ISBN-10 or ISBN-13.
    """

    @property
    def name(self) -> str:  # type: ignore[override]
        # Registry key; also emitted as the trailing "source:" tag.
        return "isbnsearch"

    @staticmethod
    def _strip_html_to_text(raw: str) -> str:
        """Collapse an HTML fragment into single-line plain text."""
        s = html_std.unescape(str(raw or ""))
        s = re.sub(r"(?i)<br\s*/?>", "\n", s)
        s = re.sub(r"<[^>]+>", " ", s)
        # Squeeze all whitespace runs (including <br>-derived newlines).
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    @staticmethod
    def _clean_isbn(query: str) -> str:
        """Return a normalized (uppercase, separator-free) ISBN from *query*.

        Accepts "isbn:"-prefixed and hyphen/space-separated forms. Returns
        "" when no 10- or 13-character ISBN-like token can be found.
        """
        s = str(query or "").strip()
        if not s:
            return ""
        s = s.replace("isbn:", "").replace("ISBN:", "")
        # Keep only digits and the ISBN-10 check character X.
        s = re.sub(r"[^0-9Xx]", "", s).upper()
        if len(s) in (10, 13):
            return s
        # Try to locate an ISBN-like token inside the query.
        m = re.search(r"\b(?:97[89])?\d{9}[\dXx]\b", s)
        return str(m.group(0)).upper() if m else ""

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Look up *query* (an ISBN) on isbnsearch.org.

        Returns a list containing at most one result dict (the site shows a
        single book per ISBN, so *limit* is accepted only for interface
        compatibility), or [] when the query is not an ISBN, the fetch
        fails, or nothing usable was parsed.
        """
        _ = limit
        isbn = self._clean_isbn(query)
        if not isbn:
            return []
        url = f"https://isbnsearch.org/isbn/{isbn}"
        try:
            # NOTE(review): scrape targets commonly reject the default
            # python-requests User-Agent, so send a browser-like one.
            resp = requests.get(
                url,
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0 (compatible; metadata-fetcher)"},
            )
            resp.raise_for_status()
            html = str(resp.text or "")
            if not html:
                return []
        except Exception as exc:
            # Best-effort: log and return no results on any failure.
            log(f"ISBNsearch scrape failed: {exc}", file=sys.stderr)
            return []
        # The book title is the page's first-level heading.
        title = ""
        m_title = re.search(r"(?is)<h1\b[^>]*>(.*?)</h1>", html)
        if m_title:
            title = self._strip_html_to_text(m_title.group(1))
        # Parse "<strong>Label:</strong> value" pairs. Each value runs from
        # the end of its label to the next <strong> (or end of document),
        # truncated at the first </p> or <br> so it stays in its paragraph.
        raw_fields: Dict[str, str] = {}
        strong_matches = list(re.finditer(r"(?is)<strong\b[^>]*>(.*?)</strong>", html))
        for idx, m in enumerate(strong_matches):
            label = str(self._strip_html_to_text(m.group(1)) or "").strip()
            if not label:
                continue
            if label.endswith(":"):
                label = label[:-1].strip()
            chunk_start = m.end()
            # Stop at next <strong> or end of document.
            chunk_end = strong_matches[idx + 1].start() if (idx + 1) < len(strong_matches) else len(html)
            chunk = html[chunk_start:chunk_end]
            # Prefer stopping within the same paragraph when possible.
            m_end = re.search(r"(?is)(</p>|<br\s*/?>)", chunk)
            if m_end:
                chunk = chunk[: m_end.start()]
            val_text = self._strip_html_to_text(chunk)
            if not val_text:
                continue
            raw_fields[label] = val_text

        def _get(*labels: str) -> str:
            """Case-insensitive lookup of the first matching field label."""
            for lab in labels:
                for k, v in raw_fields.items():
                    if str(k).strip().lower() == str(lab).strip().lower():
                        return str(v or "").strip()
            return ""

        # Map common ISBNsearch labels onto our fields.
        author_text = _get("Author", "Authors", "Author(s)")
        publisher = _get("Publisher")
        published = _get("Published", "Publication Date", "Publish Date")
        language = _get("Language")
        pages = _get("Pages")
        isbn_13 = _get("ISBN-13", "ISBN13")
        isbn_10 = _get("ISBN-10", "ISBN10")
        # "Published" is free-form; keep just a 4-digit year if present.
        year = ""
        if published:
            m_year = re.search(r"\b(\d{4})\b", published)
            year = str(m_year.group(1)) if m_year else ""
        authors: List[str] = []
        if author_text:
            # Split on common separators; keep multi-part names intact.
            for part in re.split(r"\s*(?:,|;|\band\b|\&|\|)\s*", author_text, flags=re.IGNORECASE):
                p = str(part or "").strip()
                if p:
                    authors.append(p)
        # Prefer the parsed heading, but fall back to og:title if needed.
        if not title:
            # HTML attribute order is not guaranteed, so accept the meta
            # tag with property= before content= and the reverse.
            m_og = re.search(
                r"(?is)<meta\b[^>]*property=['\"]og:title['\"][^>]*content=['\"](.*?)['\"][^>]*>",
                html,
            ) or re.search(
                r"(?is)<meta\b[^>]*content=['\"](.*?)['\"][^>]*property=['\"]og:title['\"][^>]*>",
                html,
            )
            if m_og:
                title = self._strip_html_to_text(m_og.group(1))
        # Normalize and dedupe ISBN tokens (page fields first, query last).
        isbn_tokens: List[str] = []
        for token in [isbn_13, isbn_10, isbn]:
            t = self._clean_isbn(token)
            if t and t not in isbn_tokens:
                isbn_tokens.append(t)
        item: Dict[str, Any] = {
            "title": title or "",
            # Keep UI columns compatible with the generic metadata table.
            "artist": ", ".join(authors) if authors else "",
            "album": publisher or "",
            "year": year or "",
            "provider": self.name,
            "authors": authors,
            "publisher": publisher or "",
            "language": language or "",
            "pages": pages or "",
            "identifiers": {
                "isbn_13": next((t for t in isbn_tokens if len(t) == 13), None),
                "isbn_10": next((t for t in isbn_tokens if len(t) == 10), None),
            },
            "raw_fields": raw_fields,
        }
        # Only return usable items (a title or at least one identifier).
        if not item.get("title") and not any(item["identifiers"].values()):
            return []
        return [item]

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a search-result dict into "key:value" tag strings.

        Emits title/author/publisher/year/language/isbn tags plus a
        trailing "source:" tag, deduplicated case-insensitively while
        preserving first-seen order and original casing.
        """
        tags: List[str] = []
        title = str(item.get("title") or "").strip()
        if title:
            tags.append(f"title:{title}")
        authors = item.get("authors") or []
        if isinstance(authors, list):
            for a in authors:
                a = str(a or "").strip()
                if a:
                    tags.append(f"author:{a}")
        publisher = str(item.get("publisher") or "").strip()
        if publisher:
            tags.append(f"publisher:{publisher}")
        year = str(item.get("year") or "").strip()
        if year:
            tags.append(f"year:{year}")
        language = str(item.get("language") or "").strip()
        if language:
            tags.append(f"language:{language}")
        identifiers = item.get("identifiers") or {}
        if isinstance(identifiers, dict):
            for key in ("isbn_13", "isbn_10"):
                val = identifiers.get(key)
                if val:
                    tags.append(f"isbn:{val}")
        tags.append(f"source:{self.name}")
        # Dedup case-insensitively, preserve order.
        seen: set[str] = set()
        out: List[str] = []
        for t in tags:
            s = str(t or "").strip()
            if not s:
                continue
            k = s.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(s)
        return out
class MusicBrainzMetadataProvider(MetadataProvider):
"""Metadata provider for MusicBrainz recordings."""
@@ -517,6 +712,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
"openlibrary": OpenLibraryMetadataProvider,
"googlebooks": GoogleBooksMetadataProvider,
"google": GoogleBooksMetadataProvider,
"isbnsearch": ISBNsearchMetadataProvider,
"musicbrainz": MusicBrainzMetadataProvider,
"ytdlp": YtdlpMetadataProvider,
}