re
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
This commit is contained in:
@@ -2,6 +2,8 @@ from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Type, cast
|
||||
import html as html_std
|
||||
import re
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
@@ -279,6 +281,199 @@ class GoogleBooksMetadataProvider(MetadataProvider):
|
||||
return tags
|
||||
|
||||
|
||||
class ISBNsearchMetadataProvider(MetadataProvider):
    """Metadata provider that scrapes isbnsearch.org by ISBN.

    This is a best-effort HTML scrape. It expects the query to be an ISBN
    (ISBN-10 or ISBN-13; hyphens, spaces and an ``isbn:`` prefix are
    tolerated). ``search`` returns at most one item; any failure — bad
    input, network error, unparseable page — yields an empty list rather
    than raising.
    """

    @property
    def name(self) -> str:  # type: ignore[override]
        """Short provider identifier used in results and tags."""
        return "isbnsearch"

    @staticmethod
    def _strip_html_to_text(raw: str) -> str:
        """Convert an HTML fragment to collapsed plain text.

        Entities are unescaped, ``<br>`` becomes whitespace, all other
        tags are dropped, and runs of whitespace collapse to single
        spaces.
        """
        s = html_std.unescape(str(raw or ""))
        # Turn <br> into a newline before stripping tags so adjacent
        # words do not fuse; the final collapse turns it into a space.
        s = re.sub(r"(?i)<br\s*/?>", "\n", s)
        s = re.sub(r"<[^>]+>", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    @staticmethod
    def _clean_isbn(query: str) -> str:
        """Normalize *query* to a bare uppercase ISBN-10/13 string.

        Returns "" when no ISBN-like token can be extracted.
        """
        s = str(query or "").strip()
        if not s:
            return ""
        s = s.replace("isbn:", "").replace("ISBN:", "")
        # Keep only digits and the ISBN-10 check character 'X'.
        s = re.sub(r"[^0-9Xx]", "", s).upper()
        if len(s) in (10, 13):
            return s
        # Try to locate an ISBN-like token inside the query.
        # (s is already uppercased, so only 'X' can appear here.)
        m = re.search(r"\b(?:97[89])?\d{9}[\dX]\b", s)
        return str(m.group(0)) if m else ""

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Scrape isbnsearch.org for *query* (an ISBN).

        *limit* is accepted for interface compatibility but ignored:
        the page describes exactly one book, so at most one item is
        returned.
        """
        _ = limit
        isbn = self._clean_isbn(query)
        if not isbn:
            return []

        url = f"https://isbnsearch.org/isbn/{isbn}"
        try:
            # Scrape targets commonly reject the default python-requests
            # User-Agent, so send a generic browser-like identifier.
            resp = requests.get(
                url,
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0 (compatible; metadata-fetch)"},
            )
            resp.raise_for_status()
            html = str(resp.text or "")
            if not html:
                return []
        except Exception as exc:  # Best-effort: any failure means "no results".
            log(f"ISBNsearch scrape failed: {exc}", file=sys.stderr)
            return []

        # The book title is the page's <h1>.
        title = ""
        m_title = re.search(r"(?is)<h1\b[^>]*>(.*?)</h1>", html)
        if m_title:
            title = self._strip_html_to_text(m_title.group(1))

        # The detail area is a run of "<strong>Label:</strong> value"
        # fragments; collect label -> plain-text value.
        raw_fields: Dict[str, str] = {}
        strong_matches = list(re.finditer(r"(?is)<strong\b[^>]*>(.*?)</strong>", html))
        for idx, m in enumerate(strong_matches):
            label_raw = self._strip_html_to_text(m.group(1))
            label = str(label_raw or "").strip()
            if not label:
                continue
            if label.endswith(":"):
                label = label[:-1].strip()

            chunk_start = m.end()
            # Stop at next <strong> or end of document.
            chunk_end = strong_matches[idx + 1].start() if (idx + 1) < len(strong_matches) else len(html)
            chunk = html[chunk_start:chunk_end]
            # Prefer stopping within the same paragraph when possible.
            m_end = re.search(r"(?is)(</p>|<br\s*/?>)", chunk)
            if m_end:
                chunk = chunk[: m_end.start()]

            val_text = self._strip_html_to_text(chunk)
            if not val_text:
                continue
            raw_fields[label] = val_text

        def _get(*labels: str) -> str:
            # Case-insensitive lookup across several candidate labels.
            for lab in labels:
                for k, v in raw_fields.items():
                    if str(k).strip().lower() == str(lab).strip().lower():
                        return str(v or "").strip()
            return ""

        # Map common ISBNsearch labels.
        author_text = _get("Author", "Authors", "Author(s)")
        publisher = _get("Publisher")
        published = _get("Published", "Publication Date", "Publish Date")
        language = _get("Language")
        pages = _get("Pages")
        isbn_13 = _get("ISBN-13", "ISBN13")
        isbn_10 = _get("ISBN-10", "ISBN10")

        # Extract a 4-digit year from whatever date format the page used.
        year = ""
        if published:
            m_year = re.search(r"\b(\d{4})\b", published)
            year = str(m_year.group(1)) if m_year else ""

        authors: List[str] = []
        if author_text:
            # Split on common separators; keep multi-part names intact.
            for part in re.split(r"\s*(?:,|;|\band\b|\&|\|)\s*", author_text, flags=re.IGNORECASE):
                p = str(part or "").strip()
                if p:
                    authors.append(p)

        # Prefer parsed title, but fall back to og:title if needed.
        # Meta tags may list property/content in either order.
        if not title:
            m_og = re.search(
                r"(?is)<meta\b[^>]*property=['\"]og:title['\"][^>]*content=['\"](.*?)['\"]",
                html,
            ) or re.search(
                r"(?is)<meta\b[^>]*content=['\"](.*?)['\"][^>]*property=['\"]og:title['\"]",
                html,
            )
            if m_og:
                title = self._strip_html_to_text(m_og.group(1))

        # Ensure ISBN tokens are normalized; dedupe while keeping order
        # (page-reported ISBNs first, then the queried one).
        isbn_tokens: List[str] = []
        for token in [isbn_13, isbn_10, isbn]:
            t = self._clean_isbn(token)
            if t and t not in isbn_tokens:
                isbn_tokens.append(t)

        item: Dict[str, Any] = {
            "title": title or "",
            # Keep UI columns compatible with the generic metadata table.
            "artist": ", ".join(authors) if authors else "",
            "album": publisher or "",
            "year": year or "",
            "provider": self.name,
            "authors": authors,
            "publisher": publisher or "",
            "language": language or "",
            "pages": pages or "",
            "identifiers": {
                "isbn_13": next((t for t in isbn_tokens if len(t) == 13), None),
                "isbn_10": next((t for t in isbn_tokens if len(t) == 10), None),
            },
            "raw_fields": raw_fields,
        }

        # Only return usable items: need at least a title or an ISBN.
        if not item.get("title") and not any(item["identifiers"].values()):
            return []

        return [item]

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a search item into ``key:value`` tag strings.

        Emits title/author/publisher/year/language/isbn tags plus a
        ``source:`` tag, deduplicated case-insensitively while
        preserving first-seen order.
        """
        tags: List[str] = []

        title = str(item.get("title") or "").strip()
        if title:
            tags.append(f"title:{title}")

        authors = item.get("authors") or []
        if isinstance(authors, list):
            for a in authors:
                a = str(a or "").strip()
                if a:
                    tags.append(f"author:{a}")

        publisher = str(item.get("publisher") or "").strip()
        if publisher:
            tags.append(f"publisher:{publisher}")

        year = str(item.get("year") or "").strip()
        if year:
            tags.append(f"year:{year}")

        language = str(item.get("language") or "").strip()
        if language:
            tags.append(f"language:{language}")

        identifiers = item.get("identifiers") or {}
        if isinstance(identifiers, dict):
            for key in ("isbn_13", "isbn_10"):
                val = identifiers.get(key)
                if val:
                    tags.append(f"isbn:{val}")

        tags.append(f"source:{self.name}")

        # Dedup case-insensitively, preserve order.
        seen: set[str] = set()
        out: List[str] = []
        for t in tags:
            s = str(t or "").strip()
            if not s:
                continue
            k = s.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(s)
        return out
|
||||
|
||||
|
||||
class MusicBrainzMetadataProvider(MetadataProvider):
|
||||
"""Metadata provider for MusicBrainz recordings."""
|
||||
|
||||
@@ -517,6 +712,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
|
||||
"openlibrary": OpenLibraryMetadataProvider,
|
||||
"googlebooks": GoogleBooksMetadataProvider,
|
||||
"google": GoogleBooksMetadataProvider,
|
||||
"isbnsearch": ISBNsearchMetadataProvider,
|
||||
"musicbrainz": MusicBrainzMetadataProvider,
|
||||
"ytdlp": YtdlpMetadataProvider,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user