re
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled

This commit is contained in:
nose
2025-12-25 04:49:22 -08:00
parent 2542a68479
commit 43afa4e3fa
19 changed files with 2766 additions and 234 deletions

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Type, cast
import html as html_std
import re
import requests
import sys
import json
@@ -279,6 +281,199 @@ class GoogleBooksMetadataProvider(MetadataProvider):
return tags
class ISBNsearchMetadataProvider(MetadataProvider):
    """Metadata provider that scrapes isbnsearch.org by ISBN.

    This is a best-effort HTML scrape: any fetch or parse failure yields an
    empty result list rather than an exception. The query is expected to be
    (or to contain) an ISBN-10 or ISBN-13.
    """

    @property
    def name(self) -> str:  # type: ignore[override]
        # Registry key; also emitted as the trailing "source:" tag.
        return "isbnsearch"

    @staticmethod
    def _strip_html_to_text(raw: str) -> str:
        """Collapse an HTML fragment into single-line plain text."""
        s = html_std.unescape(str(raw or ""))
        s = re.sub(r"(?i)<br\s*/?>", "\n", s)
        s = re.sub(r"<[^>]+>", " ", s)
        # Squeeze all whitespace runs (including <br>-derived newlines).
        s = re.sub(r"\s+", " ", s)
        return s.strip()

    @staticmethod
    def _clean_isbn(query: str) -> str:
        """Return a normalized (uppercase, separator-free) ISBN from *query*.

        Accepts "isbn:"-prefixed and hyphen/space-separated forms. Returns
        "" when no 10- or 13-character ISBN-like token can be found.
        """
        s = str(query or "").strip()
        if not s:
            return ""
        s = s.replace("isbn:", "").replace("ISBN:", "")
        # Keep only digits and the ISBN-10 check character X.
        s = re.sub(r"[^0-9Xx]", "", s).upper()
        if len(s) in (10, 13):
            return s
        # Try to locate an ISBN-like token inside the query.
        m = re.search(r"\b(?:97[89])?\d{9}[\dXx]\b", s)
        return str(m.group(0)).upper() if m else ""

    def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
        """Look up *query* (an ISBN) on isbnsearch.org.

        Returns a list containing at most one result dict (the site shows a
        single book per ISBN, so *limit* is accepted only for interface
        compatibility), or [] when the query is not an ISBN, the fetch
        fails, or nothing usable was parsed.
        """
        _ = limit
        isbn = self._clean_isbn(query)
        if not isbn:
            return []
        url = f"https://isbnsearch.org/isbn/{isbn}"
        try:
            # NOTE(review): scrape targets commonly reject the default
            # python-requests User-Agent, so send a browser-like one.
            resp = requests.get(
                url,
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0 (compatible; metadata-fetcher)"},
            )
            resp.raise_for_status()
            html = str(resp.text or "")
            if not html:
                return []
        except Exception as exc:
            # Best-effort: log and return no results on any failure.
            log(f"ISBNsearch scrape failed: {exc}", file=sys.stderr)
            return []
        # The book title is the page's first-level heading.
        title = ""
        m_title = re.search(r"(?is)<h1\b[^>]*>(.*?)</h1>", html)
        if m_title:
            title = self._strip_html_to_text(m_title.group(1))
        # Parse "<strong>Label:</strong> value" pairs. Each value runs from
        # the end of its label to the next <strong> (or end of document),
        # truncated at the first </p> or <br> so it stays in its paragraph.
        raw_fields: Dict[str, str] = {}
        strong_matches = list(re.finditer(r"(?is)<strong\b[^>]*>(.*?)</strong>", html))
        for idx, m in enumerate(strong_matches):
            label = str(self._strip_html_to_text(m.group(1)) or "").strip()
            if not label:
                continue
            if label.endswith(":"):
                label = label[:-1].strip()
            chunk_start = m.end()
            # Stop at next <strong> or end of document.
            chunk_end = strong_matches[idx + 1].start() if (idx + 1) < len(strong_matches) else len(html)
            chunk = html[chunk_start:chunk_end]
            # Prefer stopping within the same paragraph when possible.
            m_end = re.search(r"(?is)(</p>|<br\s*/?>)", chunk)
            if m_end:
                chunk = chunk[: m_end.start()]
            val_text = self._strip_html_to_text(chunk)
            if not val_text:
                continue
            raw_fields[label] = val_text

        def _get(*labels: str) -> str:
            """Case-insensitive lookup of the first matching field label."""
            for lab in labels:
                for k, v in raw_fields.items():
                    if str(k).strip().lower() == str(lab).strip().lower():
                        return str(v or "").strip()
            return ""

        # Map common ISBNsearch labels onto our fields.
        author_text = _get("Author", "Authors", "Author(s)")
        publisher = _get("Publisher")
        published = _get("Published", "Publication Date", "Publish Date")
        language = _get("Language")
        pages = _get("Pages")
        isbn_13 = _get("ISBN-13", "ISBN13")
        isbn_10 = _get("ISBN-10", "ISBN10")
        # "Published" is free-form; keep just a 4-digit year if present.
        year = ""
        if published:
            m_year = re.search(r"\b(\d{4})\b", published)
            year = str(m_year.group(1)) if m_year else ""
        authors: List[str] = []
        if author_text:
            # Split on common separators; keep multi-part names intact.
            for part in re.split(r"\s*(?:,|;|\band\b|\&|\|)\s*", author_text, flags=re.IGNORECASE):
                p = str(part or "").strip()
                if p:
                    authors.append(p)
        # Prefer the parsed heading, but fall back to og:title if needed.
        if not title:
            # HTML attribute order is not guaranteed, so accept the meta
            # tag with property= before content= and the reverse.
            m_og = re.search(
                r"(?is)<meta\b[^>]*property=['\"]og:title['\"][^>]*content=['\"](.*?)['\"][^>]*>",
                html,
            ) or re.search(
                r"(?is)<meta\b[^>]*content=['\"](.*?)['\"][^>]*property=['\"]og:title['\"][^>]*>",
                html,
            )
            if m_og:
                title = self._strip_html_to_text(m_og.group(1))
        # Normalize and dedupe ISBN tokens (page fields first, query last).
        isbn_tokens: List[str] = []
        for token in [isbn_13, isbn_10, isbn]:
            t = self._clean_isbn(token)
            if t and t not in isbn_tokens:
                isbn_tokens.append(t)
        item: Dict[str, Any] = {
            "title": title or "",
            # Keep UI columns compatible with the generic metadata table.
            "artist": ", ".join(authors) if authors else "",
            "album": publisher or "",
            "year": year or "",
            "provider": self.name,
            "authors": authors,
            "publisher": publisher or "",
            "language": language or "",
            "pages": pages or "",
            "identifiers": {
                "isbn_13": next((t for t in isbn_tokens if len(t) == 13), None),
                "isbn_10": next((t for t in isbn_tokens if len(t) == 10), None),
            },
            "raw_fields": raw_fields,
        }
        # Only return usable items (a title or at least one identifier).
        if not item.get("title") and not any(item["identifiers"].values()):
            return []
        return [item]

    def to_tags(self, item: Dict[str, Any]) -> List[str]:
        """Convert a search-result dict into "key:value" tag strings.

        Emits title/author/publisher/year/language/isbn tags plus a
        trailing "source:" tag, deduplicated case-insensitively while
        preserving first-seen order and original casing.
        """
        tags: List[str] = []
        title = str(item.get("title") or "").strip()
        if title:
            tags.append(f"title:{title}")
        authors = item.get("authors") or []
        if isinstance(authors, list):
            for a in authors:
                a = str(a or "").strip()
                if a:
                    tags.append(f"author:{a}")
        publisher = str(item.get("publisher") or "").strip()
        if publisher:
            tags.append(f"publisher:{publisher}")
        year = str(item.get("year") or "").strip()
        if year:
            tags.append(f"year:{year}")
        language = str(item.get("language") or "").strip()
        if language:
            tags.append(f"language:{language}")
        identifiers = item.get("identifiers") or {}
        if isinstance(identifiers, dict):
            for key in ("isbn_13", "isbn_10"):
                val = identifiers.get(key)
                if val:
                    tags.append(f"isbn:{val}")
        tags.append(f"source:{self.name}")
        # Dedup case-insensitively, preserve order.
        seen: set[str] = set()
        out: List[str] = []
        for t in tags:
            s = str(t or "").strip()
            if not s:
                continue
            k = s.lower()
            if k in seen:
                continue
            seen.add(k)
            out.append(s)
        return out
class MusicBrainzMetadataProvider(MetadataProvider):
"""Metadata provider for MusicBrainz recordings."""
@@ -517,6 +712,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
"openlibrary": OpenLibraryMetadataProvider,
"googlebooks": GoogleBooksMetadataProvider,
"google": GoogleBooksMetadataProvider,
"isbnsearch": ISBNsearchMetadataProvider,
"musicbrainz": MusicBrainzMetadataProvider,
"ytdlp": YtdlpMetadataProvider,
}