This commit is contained in:
nose
2025-12-14 00:53:52 -08:00
parent 52a79b0086
commit a03eb0d1be
24 changed files with 2785 additions and 1868 deletions

View File

@@ -12,7 +12,13 @@ from __future__ import annotations
import sys
from SYS.logger import log, debug
try:
from Provider.openlibrary import OpenLibrary
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
except Exception:
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
import subprocess
from pathlib import Path
@@ -31,6 +37,10 @@ except ImportError:
extract_title = None
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
def _scrape_isbn_metadata(isbn: str) -> List[str]:
    """Scrape metadata tags for an ISBN via the OpenLibrary provider.

    Thin delegation wrapper: the actual HTTP lookup and tag extraction
    (title, authors, publish date, identifiers, ...) live in
    ``Provider.openlibrary.OpenLibrary.scrape_isbn_metadata``.

    Args:
        isbn: ISBN string; normalization (e.g. stripping hyphens) is the
            provider's responsibility.

    Returns:
        A list of tag strings (e.g. ``"title:..."``, ``"author:..."``), or
        ``[]`` when the provider is unavailable or the lookup fails.
    """
    # The provider import may have failed at module load time (see the
    # try/except around the Provider.openlibrary import); degrade
    # gracefully instead of raising NameError/TypeError here.
    if _ol_scrape_isbn_metadata is None:
        log("OpenLibrary scraper unavailable", file=sys.stderr)
        return []
    try:
        # Coerce to a concrete list so callers always get a plain list,
        # regardless of what iterable the provider returns.
        return list(_ol_scrape_isbn_metadata(isbn))
    except Exception as e:
        # Best-effort lookup: log the failure and return no tags rather
        # than propagating network/parse errors to the caller.
        log(f"ISBN scraping error: {e}", file=sys.stderr)
        return []
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
    """Scrape metadata tags for an OpenLibrary ID via the OpenLibrary provider.

    Thin delegation wrapper: fetching the ``/books/{OLID}.json`` endpoint and
    extracting title/authors/subjects/identifiers is handled by
    ``Provider.openlibrary.OpenLibrary.scrape_openlibrary_metadata``.

    Args:
        olid: OpenLibrary edition ID (e.g. ``"OL9674499M"``); format
            normalization is the provider's responsibility.

    Returns:
        A list of tag strings, or ``[]`` when the provider is unavailable or
        the lookup fails.
    """
    # The provider import may have failed at module load time; degrade
    # gracefully instead of raising here.
    if _ol_scrape_openlibrary_metadata is None:
        log("OpenLibrary scraper unavailable", file=sys.stderr)
        return []
    try:
        # Coerce to a concrete list so callers always get a plain list.
        return list(_ol_scrape_openlibrary_metadata(olid))
    except Exception as e:
        # Best-effort lookup: log and return no tags rather than propagating.
        log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
        return []