dfdfsdd
This commit is contained in:
@@ -12,7 +12,13 @@ from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from SYS.logger import log, debug
|
||||
try:
|
||||
from Provider.openlibrary import OpenLibrary
|
||||
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
|
||||
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
|
||||
except Exception:
|
||||
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
|
||||
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
|
||||
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
@@ -31,6 +37,10 @@ except ImportError:
|
||||
extract_title = None
|
||||
|
||||
|
||||
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
|
||||
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -691,249 +701,22 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
|
||||
|
||||
|
||||
def _scrape_isbn_metadata(isbn: str) -> List[str]:
|
||||
"""Scrape metadata for an ISBN using Open Library API."""
|
||||
new_tags = []
|
||||
if _ol_scrape_isbn_metadata is None:
|
||||
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||
return []
|
||||
try:
|
||||
from ..API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
isbn_clean = isbn.replace('-', '').strip()
|
||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch ISBN metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No ISBN metadata found for: {isbn}")
|
||||
return []
|
||||
|
||||
book_data = next(iter(data.values()), None)
|
||||
if not book_data:
|
||||
return []
|
||||
|
||||
if 'title' in book_data:
|
||||
new_tags.append(f"title:{book_data['title']}")
|
||||
|
||||
if 'authors' in book_data and isinstance(book_data['authors'], list):
|
||||
for author in book_data['authors'][:3]:
|
||||
if 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
|
||||
if 'publish_date' in book_data:
|
||||
new_tags.append(f"publish_date:{book_data['publish_date']}")
|
||||
|
||||
if 'publishers' in book_data and isinstance(book_data['publishers'], list):
|
||||
for pub in book_data['publishers'][:1]:
|
||||
if 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
|
||||
if 'description' in book_data:
|
||||
desc = book_data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
# Include description if available (limit to 200 chars to keep it manageable)
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
if 'number_of_pages' in book_data:
|
||||
page_count = book_data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
if 'identifiers' in book_data and isinstance(book_data['identifiers'], dict):
|
||||
identifiers = book_data['identifiers']
|
||||
|
||||
if 'openlibrary' in identifiers:
|
||||
ol_ids = identifiers['openlibrary']
|
||||
if isinstance(ol_ids, list) and ol_ids:
|
||||
new_tags.append(f"openlibrary:{ol_ids[0]}")
|
||||
elif isinstance(ol_ids, str):
|
||||
new_tags.append(f"openlibrary:{ol_ids}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc' in identifiers:
|
||||
oclc_list = identifiers['oclc']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
if 'librarything' in identifiers:
|
||||
lt_list = identifiers['librarything']
|
||||
if isinstance(lt_list, list) and lt_list:
|
||||
new_tags.append(f"librarything:{lt_list[0]}")
|
||||
elif isinstance(lt_list, str):
|
||||
new_tags.append(f"librarything:{lt_list}")
|
||||
|
||||
if 'doi' in identifiers:
|
||||
doi_list = identifiers['doi']
|
||||
if isinstance(doi_list, list) and doi_list:
|
||||
new_tags.append(f"doi:{doi_list[0]}")
|
||||
elif isinstance(doi_list, str):
|
||||
new_tags.append(f"doi:{doi_list}")
|
||||
|
||||
if 'internet_archive' in identifiers:
|
||||
ia_list = identifiers['internet_archive']
|
||||
if isinstance(ia_list, list) and ia_list:
|
||||
new_tags.append(f"internet_archive:{ia_list[0]}")
|
||||
elif isinstance(ia_list, str):
|
||||
new_tags.append(f"internet_archive:{ia_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from ISBN lookup")
|
||||
return new_tags
|
||||
return list(_ol_scrape_isbn_metadata(isbn))
|
||||
except Exception as e:
|
||||
log(f"ISBN scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
|
||||
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
|
||||
"""Scrape metadata for an OpenLibrary ID using the .json API endpoint.
|
||||
|
||||
Fetches from https://openlibrary.org/books/{OLID}.json and extracts:
|
||||
- Title, authors, publish date, publishers
|
||||
- Description
|
||||
- Subjects as freeform tags (without namespace prefix)
|
||||
- Identifiers (ISBN, LCCN, OCLC, etc.)
|
||||
"""
|
||||
new_tags = []
|
||||
if _ol_scrape_openlibrary_metadata is None:
|
||||
log("OpenLibrary scraper unavailable", file=sys.stderr)
|
||||
return []
|
||||
try:
|
||||
from ..API.HTTP import HTTPClient
|
||||
import json as json_module
|
||||
|
||||
# Format: OL9674499M or just 9674499M
|
||||
olid_clean = olid.replace('OL', '').replace('M', '')
|
||||
if not olid_clean.isdigit():
|
||||
olid_clean = olid
|
||||
|
||||
# Ensure we have the full OLID format for the URL
|
||||
if not olid.startswith('OL'):
|
||||
url = f"https://openlibrary.org/books/OL{olid_clean}M.json"
|
||||
else:
|
||||
url = f"https://openlibrary.org/books/{olid}.json"
|
||||
|
||||
try:
|
||||
with HTTPClient() as client:
|
||||
response = client.get(url)
|
||||
response.raise_for_status()
|
||||
data = json_module.loads(response.content.decode('utf-8'))
|
||||
except Exception as e:
|
||||
log(f"Failed to fetch OpenLibrary metadata: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
if not data:
|
||||
log(f"No OpenLibrary metadata found for: {olid}")
|
||||
return []
|
||||
|
||||
# Add title
|
||||
if 'title' in data:
|
||||
new_tags.append(f"title:{data['title']}")
|
||||
|
||||
# Add authors
|
||||
if 'authors' in data and isinstance(data['authors'], list):
|
||||
for author in data['authors'][:3]:
|
||||
if isinstance(author, dict) and 'name' in author:
|
||||
new_tags.append(f"author:{author['name']}")
|
||||
elif isinstance(author, str):
|
||||
new_tags.append(f"author:{author}")
|
||||
|
||||
# Add publish date
|
||||
if 'publish_date' in data:
|
||||
new_tags.append(f"publish_date:{data['publish_date']}")
|
||||
|
||||
# Add publishers
|
||||
if 'publishers' in data and isinstance(data['publishers'], list):
|
||||
for pub in data['publishers'][:1]:
|
||||
if isinstance(pub, dict) and 'name' in pub:
|
||||
new_tags.append(f"publisher:{pub['name']}")
|
||||
elif isinstance(pub, str):
|
||||
new_tags.append(f"publisher:{pub}")
|
||||
|
||||
# Add description
|
||||
if 'description' in data:
|
||||
desc = data['description']
|
||||
if isinstance(desc, dict) and 'value' in desc:
|
||||
desc = desc['value']
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if len(desc_str) > 0:
|
||||
new_tags.append(f"description:{desc_str[:200]}")
|
||||
|
||||
# Add number of pages
|
||||
if 'number_of_pages' in data:
|
||||
page_count = data['number_of_pages']
|
||||
if page_count and isinstance(page_count, int) and page_count > 0:
|
||||
new_tags.append(f"pages:{page_count}")
|
||||
|
||||
# Add subjects as FREEFORM tags (no namespace prefix)
|
||||
if 'subjects' in data and isinstance(data['subjects'], list):
|
||||
for subject in data['subjects'][:10]:
|
||||
if subject and isinstance(subject, str):
|
||||
subject_clean = str(subject).strip()
|
||||
if subject_clean and subject_clean not in new_tags:
|
||||
new_tags.append(subject_clean)
|
||||
|
||||
# Add identifiers
|
||||
if 'identifiers' in data and isinstance(data['identifiers'], dict):
|
||||
identifiers = data['identifiers']
|
||||
|
||||
if 'isbn_10' in identifiers:
|
||||
isbn_10_list = identifiers['isbn_10']
|
||||
if isinstance(isbn_10_list, list) and isbn_10_list:
|
||||
new_tags.append(f"isbn_10:{isbn_10_list[0]}")
|
||||
elif isinstance(isbn_10_list, str):
|
||||
new_tags.append(f"isbn_10:{isbn_10_list}")
|
||||
|
||||
if 'isbn_13' in identifiers:
|
||||
isbn_13_list = identifiers['isbn_13']
|
||||
if isinstance(isbn_13_list, list) and isbn_13_list:
|
||||
new_tags.append(f"isbn_13:{isbn_13_list[0]}")
|
||||
elif isinstance(isbn_13_list, str):
|
||||
new_tags.append(f"isbn_13:{isbn_13_list}")
|
||||
|
||||
if 'lccn' in identifiers:
|
||||
lccn_list = identifiers['lccn']
|
||||
if isinstance(lccn_list, list) and lccn_list:
|
||||
new_tags.append(f"lccn:{lccn_list[0]}")
|
||||
elif isinstance(lccn_list, str):
|
||||
new_tags.append(f"lccn:{lccn_list}")
|
||||
|
||||
if 'oclc_numbers' in identifiers:
|
||||
oclc_list = identifiers['oclc_numbers']
|
||||
if isinstance(oclc_list, list) and oclc_list:
|
||||
new_tags.append(f"oclc:{oclc_list[0]}")
|
||||
elif isinstance(oclc_list, str):
|
||||
new_tags.append(f"oclc:{oclc_list}")
|
||||
|
||||
if 'goodreads' in identifiers:
|
||||
goodreads_list = identifiers['goodreads']
|
||||
if isinstance(goodreads_list, list) and goodreads_list:
|
||||
new_tags.append(f"goodreads:{goodreads_list[0]}")
|
||||
elif isinstance(goodreads_list, str):
|
||||
new_tags.append(f"goodreads:{goodreads_list}")
|
||||
|
||||
log(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup")
|
||||
return new_tags
|
||||
return list(_ol_scrape_openlibrary_metadata(olid))
|
||||
except Exception as e:
|
||||
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
Reference in New Issue
Block a user