This commit is contained in:
2026-03-22 22:41:56 -07:00
parent 67ba6cb3d1
commit 23a73a94e6
6 changed files with 956 additions and 179 deletions

View File

@@ -361,6 +361,44 @@ def is_download_file_url(url: str) -> bool:
)
def _archive_item_access(identifier: str) -> Dict[str, Any]:
ident = str(identifier or "").strip()
if not ident:
return {"mediatype": "", "lendable": False, "collection": []}
session = requests.Session()
try:
response = session.get(f"https://archive.org/metadata/{ident}", timeout=8)
response.raise_for_status()
data = response.json() if response is not None else {}
except Exception:
return {"mediatype": "", "lendable": False, "collection": []}
finally:
try:
session.close()
except Exception:
pass
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
if not isinstance(meta, dict):
meta = {}
mediatype = str(meta.get("mediatype") or "").strip().lower()
collection = meta.get("collection")
values: List[str] = []
if isinstance(collection, list):
values = [str(x).strip().lower() for x in collection if str(x).strip()]
elif isinstance(collection, str) and collection.strip():
values = [collection.strip().lower()]
lendable = any(v in {"inlibrary", "lendinglibrary"} for v in values)
return {
"mediatype": mediatype,
"lendable": lendable,
"collection": values,
}
def list_download_files(identifier: str) -> List[Dict[str, Any]]:
"""Return a sorted list of downloadable files for an IA identifier.
@@ -620,6 +658,11 @@ class InternetArchive(Provider):
quiet_mode: bool,
) -> Optional[int]:
"""Generic hook for download-file to show a selection table for IA items."""
try:
if self._should_delegate_borrow(str(url or "")):
return None
except Exception:
pass
from SYS.field_access import get_field as sh_get_field
return maybe_show_formats_table(
raw_urls=[url] if url else [],
@@ -638,6 +681,72 @@ class InternetArchive(Provider):
self._collection = conf.get("collection") or conf.get("default_collection")
self._mediatype = conf.get("mediatype") or conf.get("default_mediatype")
@staticmethod
def _should_delegate_borrow(url: str) -> bool:
    """Report whether *url* is a details page of a lendable ``texts`` item.

    The URL must satisfy ``is_details_url`` and yield an identifier via
    ``extract_identifier``; the item's access info is then looked up with
    ``_archive_item_access``.

    Args:
        url: Candidate URL; ``None`` or blank values return ``False``.

    Returns:
        ``True`` only when the item is flagged lendable and its mediatype
        is exactly ``"texts"``.
    """
    candidate = str(url or "").strip()

    # Only details pages are eligible; anything else never reaches the lookup.
    identifier = extract_identifier(candidate) if is_details_url(candidate) else None
    if not identifier:
        return False

    access = _archive_item_access(identifier)
    if not access.get("lendable"):
        return False
    return str(access.get("mediatype") or "") == "texts"
def _download_via_openlibrary(self, url: str, output_dir: Path) -> Optional[Any]:
    """Download *url* through the OpenLibrary provider and normalise the result.

    Args:
        url: URL handed unchanged to ``OpenLibrary.download_url``.
        output_dir: Destination directory handed to the same call.

    Returns:
        ``None`` when the OpenLibrary provider cannot be imported, or when
        its dict result carries no usable ``path`` (so callers can fall back
        to their regular download path). A non-dict result is returned
        unchanged. Otherwise a dict with ``path``, ``media_kind`` (``"book"``)
        and ``provider_action`` (``"borrow"``), plus ``metadata`` /
        ``full_metadata``, ``title`` and ``tags`` when the provider's
        ``search_result`` supplies them.

    Raises:
        Whatever ``OpenLibrary(...)`` or its ``download_url`` raises; only
        errors from closing the provider session are suppressed.
    """
    try:
        from Provider.openlibrary import OpenLibrary
    except Exception as exc:
        log(f"[internetarchive] OpenLibrary borrow helper unavailable: {exc}", file=sys.stderr)
        return None

    provider = OpenLibrary(self.config)
    try:
        result = provider.download_url(url, output_dir)
    finally:
        # Cleanup is best effort: an error raised here would mask the
        # outcome (or the exception) of the download itself.
        try:
            session = getattr(provider, "_session", None)
            if session is not None:
                session.close()
        except Exception:
            pass

    if not isinstance(result, dict):
        return result

    # A dict without a path is not a completed download; report failure so
    # every caller treats it the same way.
    path = result.get("path")
    if not path:
        return None

    search_result = result.get("search_result")
    title: Optional[str] = None
    metadata: Dict[str, Any] = {}
    tags: List[str] = []
    if search_result is not None:
        # Each attribute is read defensively so that one malformed field
        # does not discard the others.
        try:
            title = str(getattr(search_result, "title", "") or "").strip() or None
        except Exception:
            title = None
        try:
            metadata = dict(getattr(search_result, "full_metadata", {}) or {})
        except Exception:
            metadata = {}
        try:
            tags_val = getattr(search_result, "tag", None)
            if isinstance(tags_val, (set, frozenset)):
                # Unordered input: sort for a deterministic tag order.
                tags = [str(t) for t in sorted(tags_val) if t]
            elif isinstance(tags_val, (list, tuple)):
                tags = [str(t) for t in tags_val if t]
        except Exception:
            tags = []

    normalized: Dict[str, Any] = {"path": path}
    if metadata:
        normalized["metadata"] = metadata
        normalized["full_metadata"] = metadata
    if title:
        normalized["title"] = title
    if tags:
        normalized["tags"] = tags
    normalized["media_kind"] = "book"
    normalized["provider_action"] = "borrow"
    return normalized
def validate(self) -> bool:
try:
_ia()
@@ -824,13 +933,18 @@ class InternetArchive(Provider):
return out
def download_url(self, url: str, output_dir: Path) -> Optional[Path]:
def download_url(self, url: str, output_dir: Path) -> Optional[Any]:
"""Download an Internet Archive URL.
Supports:
- https://archive.org/details/<identifier>
- https://archive.org/download/<identifier>/<filename>
"""
if self._should_delegate_borrow(url):
delegated = self._download_via_openlibrary(url, output_dir)
if delegated is not None:
return delegated
sr = SearchResult(
table="internetarchive",
title=str(url),
@@ -842,6 +956,15 @@ class InternetArchive(Provider):
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
raw_path = str(getattr(result, "path", "") or "").strip()
if self._should_delegate_borrow(raw_path):
delegated = self._download_via_openlibrary(raw_path, output_dir)
if isinstance(delegated, dict):
delegated_path = delegated.get("path")
if delegated_path:
return Path(str(delegated_path))
if isinstance(delegated, (str, Path)):
return Path(str(delegated))
# Fast path for explicit IA file URLs.
# This uses the shared direct downloader, which already integrates with
# pipeline transfer progress bars.