This commit is contained in:
2026-01-01 20:37:27 -08:00
parent f3c79609d8
commit deb05c0d44
35 changed files with 5030 additions and 4879 deletions

View File

@@ -68,6 +68,13 @@ def _supports_upload(provider: Provider) -> bool:
return provider.__class__.upload is not Provider.upload
def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]:
try:
return list(provider_class.url_patterns())
except Exception:
return []
def get_provider(name: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[Provider]:
@@ -166,47 +173,53 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo
def match_provider_name_for_url(url: str) -> Optional[str]:
"""Return a registered provider name that claims the URL's domain.
Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings).
Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`.
This matcher is intentionally cheap (no provider instantiation, no network).
"""
raw_url = str(url or "").strip()
raw_url_lower = raw_url.lower()
try:
parsed = urlparse(str(url))
parsed = urlparse(raw_url)
host = (parsed.hostname or "").strip().lower()
path = (parsed.path or "").strip()
except Exception:
host = ""
path = ""
if not host:
return None
# Prefer Internet Archive for archive.org links unless the URL clearly refers
# to a borrow/loan flow (handled by OpenLibrary provider).
#
# This keeps direct downloads and item pages routed to `internetarchive`, while
# preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs.
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
return "openlibrary" if "openlibrary" in _PROVIDERS else None
if host == "archive.org" or host.endswith(".archive.org"):
low_path = str(path or "").lower()
is_borrowish = (
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
)
if is_borrowish:
if host:
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "internetarchive" if "internetarchive" in _PROVIDERS else None
if host == "archive.org" or host.endswith(".archive.org"):
low_path = str(path or "").lower()
is_borrowish = (
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
)
if is_borrowish:
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "internetarchive" if "internetarchive" in _PROVIDERS else None
for name, provider_class in _PROVIDERS.items():
domains = getattr(provider_class, "URL_DOMAINS", None)
if not isinstance(domains, (list, tuple)):
domains = _provider_url_patterns(provider_class)
if not domains:
continue
for d in domains:
dom = str(d or "").strip().lower()
if not dom:
continue
if raw_url_lower.startswith(dom):
return name
for d in domains:
dom = str(d or "").strip().lower()
if not dom or not host:
continue
if host == dom or host.endswith("." + dom):
return name