This commit is contained in:
2026-01-01 20:37:27 -08:00
parent f3c79609d8
commit deb05c0d44
35 changed files with 5030 additions and 4879 deletions

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Sequence, Tuple
@dataclass
@@ -53,6 +53,8 @@ class Provider(ABC):
- validate()
"""
URL: Sequence[str] = ()
def __init__(self, config: Optional[Dict[str, Any]] = None):
self.config = config or {}
self.name = self.__class__.__name__.lower()
@@ -107,6 +109,30 @@ class Provider(ABC):
_ = stage_is_last
return False
@classmethod
def url_patterns(cls) -> Tuple[str, ...]:
    """Return normalized URL patterns that this provider handles.

    Patterns are collected from the class attribute ``URL`` first and then
    from ``URL_DOMAINS``, so ``URL`` entries keep priority in the result.
    Each entry is converted to ``str``, stripped of surrounding whitespace
    and lower-cased; empty or falsy entries are dropped.

    An attribute may be a list, a tuple, or a single bare string (treated
    as one pattern rather than being silently ignored). Any other type is
    skipped.

    Returns:
        A tuple of unique patterns in first-seen order. Empty when the
        class declares neither attribute (or declares nothing usable).
    """
    patterns: List[str] = []
    seen = set()
    for attr_name in ("URL", "URL_DOMAINS"):
        declared = getattr(cls, attr_name, None)
        if isinstance(declared, str):
            # A lone string is a common slip (e.g. ``URL = ("x.com")``);
            # wrap it so it is not iterated character by character.
            declared = (declared,)
        elif not isinstance(declared, (list, tuple)):
            continue
        for entry in declared:
            try:
                candidate = str(entry or "").strip().lower()
            except Exception:
                # Best-effort: an entry whose __str__/__bool__ misbehaves
                # must not prevent the remaining patterns from loading.
                continue
            if candidate and candidate not in seen:
                seen.add(candidate)
                patterns.append(candidate)
    return tuple(patterns)
class SearchProvider(Provider):
"""Compatibility alias for older code.

View File

@@ -68,6 +68,13 @@ def _supports_upload(provider: Provider) -> bool:
return provider.__class__.upload is not Provider.upload
def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]:
try:
return list(provider_class.url_patterns())
except Exception:
return []
def get_provider(name: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[Provider]:
@@ -166,47 +173,53 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo
def match_provider_name_for_url(url: str) -> Optional[str]:
"""Return a registered provider name that claims the URL's domain.
Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings).
Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`.
This matcher is intentionally cheap (no provider instantiation, no network).
"""
raw_url = str(url or "").strip()
raw_url_lower = raw_url.lower()
try:
parsed = urlparse(str(url))
parsed = urlparse(raw_url)
host = (parsed.hostname or "").strip().lower()
path = (parsed.path or "").strip()
except Exception:
host = ""
path = ""
if not host:
return None
# Prefer Internet Archive for archive.org links unless the URL clearly refers
# to a borrow/loan flow (handled by OpenLibrary provider).
#
# This keeps direct downloads and item pages routed to `internetarchive`, while
# preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs.
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
return "openlibrary" if "openlibrary" in _PROVIDERS else None
if host == "archive.org" or host.endswith(".archive.org"):
low_path = str(path or "").lower()
is_borrowish = (
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
)
if is_borrowish:
if host:
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "internetarchive" if "internetarchive" in _PROVIDERS else None
if host == "archive.org" or host.endswith(".archive.org"):
low_path = str(path or "").lower()
is_borrowish = (
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
)
if is_borrowish:
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "internetarchive" if "internetarchive" in _PROVIDERS else None
for name, provider_class in _PROVIDERS.items():
domains = getattr(provider_class, "URL_DOMAINS", None)
if not isinstance(domains, (list, tuple)):
domains = _provider_url_patterns(provider_class)
if not domains:
continue
for d in domains:
dom = str(d or "").strip().lower()
if not dom:
continue
if raw_url_lower.startswith(dom):
return name
for d in domains:
dom = str(d or "").strip().lower()
if not dom or not host:
continue
if host == dom or host.endswith("." + dom):
return name