j
This commit is contained in:
@@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -53,6 +53,8 @@ class Provider(ABC):
|
||||
- validate()
|
||||
"""
|
||||
|
||||
URL: Sequence[str] = ()
|
||||
|
||||
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
||||
self.config = config or {}
|
||||
self.name = self.__class__.__name__.lower()
|
||||
@@ -107,6 +109,30 @@ class Provider(ABC):
|
||||
_ = stage_is_last
|
||||
return False
|
||||
|
||||
@classmethod
def url_patterns(cls) -> Tuple[str, ...]:
    """Return the normalized URL patterns that this provider handles.

    Patterns are read from the class attribute ``URL`` first and then from
    ``URL_DOMAINS``. An attribute is only consulted when it is a list or a
    tuple. Each entry is converted to ``str``, stripped of surrounding
    whitespace and lower-cased; empty entries are dropped.

    Returns:
        A tuple of unique patterns in first-seen order. The tuple is empty
        when neither attribute declares any usable entry.
    """
    patterns: List[str] = []
    seen = set()
    for attr_name in ("URL", "URL_DOMAINS"):
        declared = getattr(cls, attr_name, None)
        if not isinstance(declared, (list, tuple)):
            continue
        for entry in declared:
            try:
                candidate = str(entry or "").strip().lower()
            except Exception:
                # Best-effort: an entry that cannot be turned into a string
                # is skipped rather than breaking pattern discovery.
                continue
            # De-duplicate across both attributes so a pattern repeated in
            # ``URL`` (or listed in both attributes) is reported once.
            if candidate and candidate not in seen:
                seen.add(candidate)
                patterns.append(candidate)
    return tuple(patterns)
|
||||
|
||||
|
||||
class SearchProvider(Provider):
|
||||
"""Compatibility alias for older code.
|
||||
|
||||
@@ -68,6 +68,13 @@ def _supports_upload(provider: Provider) -> bool:
|
||||
return provider.__class__.upload is not Provider.upload
|
||||
|
||||
|
||||
def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]:
|
||||
try:
|
||||
return list(provider_class.url_patterns())
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def get_provider(name: str,
|
||||
config: Optional[Dict[str,
|
||||
Any]] = None) -> Optional[Provider]:
|
||||
@@ -166,47 +173,53 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo
|
||||
def match_provider_name_for_url(url: str) -> Optional[str]:
|
||||
"""Return a registered provider name that claims the URL's domain.
|
||||
|
||||
Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings).
|
||||
Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`.
|
||||
This matcher is intentionally cheap (no provider instantiation, no network).
|
||||
"""
|
||||
|
||||
raw_url = str(url or "").strip()
|
||||
raw_url_lower = raw_url.lower()
|
||||
try:
|
||||
parsed = urlparse(str(url))
|
||||
parsed = urlparse(raw_url)
|
||||
host = (parsed.hostname or "").strip().lower()
|
||||
path = (parsed.path or "").strip()
|
||||
except Exception:
|
||||
host = ""
|
||||
path = ""
|
||||
|
||||
if not host:
|
||||
return None
|
||||
|
||||
# Prefer Internet Archive for archive.org links unless the URL clearly refers
|
||||
# to a borrow/loan flow (handled by OpenLibrary provider).
|
||||
#
|
||||
# This keeps direct downloads and item pages routed to `internetarchive`, while
|
||||
# preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs.
|
||||
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
|
||||
return "openlibrary" if "openlibrary" in _PROVIDERS else None
|
||||
|
||||
if host == "archive.org" or host.endswith(".archive.org"):
|
||||
low_path = str(path or "").lower()
|
||||
is_borrowish = (
|
||||
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
|
||||
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
|
||||
)
|
||||
if is_borrowish:
|
||||
if host:
|
||||
if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
|
||||
return "openlibrary" if "openlibrary" in _PROVIDERS else None
|
||||
return "internetarchive" if "internetarchive" in _PROVIDERS else None
|
||||
|
||||
if host == "archive.org" or host.endswith(".archive.org"):
|
||||
low_path = str(path or "").lower()
|
||||
is_borrowish = (
|
||||
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
|
||||
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
|
||||
)
|
||||
if is_borrowish:
|
||||
return "openlibrary" if "openlibrary" in _PROVIDERS else None
|
||||
return "internetarchive" if "internetarchive" in _PROVIDERS else None
|
||||
|
||||
for name, provider_class in _PROVIDERS.items():
|
||||
domains = getattr(provider_class, "URL_DOMAINS", None)
|
||||
if not isinstance(domains, (list, tuple)):
|
||||
domains = _provider_url_patterns(provider_class)
|
||||
if not domains:
|
||||
continue
|
||||
for d in domains:
|
||||
dom = str(d or "").strip().lower()
|
||||
if not dom:
|
||||
continue
|
||||
if raw_url_lower.startswith(dom):
|
||||
return name
|
||||
for d in domains:
|
||||
dom = str(d or "").strip().lower()
|
||||
if not dom or not host:
|
||||
continue
|
||||
if host == dom or host.endswith("." + dom):
|
||||
return name
|
||||
|
||||
|
||||
Reference in New Issue
Block a user