j

2026-01-01 20:37:27 -08:00
parent f3c79609d8
commit deb05c0d44
35 changed files with 5030 additions and 4879 deletions
--- a/ProviderCore/base.py
+++ b/ProviderCore/base.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple


@dataclass
@@ -53,6 +53,8 @@ class Provider(ABC):
    - validate()
    """

+    URL: Sequence[str] = ()
+
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.name = self.__class__.__name__.lower()
@@ -107,6 +109,30 @@ class Provider(ABC):
        _ = stage_is_last
        return False

+    @classmethod
+    def url_patterns(cls) -> Tuple[str, ...]:
+        """Return the normalized URL patterns that this provider handles.
+
+        Reads the class attributes ``URL`` (first) and ``URL_DOMAINS``; a value
+        that is not a list or tuple is ignored. Entries are stringified,
+        stripped and lower-cased; blanks and duplicates are dropped in order.
+        """
+        patterns: List[str] = []
+        for attr_name in ("URL", "URL_DOMAINS"):
+            declared = getattr(cls, attr_name, None)
+            if not isinstance(declared, (list, tuple)):
+                continue
+            for entry in declared:
+                try:
+                    candidate = str(entry or "").strip().lower()
+                except Exception:
+                    # Best effort: skip entries that cannot be stringified.
+                    continue
+                # Deduplicate uniformly so a repeated entry is listed once.
+                if candidate and candidate not in patterns:
+                    patterns.append(candidate)
+        return tuple(patterns)
+

 class SearchProvider(Provider):
    """Compatibility alias for older code.
--- a/ProviderCore/registry.py
+++ b/ProviderCore/registry.py
@@ -68,6 +68,13 @@ def _supports_upload(provider: Provider) -> bool:
    return provider.__class__.upload is not Provider.upload


+def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]:
+    try:  # best effort: a failing url_patterns() lookup yields no patterns
+        return [*provider_class.url_patterns()]
+    except Exception:
+        return []
+
+
 def get_provider(name: str,
                 config: Optional[Dict[str,
                                       Any]] = None) -> Optional[Provider]:
@@ -166,47 +173,53 @@ def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bo
 def match_provider_name_for_url(url: str) -> Optional[str]:
    """Return a registered provider name that claims the URL's domain.

-    Providers can declare domains via a class attribute `URL_DOMAINS` (sequence of strings).
+    Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`.
    This matcher is intentionally cheap (no provider instantiation, no network).
    """

+    raw_url = str(url or "").strip()
+    raw_url_lower = raw_url.lower()
    try:
-        parsed = urlparse(str(url))
+        parsed = urlparse(raw_url)
        host = (parsed.hostname or "").strip().lower()
        path = (parsed.path or "").strip()
    except Exception:
        host = ""
        path = ""

-    if not host:
-        return None
-
    # Prefer Internet Archive for archive.org links unless the URL clearly refers
    # to a borrow/loan flow (handled by OpenLibrary provider).
    #
    # This keeps direct downloads and item pages routed to `internetarchive`, while
    # preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs.
-    if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
-        return "openlibrary" if "openlibrary" in _PROVIDERS else None
-
-    if host == "archive.org" or host.endswith(".archive.org"):
-        low_path = str(path or "").lower()
-        is_borrowish = (
-            low_path.startswith("/borrow/") or low_path.startswith("/stream/")
-            or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
-        )
-        if is_borrowish:
+    if host:
+        if host == "openlibrary.org" or host.endswith(".openlibrary.org"):
            return "openlibrary" if "openlibrary" in _PROVIDERS else None
-        return "internetarchive" if "internetarchive" in _PROVIDERS else None
+
+        if host == "archive.org" or host.endswith(".archive.org"):
+            low_path = str(path or "").lower()
+            is_borrowish = (
+                low_path.startswith("/borrow/") or low_path.startswith("/stream/")
+                or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
+            )
+            if is_borrowish:
+                return "openlibrary" if "openlibrary" in _PROVIDERS else None
+            return "internetarchive" if "internetarchive" in _PROVIDERS else None

    for name, provider_class in _PROVIDERS.items():
-        domains = getattr(provider_class, "URL_DOMAINS", None)
-        if not isinstance(domains, (list, tuple)):
+        domains = _provider_url_patterns(provider_class)
+        if not domains:
            continue
        for d in domains:
            dom = str(d or "").strip().lower()
            if not dom:
                continue
+            if raw_url_lower.startswith(dom):
+                return name
+        for d in domains:
+            dom = str(d or "").strip().lower()
+            if not dom or not host:
+                continue
            if host == dom or host.endswith("." + dom):
                return name