This commit is contained in:
2026-01-05 07:51:19 -08:00
parent 8545367e28
commit 1f765cffda
32 changed files with 3447 additions and 3250 deletions

View File

@@ -26,8 +26,7 @@ class SearchResult:
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for pipeline processing."""
return {
out = {
"table": self.table,
"title": self.title,
"path": self.path,
@@ -40,6 +39,15 @@ class SearchResult:
"full_metadata": self.full_metadata,
}
try:
url_value = getattr(self, "url", None)
if url_value is not None:
out["url"] = url_value
except Exception:
pass
return out
class Provider(ABC):
"""Unified provider base class.

View File

@@ -1,75 +1,238 @@
"""Provider registry.
Concrete provider implementations live in the `Provider/` package.
This module is the single source of truth for provider discovery.
Concrete provider implementations live in the ``Provider`` package. This module
is the single source of truth for discovery, metadata, and lifecycle helpers
for those plugins.
"""
from __future__ import annotations
from typing import Any, Dict, Optional, Sequence, Type
import importlib
import pkgutil
import sys
from dataclasses import dataclass, field
from types import ModuleType
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Type
from urllib.parse import urlparse
from SYS.logger import log
from ProviderCore.base import Provider, SearchProvider, FileProvider, SearchResult
from Provider.alldebrid import AllDebrid
from Provider.bandcamp import Bandcamp
from Provider.libgen import Libgen
from Provider.matrix import Matrix
from Provider.openlibrary import OpenLibrary
from Provider.soulseek import Soulseek, download_soulseek_file
from Provider.telegram import Telegram
from Provider.youtube import YouTube
from Provider.fileio import FileIO
from Provider.zeroxzero import ZeroXZero
from Provider.loc import LOC
from Provider.internetarchive import InternetArchive
from Provider.podcastindex import PodcastIndex
from Provider.HIFI import HIFI
from ProviderCore.base import FileProvider, Provider, SearchProvider, SearchResult
from Provider.soulseek import download_soulseek_file
# Static mapping of lowercase provider names to their provider classes.
# Functions in this module lowercase the requested name before looking it up
# here, so every key must already be lowercase.
# NOTE(review): the registry-based lookups (``REGISTRY``) appear to supersede
# this mapping -- confirm whether it is still needed.
_PROVIDERS: Dict[str,
Type[Provider]] = {
# Search-capable providers
"alldebrid": AllDebrid,
"libgen": Libgen,
"openlibrary": OpenLibrary,
"internetarchive": InternetArchive,
"hifi": HIFI,
"soulseek": Soulseek,
"bandcamp": Bandcamp,
"youtube": YouTube,
"telegram": Telegram,
"loc": LOC,
"podcastindex": PodcastIndex,
# Upload-capable providers
"0x0": ZeroXZero,
"file.io": FileIO,
"matrix": Matrix,
}
@dataclass(frozen=True)
class ProviderInfo:
    """Immutable description of one registered provider.

    Instances are frozen, so a registration is replaced by building a new
    ``ProviderInfo`` rather than by mutating an existing one.
    """

    # Primary name the provider is registered under.
    canonical_name: str
    # Class that is instantiated when the provider is requested.
    provider_class: Type[Provider]
    # Dotted name of the module the class was registered from.
    module: str
    # Additional names that resolve to the same provider.
    alias_names: Tuple[str, ...] = field(default_factory=tuple)

    @property
    def supports_search(self) -> bool:
        """Return True when the class overrides ``Provider.search``."""
        implementation = self.provider_class.search
        return implementation is not Provider.search

    @property
    def supports_upload(self) -> bool:
        """Return True when the class overrides ``Provider.upload``."""
        implementation = self.provider_class.upload
        return implementation is not Provider.upload
class ProviderRegistry:
    """Discover, register, and look up provider classes by name.

    Names are compared case-insensitively after stripping whitespace. Each
    provider has one canonical name and zero or more aliases; both resolve to
    the same ``ProviderInfo`` through :meth:`get`. A canonical name always
    takes precedence over an alias of the same spelling.
    """

    def __init__(self, package_name: str) -> None:
        """Create an empty registry bound to ``package_name``.

        Nothing is imported here. Discovery runs on the first call to
        :meth:`discover`, :meth:`get`, :meth:`iter_providers` or
        :meth:`has_name`.
        """
        self.package_name = (package_name or "").strip()
        # canonical name -> info (exactly one entry per provider)
        self._infos: Dict[str, ProviderInfo] = {}
        # canonical name or alias -> info
        self._lookup: Dict[str, ProviderInfo] = {}
        # names of modules that have already been scanned
        self._modules: set[str] = set()
        self._discovered = False

    def _normalize(self, value: Any) -> str:
        """Return ``value`` as a stripped, lowercase lookup key."""
        return str(value or "").strip().lower()

    @staticmethod
    def _iter_names(values: Any) -> Tuple[Any, ...]:
        """Return ``values`` as a tuple, treating a bare string as ONE name.

        Without this, ``PROVIDER_ALIASES = "foo"`` would be iterated character
        by character and register ``f`` and ``o`` as aliases.
        """
        if not values:
            return ()
        if isinstance(values, str):
            return (values,)
        return tuple(values)

    def _candidate_names(self,
                         provider_class: Type[Provider],
                         override_name: Optional[str]) -> List[str]:
        """Collect the ordered, de-duplicated names for ``provider_class``.

        The first entry becomes the canonical name. When ``override_name`` is
        given it replaces the class-derived names (``PROVIDER_NAME``, ``NAME``,
        ``__name__``); ``PROVIDER_ALIASES`` are appended in either case.
        Original casing is kept; duplicates are detected case-insensitively.
        """
        if override_name:
            raw_names: List[Any] = [override_name]
        else:
            raw_names = [
                getattr(provider_class, attr, None)
                for attr in ("PROVIDER_NAME", "NAME", "__name__")
            ]
        raw_names.extend(
            self._iter_names(getattr(provider_class, "PROVIDER_ALIASES", ()))
        )

        names: List[str] = []
        seen: set[str] = set()
        for raw in raw_names:
            text = str(raw or "").strip()
            key = text.lower()
            if not text or key in seen:
                continue
            seen.add(key)
            names.append(text)
        return names

    def register(
        self,
        provider_class: Type[Provider],
        *,
        override_name: Optional[str] = None,
        extra_aliases: Optional[Sequence[str]] = None,
        module_name: Optional[str] = None,
        replace: bool = False,
    ) -> ProviderInfo:
        """Register a provider class with canonical and alias names.

        Args:
            provider_class: Class to register.
            override_name: Canonical name to use instead of the class-derived one.
            extra_aliases: Additional alias names (a bare string counts as one).
            module_name: Module recorded on the info; defaults to the class's
                ``__module__``.
            replace: When False (default) an existing registration under the
                same canonical name is kept and returned unchanged.

        Returns:
            The ``ProviderInfo`` now stored for the canonical name.

        Raises:
            ValueError: If no usable name can be derived.
        """
        candidates = self._candidate_names(provider_class, override_name)
        if not candidates:
            raise ValueError("provider name candidates are required")
        canonical = self._normalize(candidates[0])
        if not canonical:
            raise ValueError("provider name must not be empty")

        existing = self._infos.get(canonical)
        if existing is not None and not replace:
            return existing

        alias_names: List[str] = []
        seen: set[str] = {canonical}
        for raw in (*candidates[1:], *self._iter_names(extra_aliases)):
            alias = self._normalize(raw)
            if not alias or alias in seen:
                continue
            seen.add(alias)
            if alias in self._infos:
                # An alias must never hijack another provider's canonical
                # name, otherwise get() and iter_providers() would disagree.
                log(
                    f"[provider] Alias '{alias}' for '{canonical}' ignored: "
                    "already the canonical name of another provider",
                    file=sys.stderr,
                )
                continue
            alias_names.append(alias)

        info = ProviderInfo(
            canonical_name=canonical,
            provider_class=provider_class,
            module=module_name or getattr(provider_class, "__module__", "") or "",
            alias_names=tuple(alias_names),
        )

        if existing is not None:
            # Replacing: drop every key that still resolves to the old entry
            # so aliases the new registration does not declare cannot keep
            # returning the stale class.
            stale_keys = [
                key for key, value in self._lookup.items() if value is existing
            ]
            for key in stale_keys:
                del self._lookup[key]

        self._infos[canonical] = info
        self._lookup[canonical] = info
        for alias in alias_names:
            self._lookup[alias] = info
        return info

    def _register_module(self, module: ModuleType) -> None:
        """Register every concrete provider class defined in ``module``.

        Only classes whose ``__module__`` matches the module are considered,
        so re-exported classes are not registered twice. Each module is
        scanned at most once.
        """
        module_name = getattr(module, "__name__", "")
        if not module_name or module_name in self._modules:
            return
        self._modules.add(module_name)

        base_classes = (Provider, SearchProvider, FileProvider)
        for attr in dir(module):
            # Default to None so an attribute listed by dir() but missing on
            # access cannot abort the scan of the whole module.
            candidate = getattr(module, attr, None)
            if not isinstance(candidate, type) or not issubclass(candidate, Provider):
                continue
            if any(candidate is base for base in base_classes):
                continue
            if getattr(candidate, "__module__", "") != module_name:
                continue
            try:
                self.register(candidate, module_name=module_name)
            except Exception as exc:
                log(f"[provider] Failed to register {module_name}.{candidate.__name__}: {exc}", file=sys.stderr)

    def discover(self) -> None:
        """Import and register providers from the package.

        Runs at most once. Import failures are logged and skipped so one
        broken provider module does not prevent the others from loading.
        """
        if self._discovered or not self.package_name:
            return
        # Set the flag before importing: get()/iter_providers() call
        # discover(), so a lookup made while a provider module is being
        # imported must return immediately instead of recursing.
        self._discovered = True

        try:
            package = importlib.import_module(self.package_name)
        except Exception as exc:
            log(f"[provider] Failed to import package {self.package_name}: {exc}", file=sys.stderr)
            return

        self._register_module(package)
        package_path = getattr(package, "__path__", None)
        if not package_path:
            return

        for _, module_name, _ in pkgutil.iter_modules(package_path):
            if module_name.startswith("_"):
                continue
            module_path = f"{self.package_name}.{module_name}"
            try:
                module = importlib.import_module(module_path)
            except Exception as exc:
                log(f"[provider] Failed to load {module_path}: {exc}", file=sys.stderr)
                continue
            self._register_module(module)

    def get(self, name: str) -> Optional[ProviderInfo]:
        """Return the info registered under ``name`` (canonical or alias), or None."""
        self.discover()
        if not name:
            return None
        return self._lookup.get(self._normalize(name))

    def iter_providers(self) -> Iterable[ProviderInfo]:
        """Return a snapshot tuple with one info per canonical provider."""
        self.discover()
        return tuple(self._infos.values())

    def has_name(self, name: str) -> bool:
        """Return True when ``name`` resolves to a registered provider."""
        return self.get(name) is not None
# Module-wide registry bound to the ``Provider`` package; the helper functions
# in this module resolve provider names through it.
REGISTRY = ProviderRegistry("Provider")
# Discover eagerly at import time. Modules that fail to import are logged and
# skipped by ``discover`` rather than raising here.
REGISTRY.discover()
def register_provider(
    provider_class: Type[Provider],
    *,
    name: Optional[str] = None,
    aliases: Optional[Sequence[str]] = None,
    module_name: Optional[str] = None,
    replace: bool = False,
) -> ProviderInfo:
    """Add ``provider_class`` to the module-wide registry.

    Intended for tests and third-party packages whose providers live outside
    the discovered package.

    Args:
        provider_class: Class to register.
        name: Optional canonical name overriding the class-derived one.
        aliases: Optional extra names that should resolve to the provider.
        module_name: Optional module name recorded on the registration.
        replace: Forwarded to the registry's ``register`` call.

    Returns:
        The ``ProviderInfo`` returned by the registry.
    """
    options = {
        "override_name": name,
        "extra_aliases": aliases,
        "module_name": module_name,
        "replace": replace,
    }
    return REGISTRY.register(provider_class, **options)
def get_provider_class(name: str) -> Optional[Type[Provider]]:
    """Return the provider class for a registered provider name, if any.

    The lookup goes through ``REGISTRY`` only, so canonical names and aliases
    are both accepted and matching is case-insensitive. The provider is not
    instantiated.

    Args:
        name: Provider name or alias; empty or unknown names yield ``None``.

    Returns:
        The registered class, or ``None`` when nothing matches.
    """
    info = REGISTRY.get(name)
    return None if info is None else info.provider_class
def selection_auto_stage_for_table(
table_type: str,
stage_args: Optional[Sequence[str]] = None,
) -> Optional[list[str]]:
"""Return the provider-suggested stage to auto-run for a selected table.
This is used by the CLI to avoid hardcoding table names and behaviors.
"""
t = str(table_type or "").strip().lower()
if not t:
return None
# Provider tables are usually either:
# - "youtube" (no dot)
# - "hifi.tracks" (prefix = provider name)
provider_key = t.split(".", 1)[0] if "." in t else t
provider_class = get_provider_class(provider_key) or get_provider_class(t)
if provider_class is None:
@@ -82,14 +245,7 @@ def selection_auto_stage_for_table(
def is_known_provider_name(name: str) -> bool:
    """Return True if `name` matches a registered provider name or alias.

    The check is a registry lookup only: no provider is instantiated and no
    'Unknown provider' message is logged on a miss, so callers can probe
    arbitrary UI strings (table names, store names, etc.) safely.
    """
    return REGISTRY.has_name(name)
def _supports_search(provider: Provider) -> bool:
@@ -107,18 +263,14 @@ def _provider_url_patterns(provider_class: Type[Provider]) -> Sequence[str]:
return []
def get_provider(name: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[Provider]:
"""Get a provider by name (unified registry)."""
provider_class = _PROVIDERS.get((name or "").lower())
if provider_class is None:
def get_provider(name: str, config: Optional[Dict[str, Any]] = None) -> Optional[Provider]:
info = REGISTRY.get(name)
if info is None:
log(f"[provider] Unknown provider: {name}", file=sys.stderr)
return None
try:
provider = provider_class(config)
provider = info.provider_class(config)
if not provider.validate():
log(f"[provider] Provider '{name}' is not available", file=sys.stderr)
return None
@@ -129,24 +281,18 @@ def get_provider(name: str,
def list_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """List all providers and their availability.

    Each registered provider is instantiated with ``config`` and asked to
    ``validate()``. A provider that raises during construction or validation
    is reported as unavailable instead of aborting the listing.

    Args:
        config: Optional configuration passed to every provider constructor.

    Returns:
        Mapping of canonical provider name to availability.
    """
    availability: Dict[str, bool] = {}
    for info in REGISTRY.iter_providers():
        try:
            provider = info.provider_class(config)
            # bool() keeps the values in line with the declared return type.
            availability[info.canonical_name] = bool(provider.validate())
        except Exception:
            # Best-effort listing: any failure means "not available".
            availability[info.canonical_name] = False
    return availability
def get_search_provider(name: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[SearchProvider]:
"""Get a search-capable provider by name (compat API)."""
config: Optional[Dict[str, Any]] = None) -> Optional[SearchProvider]:
provider = get_provider(name, config)
if provider is None:
return None
@@ -157,26 +303,20 @@ def get_search_provider(name: str,
def list_search_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """List all search providers and their availability.

    A provider is reported as available only when it validates successfully
    AND its class overrides the base ``search`` method
    (``ProviderInfo.supports_search``). Providers that raise during
    construction or validation are reported as unavailable.

    Args:
        config: Optional configuration passed to every provider constructor.

    Returns:
        Mapping of canonical provider name to search availability.
    """
    availability: Dict[str, bool] = {}
    for info in REGISTRY.iter_providers():
        try:
            provider = info.provider_class(config)
            availability[info.canonical_name] = bool(
                provider.validate() and info.supports_search
            )
        except Exception:
            # Best-effort listing: any failure means "not available".
            availability[info.canonical_name] = False
    return availability
def get_file_provider(name: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[FileProvider]:
"""Get an upload-capable provider by name (compat API)."""
config: Optional[Dict[str, Any]] = None) -> Optional[FileProvider]:
provider = get_provider(name, config)
if provider is None:
return None
@@ -187,28 +327,19 @@ def get_file_provider(name: str,
def list_file_providers(config: Optional[Dict[str, Any]] = None) -> Dict[str, bool]:
    """List all file providers and their availability.

    A provider is reported as available only when it validates successfully
    AND its class overrides the base ``upload`` method
    (``ProviderInfo.supports_upload``). Providers that raise during
    construction or validation are reported as unavailable.

    Args:
        config: Optional configuration passed to every provider constructor.

    Returns:
        Mapping of canonical provider name to upload availability.
    """
    availability: Dict[str, bool] = {}
    for info in REGISTRY.iter_providers():
        try:
            provider = info.provider_class(config)
            availability[info.canonical_name] = bool(
                provider.validate() and info.supports_upload
            )
        except Exception:
            # Best-effort listing: any failure means "not available".
            availability[info.canonical_name] = False
    return availability
def match_provider_name_for_url(url: str) -> Optional[str]:
"""Return a registered provider name that claims the URL's domain.
Providers can declare domains via class attribute `URL` (preferred) or `URL_DOMAINS`.
This matcher is intentionally cheap (no provider instantiation, no network).
"""
raw_url = str(url or "").strip()
raw_url_lower = raw_url.lower()
try:
@@ -219,11 +350,6 @@ def match_provider_name_for_url(url: str) -> Optional[str]:
host = ""
path = ""
# Prefer Internet Archive for archive.org links unless the URL clearly refers
# to a borrow/loan flow (handled by OpenLibrary provider).
#
# This keeps direct downloads and item pages routed to `internetarchive`, while
# preserving OpenLibrary's scripted borrow pipeline for loan/reader URLs.
def _norm_host(h: str) -> str:
h_norm = str(h or "").strip().lower()
if h_norm.startswith("www."):
@@ -234,47 +360,45 @@ def match_provider_name_for_url(url: str) -> Optional[str]:
if host_norm:
if host_norm == "openlibrary.org" or host_norm.endswith(".openlibrary.org"):
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "openlibrary" if REGISTRY.has_name("openlibrary") else None
if host_norm == "archive.org" or host_norm.endswith(".archive.org"):
low_path = str(path or "").lower()
is_borrowish = (
low_path.startswith("/borrow/") or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/") or "/services/loans/" in low_path
low_path.startswith("/borrow/")
or low_path.startswith("/stream/")
or low_path.startswith("/services/loans/")
or "/services/loans/" in low_path
)
if is_borrowish:
return "openlibrary" if "openlibrary" in _PROVIDERS else None
return "internetarchive" if "internetarchive" in _PROVIDERS else None
return "openlibrary" if REGISTRY.has_name("openlibrary") else None
return "internetarchive" if REGISTRY.has_name("internetarchive") else None
for name, provider_class in _PROVIDERS.items():
domains = _provider_url_patterns(provider_class)
for info in REGISTRY.iter_providers():
domains = _provider_url_patterns(info.provider_class)
if not domains:
continue
for d in domains:
dom_raw = str(d or "").strip()
for domain in domains:
dom_raw = str(domain or "").strip()
dom = dom_raw.lower()
if not dom:
continue
# Scheme-like patterns (magnet:, http://example) still use prefix match.
if dom.startswith("magnet:") or dom.startswith("http://") or dom.startswith("https://"):
if raw_url_lower.startswith(dom):
return name
return info.canonical_name
continue
dom_norm = _norm_host(dom)
if not dom_norm or not host_norm:
continue
if host_norm == dom_norm or host_norm.endswith("." + dom_norm):
return name
return info.canonical_name
return None
def get_provider_for_url(url: str,
config: Optional[Dict[str,
Any]] = None) -> Optional[Provider]:
"""Instantiate and return the matching provider for a URL, if any."""
config: Optional[Dict[str, Any]] = None) -> Optional[Provider]:
name = match_provider_name_for_url(url)
if not name:
return None
@@ -282,10 +406,12 @@ def get_provider_for_url(url: str,
__all__ = [
"SearchResult",
"ProviderInfo",
"Provider",
"SearchProvider",
"FileProvider",
"SearchResult",
"register_provider",
"get_provider",
"list_providers",
"get_search_provider",
@@ -294,7 +420,7 @@ __all__ = [
"list_file_providers",
"match_provider_name_for_url",
"get_provider_for_url",
"download_soulseek_file",
"get_provider_class",
"selection_auto_stage_for_table",
"download_soulseek_file",
]