"""Convenience mixins and helpers for table-based providers.

Provides a small `TableProviderMixin` that handles HTTP fetch + table
extraction (using `SYS.html_table.extract_records`) and converts records into
`ProviderCore.base.SearchResult` rows with sane default column ordering.

Providers can subclass this mixin to implement search quickly:

    class MyProvider(TableProviderMixin, Provider):
        URL = ("https://example.org/search",)

        def search(self, query, limit=50, **kwargs):
            url = f"{self.URL[0]}?q={quote_plus(query)}"
            return self.search_table_from_url(
                url, limit=limit, xpaths=self.DEFAULT_XPATHS
            )

The mixin deliberately avoids adding heavy dependencies (uses our lxml helper)
so authors don't have to install pandas/bs4 unless they want to.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

import lxml.html as lxml_html

from API.HTTP import HTTPClient
from ProviderCore.base import SearchResult
from SYS.html_table import extract_records

_logger = logging.getLogger(__name__)


class TableProviderMixin:
    """Mixin to simplify providers that scrape table/list results from HTML.

    Members:
    - search_table_from_url(url, limit, xpaths): fetches HTML, extracts
      records, returns SearchResults
    - DEFAULT_XPATHS: default xpath list used when none is provided
    """

    # Reuse the same defaults as the html_table helper
    DEFAULT_XPATHS: List[str] = [
        "//table//tbody/tr",
        "//table//tr[td]",
        "//div[contains(@class,'list-item')]",
        "//div[contains(@class,'result')]",
        "//li[contains(@class,'item')]",
    ]

    def search_table_from_url(
        self,
        url: str,
        limit: Optional[int] = 50,
        xpaths: Optional[List[str]] = None,
        timeout: float = 15.0,
    ) -> List[SearchResult]:
        """Fetch `url`, extract table/list records, and return SearchResult list.

        Args:
            url: Page to fetch. A falsy value yields an empty list.
            limit: Maximum number of records to convert. ``None`` means no
                limit; negative values are treated as 0.
            xpaths: Passed to `extract_records`; falls back to
                `DEFAULT_XPATHS` when empty or ``None``.
            timeout: Timeout handed to `HTTPClient`.

        Returns:
            One `SearchResult` per extracted record, in extraction order.
            An empty list is returned when the URL is empty, the fetch fails,
            or the response body is empty.
        """
        if not url:
            return []

        # Best-effort fetch: any client/network failure yields "no results"
        # rather than propagating, but leave a trace for debugging.
        try:
            with HTTPClient(timeout=timeout) as client:
                content = client.get(url).content
        except Exception:
            _logger.debug("Table fetch failed for %s", url, exc_info=True)
            return []

        if not content:
            # Nothing to parse; avoids pushing an empty body through lxml.
            return []

        # Prefer handing extract_records a parsed lxml document. If parsing
        # fails, fall back to text (the HTTP client is expected to return
        # bytes). Decode with replacement so undecodable input never ends up
        # as a "b'...'" repr string being treated as HTML.
        try:
            doc: Any = lxml_html.fromstring(content)
        except Exception:
            if isinstance(content, (bytes, bytearray)):
                doc = bytes(content).decode("utf-8", errors="replace")
            else:
                doc = str(content)

        records, _chosen = extract_records(
            doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS
        )
        selected = list(records or [])
        if limit is not None:
            # Clamp so a negative limit cannot silently drop trailing rows
            # through slice semantics.
            selected = selected[: max(0, int(limit))]

        # Resolve the provider name once so `table` and `tag` always agree,
        # including when `name` is missing, None, or empty.
        provider_name = getattr(self, "name", None) or "provider"

        results: List[SearchResult] = []
        for rec in selected:
            title = rec.get("title") or ""
            path = rec.get("path") or ""

            # Title is always shown; the remaining columns appear only when
            # the record carries a non-empty value, in this fixed order.
            optional_columns = (
                ("Platform", rec.get("system") or rec.get("platform") or ""),
                ("Size", rec.get("size") or ""),
                ("Region", rec.get("region") or ""),
                ("Version", rec.get("version") or ""),
                ("Languages", rec.get("languages") or ""),
            )
            cols = [("Title", title)]
            cols.extend((label, value) for label, value in optional_columns if value)

            results.append(
                SearchResult(
                    table=provider_name,
                    title=title,
                    path=path,
                    detail="",
                    annotations=[],
                    media_kind="file",
                    size_bytes=None,
                    tag={provider_name},
                    columns=cols,
                    full_metadata={"raw_record": rec},
                )
            )

        return results