# File: Medios-Macina/SYS/provider_helpers.py
"""Convenience mixins and helpers for table-based providers.
Provides a small `TableProviderMixin` that handles HTTP fetch + table extraction
(using `SYS.html_table.extract_records`) and converts records into
`ProviderCore.base.SearchResult` rows with sane default column ordering.
Providers can subclass this mixin to implement search quickly:
class MyProvider(TableProviderMixin, Provider):
URL = ("https://example.org/search",)
def search(self, query, limit=50, **kwargs):
url = f"{self.URL[0]}?q={quote_plus(query)}"
return self.search_table_from_url(url, limit=limit, xpaths=self.DEFAULT_XPATHS)
The mixin deliberately avoids adding heavy dependencies (uses our lxml helper)
so authors don't have to install pandas/bs4 unless they want to.
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus
from API.HTTP import HTTPClient
from ProviderCore.base import SearchResult
from SYS.html_table import extract_records
import lxml.html as lxml_html
class TableProviderMixin:
    """Mixin to simplify providers that scrape table/list results from HTML.

    Methods:
        - search_table_from_url(url, limit, xpaths): fetches HTML, extracts
          records, and returns a list of SearchResult rows.
        - DEFAULT_XPATHS: default xpath list used when none is provided.
    """

    # Reuse the same defaults as the html_table helper.
    DEFAULT_XPATHS: List[str] = [
        "//table//tbody/tr",
        "//table//tr[td]",
        "//div[contains(@class,'list-item')]",
        "//div[contains(@class,'result')]",
        "//li[contains(@class,'item')]",
    ]

    # Optional columns appended after "Title", in display order.
    # Each entry maps the column label to the record keys probed (in order)
    # for a truthy value.
    _OPTIONAL_COLUMNS: List[tuple] = [
        ("Platform", ("system", "platform")),
        ("Size", ("size",)),
        ("Region", ("region",)),
        ("Version", ("version",)),
        ("Languages", ("languages",)),
    ]

    def search_table_from_url(
        self,
        url: str,
        limit: int = 50,
        xpaths: Optional[List[str]] = None,
        timeout: float = 15.0,
    ) -> List[SearchResult]:
        """Fetch `url`, extract table/list records, and return SearchResults.

        Args:
            url: Page to fetch. Empty/falsy url yields an empty list.
            limit: Maximum number of rows to return; values <= 0 yield [].
            xpaths: Passed to `extract_records` (falls back to DEFAULT_XPATHS).
            timeout: HTTP timeout in seconds.

        Returns:
            List of SearchResult rows; empty on any fetch failure (a failed
            fetch is deliberately an empty result set, not a crash).
        """
        if not url or limit <= 0:
            return []
        try:
            with HTTPClient(timeout=timeout) as client:
                resp = client.get(url)
                content = resp.content
        except Exception:
            # Best-effort: network/HTTP errors are swallowed by design.
            return []
        doc = self._parse_document(content)
        # Second element of the tuple is the xpath that matched; unused here.
        records, _matched_xpath = extract_records(
            doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS
        )
        provider_name = getattr(self, "name", "provider") or "provider"
        return [
            self._record_to_result(rec, provider_name)
            for rec in (records or [])[: int(limit)]
        ]

    @staticmethod
    def _parse_document(content: Any) -> Any:
        """Parse raw response content into an lxml document, or fall back to text.

        `extract_records` accepts either an lxml document or a string; httpx
        returns bytes, so on parse failure we decode rather than stringify.
        """
        try:
            return lxml_html.fromstring(content)
        except Exception:
            pass
        if isinstance(content, bytes):
            # errors="replace" avoids str(bytes), whose "b'...'" repr would
            # corrupt the markup passed downstream.
            return content.decode("utf-8", errors="replace")
        return str(content)

    @staticmethod
    def _record_to_result(rec: Dict[str, Any], provider_name: str) -> SearchResult:
        """Convert one extracted record dict into a SearchResult row.

        Always emits a "Title" column; the optional columns are appended only
        when the record carries a truthy value for them.
        """
        title = rec.get("title") or ""
        cols: List[tuple] = [("Title", title)]
        for label, keys in TableProviderMixin._OPTIONAL_COLUMNS:
            value = next((rec.get(k) for k in keys if rec.get(k)), "")
            if value:
                cols.append((label, value))
        return SearchResult(
            table=provider_name,
            title=title,
            path=rec.get("path") or "",
            detail="",
            annotations=[],
            media_kind="file",
            size_bytes=None,
            # Use the normalized name so the tag set never contains None
            # (keeps tag consistent with the `table` field).
            tag={provider_name},
            columns=cols,
            full_metadata={"raw_record": rec},
        )