111 lines
3.9 KiB
Python
111 lines
3.9 KiB
Python
"""Convenience mixins and helpers for table-based providers.
|
|
|
|
Provides a small `TableProviderMixin` that handles HTTP fetch + table extraction
|
|
(using `SYS.html_table.extract_records`) and converts records into
|
|
`ProviderCore.base.SearchResult` rows with sane default column ordering.
|
|
|
|
Providers can subclass this mixin to implement search quickly:
|
|
|
|
class MyProvider(TableProviderMixin, Provider):
|
|
URL = ("https://example.org/search",)
|
|
|
|
def search(self, query, limit=50, **kwargs):
|
|
url = f"{self.URL[0]}?q={quote_plus(query)}"
|
|
return self.search_table_from_url(url, limit=limit, xpaths=self.DEFAULT_XPATHS)
|
|
|
|
The mixin deliberately avoids adding heavy dependencies (uses our lxml helper)
|
|
so authors don't have to install pandas/bs4 unless they want to.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Dict, List, Optional
|
|
from urllib.parse import quote_plus
|
|
|
|
from API.HTTP import HTTPClient
|
|
from ProviderCore.base import SearchResult
|
|
from SYS.html_table import extract_records
|
|
import lxml.html as lxml_html
|
|
|
|
|
|
class TableProviderMixin:
    """Mixin to simplify providers that scrape table/list results from HTML.

    Methods:
    - search_table_from_url(url, limit, xpaths): fetches HTML, extracts
      records, and returns a list of SearchResult rows
    - DEFAULT_XPATHS: default xpath list used when none is provided
    """

    # Reuse the same defaults as the html_table helper.
    DEFAULT_XPATHS: List[str] = [
        "//table//tbody/tr",
        "//table//tr[td]",
        "//div[contains(@class,'list-item')]",
        "//div[contains(@class,'result')]",
        "//li[contains(@class,'item')]",
    ]

    # Display column name -> record keys checked (in order) for its value.
    # Only non-empty values produce a column; order here fixes column order
    # after the mandatory "Title" column.
    _OPTIONAL_COLUMNS: List[tuple] = [
        ("Platform", ("system", "platform")),
        ("Size", ("size",)),
        ("Region", ("region",)),
        ("Version", ("version",)),
        ("Languages", ("languages",)),
    ]

    def search_table_from_url(self, url: str, limit: int = 50, xpaths: Optional[List[str]] = None, timeout: float = 15.0) -> List[SearchResult]:
        """Fetch `url`, extract table/list records, and return SearchResult list.

        Parameters:
            url: page to fetch; an empty/falsy url yields [].
            limit: maximum number of rows to return (values <= 0 yield []).
            xpaths: row-selection xpaths passed to `extract_records`
                (falls back to DEFAULT_XPATHS when None).
            timeout: HTTP timeout in seconds.

        Never raises on network/parse failure: fetching is best-effort and
        any error produces an empty result list.
        """
        if not url:
            return []

        content = self._fetch(url, timeout)
        if content is None:
            return []

        doc = self._coerce_document(content)
        records, _ = extract_records(doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS)

        # Clamp: a negative limit would slice records from the END of the
        # list ([:-n]), returning the wrong rows instead of none.
        count = max(0, int(limit))
        return [self._record_to_result(rec) for rec in (records or [])[:count]]

    @staticmethod
    def _fetch(url: str, timeout: float) -> Optional[bytes]:
        """Fetch `url` and return the raw response body, or None on any error.

        The broad `except` is deliberate: providers are best-effort, so a
        network/HTTP failure maps to "no results" rather than a crash.
        """
        try:
            with HTTPClient(timeout=timeout) as client:
                return client.get(url).content
        except Exception:
            return None

    @staticmethod
    def _coerce_document(content: Any) -> Any:
        """Parse response bytes into an lxml document, degrading to a string.

        `extract_records` accepts either a parsed document or markup text, so
        on parse failure we fall back to a UTF-8 decode, then to `str()`.
        """
        try:
            return lxml_html.fromstring(content)
        except Exception:
            try:
                return content.decode("utf-8")
            except Exception:
                return str(content)

    def _record_to_result(self, rec: Dict[str, Any]) -> SearchResult:
        """Convert one extracted record dict into a SearchResult row."""
        title = rec.get("title") or ""

        cols: List[tuple] = [("Title", title)]
        for column_name, keys in self._OPTIONAL_COLUMNS:
            # First non-empty value among the candidate record keys wins.
            value = next((v for v in (rec.get(k) or "" for k in keys) if v), "")
            if value:
                cols.append((column_name, value))

        return SearchResult(
            table=(getattr(self, "name", "provider") or "provider"),
            title=title,
            path=rec.get("path") or "",
            detail="",
            annotations=[],
            media_kind="file",
            size_bytes=None,
            tag={getattr(self, "name", "provider")},
            columns=cols,
            full_metadata={"raw_record": rec},
        )
|