refactor(download): remove ProviderCore/download.py, move sanitize_filename to SYS.utils, replace callers to use API.HTTP.HTTPClient
This commit is contained in:
110
SYS/provider_helpers.py
Normal file
110
SYS/provider_helpers.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Convenience mixins and helpers for table-based providers.
|
||||
|
||||
Provides a small `TableProviderMixin` that handles HTTP fetch + table extraction
|
||||
(using `SYS.html_table.extract_records`) and converts records into
|
||||
`ProviderCore.base.SearchResult` rows with sane default column ordering.
|
||||
|
||||
Providers can subclass this mixin to implement search quickly:
|
||||
|
||||
class MyProvider(TableProviderMixin, Provider):
|
||||
URL = ("https://example.org/search",)
|
||||
|
||||
def search(self, query, limit=50, **kwargs):
|
||||
url = f"{self.URL[0]}?q={quote_plus(query)}"
|
||||
return self.search_table_from_url(url, limit=limit, xpaths=self.DEFAULT_XPATHS)
|
||||
|
||||
The mixin deliberately avoids adding heavy dependencies (uses our lxml helper)
|
||||
so authors don't have to install pandas/bs4 unless they want to.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
from API.HTTP import HTTPClient
|
||||
from ProviderCore.base import SearchResult
|
||||
from SYS.html_table import extract_records
|
||||
import lxml.html as lxml_html
|
||||
|
||||
|
||||
class TableProviderMixin:
    """Mixin to simplify providers that scrape table/list results from HTML.

    Members:
    - DEFAULT_XPATHS: default xpath list used when none is provided.
    - search_table_from_url(url, limit, xpaths, timeout): fetches HTML,
      extracts records, and returns them as SearchResult rows.

    If the host class defines a truthy ``name`` attribute it is used as the
    table name and tag of every result; otherwise "provider" is used.
    """

    # Reuse the same defaults as the html_table helper
    DEFAULT_XPATHS: List[str] = [
        "//table//tbody/tr",
        "//table//tr[td]",
        "//div[contains(@class,'list-item')]",
        "//div[contains(@class,'result')]",
        "//li[contains(@class,'item')]",
    ]

    # Optional display columns, in output order: (record keys tried in order, label).
    # A column is only emitted when one of its keys holds a truthy value.
    _OPTIONAL_COLUMNS = (
        (("system", "platform"), "Platform"),
        (("size",), "Size"),
        (("region",), "Region"),
        (("version",), "Version"),
        (("languages",), "Languages"),
    )

    @staticmethod
    def _decode_body(content: Any) -> str:
        """Return the response body as text for the non-lxml fallback path.

        Bytes are decoded as UTF-8 with undecodable bytes replaced, so the
        result is always markup text and never the ``b'...'`` repr that
        ``str(bytes)`` would produce. Non-bytes values go through ``str()``.
        """
        if isinstance(content, (bytes, bytearray)):
            return bytes(content).decode("utf-8", errors="replace")
        return str(content)

    @classmethod
    def _build_columns(cls, rec: Dict[str, Any]) -> List[Any]:
        """Build the ordered ``(label, value)`` column list for one record.

        "Title" is always present (empty string when missing); the optional
        columns listed in ``_OPTIONAL_COLUMNS`` are appended only when set.
        """
        columns: List[Any] = [("Title", rec.get("title") or "")]
        for keys, label in cls._OPTIONAL_COLUMNS:
            value: Any = ""
            for key in keys:
                value = rec.get(key) or ""
                if value:
                    break
            if value:
                columns.append((label, value))
        return columns

    def search_table_from_url(self, url: str, limit: int = 50, xpaths: Optional[List[str]] = None, timeout: float = 15.0) -> "List[SearchResult]":
        """Fetch `url`, extract table/list records, and return SearchResult list.

        Args:
            url: Page to fetch. A falsy value yields an empty list.
            limit: Maximum number of results. Zero or negative yields an
                empty list without performing the request.
            xpaths: Passed to `extract_records` (falls back to DEFAULT_XPATHS).
            timeout: Timeout handed to `HTTPClient`.

        Returns:
            One SearchResult per extracted record, at most `limit` of them.
            Any failure while fetching returns an empty list (best-effort).
        """
        if not url:
            return []

        max_rows = int(limit)
        if max_rows <= 0:
            # Nothing can be returned; skip the network round-trip. This also
            # keeps a negative limit from slicing records off the tail.
            return []

        try:
            with HTTPClient(timeout=timeout) as client:
                content = client.get(url).content
        except Exception:
            # Deliberate best-effort: a failed fetch means "no results".
            return []

        # Prefer a parsed lxml document; if parsing fails hand the extractor
        # plain text instead (content is expected to be bytes here).
        try:
            doc = lxml_html.fromstring(content)
        except Exception:
            doc = self._decode_body(content)

        records, _chosen = extract_records(doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS)

        # Resolve once so `table` and `tag` always agree, even when the host
        # class sets `name` to None or "".
        provider_name = getattr(self, "name", "provider") or "provider"

        results: List[SearchResult] = []
        for rec in (records or [])[:max_rows]:
            results.append(
                SearchResult(
                    table=provider_name,
                    title=rec.get("title") or "",
                    path=rec.get("path") or "",
                    detail="",
                    annotations=[],
                    media_kind="file",
                    size_bytes=None,
                    tag={provider_name},
                    columns=self._build_columns(rec),
                    full_metadata={"raw_record": rec},
                )
            )

        return results
|
||||
Reference in New Issue
Block a user