refactor(download): remove ProviderCore/download.py, move sanitize_filename to SYS.utils, replace callers to use API.HTTP.HTTPClient
This commit is contained in:
302
SYS/html_table.py
Normal file
302
SYS/html_table.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""Small helper utilities for extracting structured records from HTML tables
|
||||
using lxml.
|
||||
|
||||
Goal: make it trivial for provider authors to extract table rows and common
|
||||
fields (title, link, standardized column keys) without re-implementing the
|
||||
same heuristics in every provider.
|
||||
|
||||
Key functions:
|
||||
- find_candidate_nodes(doc_or_html, xpaths=...)
|
||||
- extract_records(doc_or_html, base_url=None, xpaths=...)
|
||||
- normalize_header(name, synonyms=...)
|
||||
|
||||
This module has no hard heavyweight deps (pandas is optional, used only if installed) and works with
|
||||
`lxml.html` elements (the project already uses lxml).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from lxml import html as lxml_html
|
||||
from urllib.parse import urljoin
|
||||
import re
|
||||
|
||||
# Default xpaths for candidate result containers.
# find_candidate_nodes tries them in order and returns the matches of the
# first expression that finds anything, so table rows take priority over the
# generic div/li patterns.
_DEFAULT_XPATHS = [
    "//table//tbody/tr",
    "//table//tr[td]",
    "//div[contains(@class,'list-item')]",
    "//div[contains(@class,'result')]",
    "//li[contains(@class,'item')]",
]
|
||||
|
||||
# Simple header synonyms (you can extend as needed).
# Keys are header names as normalize_header produces them (lower-cased,
# whitespace runs replaced by "_"); values are the canonical key to use.
_DEFAULT_SYNONYMS = {
    "platform": "system",
    "system": "system",
    "name": "title",
    "title": "title",
}
|
||||
|
||||
|
||||
def _ensure_doc(doc_or_html: Any) -> lxml_html.HtmlElement:
|
||||
if isinstance(doc_or_html, str):
|
||||
return lxml_html.fromstring(doc_or_html)
|
||||
return doc_or_html
|
||||
|
||||
|
||||
def _text_or_img_title(el) -> str:
|
||||
# Prefer img/@title if present (useful for flag icons)
|
||||
try:
|
||||
imgs = el.xpath('.//img/@title')
|
||||
if imgs:
|
||||
return str(imgs[0]).strip()
|
||||
except Exception:
|
||||
pass
|
||||
return (el.text_content() or "").strip()
|
||||
|
||||
|
||||
def find_candidate_nodes(doc_or_html: Any, xpaths: Optional[List[str]] = None) -> Tuple[List[Any], Optional[str]]:
    """Find candidate nodes for results using a prioritized xpath list.

    *doc_or_html* is converted with ``_ensure_doc``. Each expression in
    *xpaths* (or ``_DEFAULT_XPATHS`` when none are given) is tried in order
    and the matches of the first one that finds anything are returned.

    Returns (nodes, chosen_xpath). The result is ``([], None)`` when no
    expression matches or when the input cannot be parsed at all.
    """
    try:
        doc = _ensure_doc(doc_or_html)
    except Exception:
        # Empty or unparsable markup simply has no candidates; callers treat
        # this the same as "nothing matched".
        return [], None

    for xp in (xpaths or _DEFAULT_XPATHS):
        try:
            found = doc.xpath(xp)
            if found:
                return list(found), xp
        except Exception:
            # An invalid expression (or a non-list xpath result) must not
            # stop the remaining candidates from being tried.
            continue
    return [], None
|
||||
|
||||
|
||||
def _parse_tr_nodes(tr_nodes: List[Any], base: Optional[str] = None) -> List[Dict[str, str]]:
|
||||
out: List[Dict[str, str]] = []
|
||||
|
||||
for tr in tr_nodes:
|
||||
try:
|
||||
tds = tr.xpath("./td")
|
||||
if not tds or len(tds) < 1:
|
||||
continue
|
||||
|
||||
# canonical fields
|
||||
rec: Dict[str, str] = {}
|
||||
|
||||
# Heuristic: if the first cell contains an anchor, treat it as the title/path
|
||||
# (detail pages often put the file link in the first column and size in the second).
|
||||
a0 = tds[0].xpath('.//a[contains(@href,"/vault/")]') or tds[0].xpath('.//a')
|
||||
if a0:
|
||||
rec["title"] = (a0[0].text_content() or "").strip()
|
||||
href = a0[0].get("href")
|
||||
rec["path"] = urljoin(base, href) if href and base else (href or "")
|
||||
|
||||
# Try to find a size cell in the remaining tds (class 'size' is common)
|
||||
size_val = None
|
||||
for td in tds[1:]:
|
||||
s = td.xpath('.//span[contains(@class,"size")]/text()')
|
||||
if s:
|
||||
size_val = str(s[0]).strip()
|
||||
break
|
||||
if not size_val and len(tds) > 1:
|
||||
txt = (tds[1].text_content() or "").strip()
|
||||
# crude size heuristic: contains digits and a unit letter
|
||||
if txt and re.search(r"\d", txt):
|
||||
size_val = txt
|
||||
|
||||
if size_val:
|
||||
rec["size"] = size_val
|
||||
|
||||
else:
|
||||
# First cell often "system"/"platform"
|
||||
rec["platform"] = _text_or_img_title(tds[0])
|
||||
|
||||
# Title + optional link from second column
|
||||
if len(tds) > 1:
|
||||
a = tds[1].xpath('.//a[contains(@href,"/vault/")]') or tds[1].xpath('.//a')
|
||||
if a:
|
||||
rec["title"] = (a[0].text_content() or "").strip()
|
||||
href = a[0].get("href")
|
||||
rec["path"] = urljoin(base, href) if href and base else (href or "")
|
||||
else:
|
||||
rec["title"] = (tds[1].text_content() or "").strip()
|
||||
|
||||
# Additional columns in common Vimm layout
|
||||
if len(tds) > 2:
|
||||
rec["region"] = _text_or_img_title(tds[2]).strip()
|
||||
if len(tds) > 3:
|
||||
rec["version"] = (tds[3].text_content() or "").strip()
|
||||
if len(tds) > 4:
|
||||
rec["languages"] = (tds[4].text_content() or "").strip()
|
||||
|
||||
out.append(rec)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _parse_list_item_nodes(nodes: List[Any], base: Optional[str] = None) -> List[Dict[str, str]]:
|
||||
out: List[Dict[str, str]] = []
|
||||
for node in nodes:
|
||||
try:
|
||||
rec: Dict[str, str] = {}
|
||||
# title heuristics
|
||||
a = node.xpath('.//h2/a') or node.xpath('.//a')
|
||||
if a:
|
||||
rec["title"] = (a[0].text_content() or "").strip()
|
||||
href = a[0].get("href")
|
||||
rec["path"] = urljoin(base, href) if href and base else (href or "")
|
||||
else:
|
||||
rec["title"] = (node.text_content() or "").strip()
|
||||
|
||||
# platform, size
|
||||
p = node.xpath('.//span[contains(@class,"platform")]/text()')
|
||||
if p:
|
||||
rec["platform"] = str(p[0]).strip()
|
||||
|
||||
s = node.xpath('.//span[contains(@class,"size")]/text()')
|
||||
if s:
|
||||
rec["size"] = str(s[0]).strip()
|
||||
|
||||
out.append(rec)
|
||||
except Exception:
|
||||
continue
|
||||
return out
|
||||
|
||||
|
||||
def normalize_header(name: str, synonyms: Optional[Dict[str, str]] = None) -> str:
    """Normalize header names to a canonical form.

    The name is stripped, lower-cased and has each run of whitespace
    replaced by a single underscore; the result is then looked up in
    *synonyms*. Defaults map 'platform' -> 'system' and 'name' -> 'title'.

    Passing ``synonyms=None`` uses the module defaults. Any mapping that is
    passed explicitly is honoured as-is, so an empty dict disables synonym
    mapping. A falsy *name* yields "".
    """
    if not name:
        return ""
    key = re.sub(r"\s+", "_", str(name).strip().lower())
    # ``is None`` rather than truthiness: an explicit ``{}`` means "no synonyms".
    mapping = _DEFAULT_SYNONYMS if synonyms is None else synonyms
    return mapping.get(key) or key
|
||||
|
||||
|
||||
def _html_text_of(doc_or_html: Any) -> str:
    """Return *doc_or_html* as markup text (decoding bytes, serialising elements)."""
    if isinstance(doc_or_html, (bytes, bytearray)):
        raw = bytes(doc_or_html)
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return raw.decode("latin-1", errors="ignore")
    if isinstance(doc_or_html, str):
        return doc_or_html
    try:
        return lxml_html.tostring(doc_or_html, encoding="unicode")
    except Exception:
        return str(doc_or_html)


def _cell_text(value: Any) -> str:
    """Stringify a table cell; ``None`` and float NaN (pandas' empty cell) become ""."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _recover_hrefs(records: List[Dict[str, str]], html_text: str, base_url: Optional[str]) -> None:
    """Fill missing ``path`` values by matching each record's title to an anchor's text."""
    doc = lxml_html.fromstring(html_text)
    anchors: Dict[str, str] = {}
    for a in doc.xpath('//a'):
        txt = (a.text_content() or "").strip()
        href = a.get("href")
        if txt and href:
            # The first anchor with a given text wins.
            anchors.setdefault(txt, href)
    for rec in records:
        if rec.get("path") or not rec.get("title"):
            continue
        href = anchors.get(rec["title"])
        if href:
            rec["path"] = urljoin(base_url, href) if base_url else href


def _pandas_records(html_text: str, base_url: Optional[str]) -> Optional[List[Dict[str, str]]]:
    """Parse the largest table in *html_text* with pandas.

    Returns ``None`` when pandas is not installed, finds no table, or fails.
    """
    try:
        from io import StringIO

        import pandas as _pd  # type: ignore

        # Literal markup strings are deprecated by read_html; wrap the text
        # in a file-like object instead.
        dfs = _pd.read_html(StringIO(html_text))
        if not dfs:
            return None

        # pick the largest dataframe by row count for heuristics
        df = max(dfs, key=lambda d: d.shape[0])

        records: List[Dict[str, str]] = []
        for row in df.to_dict("records"):
            nr: Dict[str, str] = {}
            for k, v in (row or {}).items():
                # ``str(k)`` (not ``str(k or "")``) so the integer column
                # label 0 of header-less tables is not turned into "".
                nk = normalize_header("" if k is None else str(k))
                nr[nk] = _cell_text(v)
            records.append(nr)
    except Exception:
        return None

    try:
        # pandas drops links; recover them on a best-effort basis.
        _recover_hrefs(records, html_text, base_url)
    except Exception:
        pass
    return records


def extract_records(doc_or_html: Any, base_url: Optional[str] = None, xpaths: Optional[List[str]] = None, use_pandas_if_available: bool = True) -> Tuple[List[Dict[str, str]], Optional[str]]:
    """Find result candidate nodes and return a list of normalized records plus chosen xpath.

    *doc_or_html* may be markup (``str``/``bytes``/``bytearray``) or a parsed
    lxml element.

    If pandas is available and `use_pandas_if_available` is True, the largest
    HTML table is parsed with `pandas.read_html`; its column headers are
    normalized with `normalize_header`, empty cells become "", and missing
    `path` values are recovered by matching titles to anchor text. *xpaths*
    is not consulted on this path.

    Otherwise (pandas missing, no table, or any failure) the node-based
    parsers are used with *xpaths*, and record keys are normalized the same
    way (platform -> system etc).

    Returns (records, chosen) where `chosen` is the xpath that matched, the
    string 'pandas' when the pandas path was used, or None when nothing
    matched.
    """
    html_text = _html_text_of(doc_or_html)

    # Try pandas first when available and requested
    if use_pandas_if_available:
        via_pandas = _pandas_records(html_text, base_url)
        if via_pandas is not None:
            return via_pandas, "pandas"

    # Fallback to node-based parsing. Raw bytes are handed over as decoded
    # text; strings and already-parsed elements are passed through untouched.
    source = html_text if isinstance(doc_or_html, (bytes, bytearray)) else doc_or_html
    nodes, chosen = find_candidate_nodes(source, xpaths=xpaths)
    if not nodes:
        return [], chosen

    # Determine node type and parse accordingly. Some lxml nodes (comments)
    # have a non-string ``tag``, so check the type before lower-casing.
    tag = getattr(nodes[0], "tag", "")
    if isinstance(tag, str) and tag.lower() == "tr":
        records = _parse_tr_nodes(nodes, base=base_url)
    else:
        # list-item style
        records = _parse_list_item_nodes(nodes, base=base_url)

    # Normalize keys (map platform->system etc)
    normed: List[Dict[str, str]] = [
        {normalize_header(k): v for k, v in (r or {}).items()} for r in records
    ]
    return normed, chosen
|
||||
|
||||
|
||||
# Small convenience: convert records to SearchResult. Providers can call this or
|
||||
# use their own mapping when they need full SearchResult objects.
|
||||
from ProviderCore.base import SearchResult # local import to avoid circular issues
|
||||
|
||||
|
||||
def records_to_search_results(records: List[Dict[str, str]], table: str = "provider") -> List[SearchResult]:
    """Convert extracted record dicts into ``SearchResult`` rows.

    For each record the title is taken from ``title`` (falling back to
    ``name``) and the path from ``path``; both default to "". Every
    key/value pair with a truthy key and value becomes a display column
    whose header is the title-cased key. The record itself is stored under
    both ``raw_record`` and ``raw`` in ``full_metadata``, and *table* is
    used as the table name and as the single tag.
    """
    out: List[SearchResult] = []
    for rec in records:
        title = rec.get("title") or rec.get("name") or ""
        path = rec.get("path") or ""
        out.append(
            SearchResult(
                table=table,
                title=str(title),
                path=str(path),
                detail="",
                annotations=[],
                media_kind="file",
                size_bytes=None,
                tag={table},
                columns=[(k.title(), v) for k, v in rec.items() if k and v],
                full_metadata={"raw_record": rec, "raw": rec},
            )
        )
    return out
|
||||
@@ -972,6 +972,16 @@ def get_last_result_table_row_selection_args(row_index: int) -> Optional[List[st
|
||||
return None
|
||||
|
||||
|
||||
def get_last_result_table_row_selection_action(row_index: int) -> Optional[List[str]]:
    """Return the ``selection_action`` of a row in the last result table.

    ``None`` is returned when the last result table is not selectable, has
    no ``rows`` attribute, *row_index* is out of range, or the row carries
    no ``selection_action`` attribute.
    """
    table = _get_pipeline_state().last_result_table
    if not _is_selectable_table(table) or not hasattr(table, "rows"):
        return None
    if row_index < 0 or row_index >= len(table.rows):
        return None
    return getattr(table.rows[row_index], "selection_action", None)
|
||||
|
||||
def set_current_stage_table(result_table: Optional[Any]) -> None:
|
||||
"""Store the current pipeline stage table for @N expansion.
|
||||
|
||||
@@ -1035,6 +1045,17 @@ def get_current_stage_table_row_selection_args(row_index: int) -> Optional[List[
|
||||
return None
|
||||
|
||||
|
||||
def get_current_stage_table_row_selection_action(row_index: int) -> Optional[List[str]]:
    """Return the ``selection_action`` of a row in the current stage table.

    ``None`` is returned when the current stage table is not selectable, has
    no ``rows`` attribute, *row_index* is out of range, or the row carries
    no ``selection_action`` attribute.
    """
    table = _get_pipeline_state().current_stage_table
    if not _is_selectable_table(table) or not hasattr(table, "rows"):
        return None
    if row_index < 0 or row_index >= len(table.rows):
        return None
    return getattr(table.rows[row_index], "selection_action", None)
|
||||
|
||||
|
||||
def get_current_stage_table_row_source_index(row_index: int) -> Optional[int]:
|
||||
"""Get the original source index for a row in the current stage table.
|
||||
|
||||
|
||||
110
SYS/provider_helpers.py
Normal file
110
SYS/provider_helpers.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Convenience mixins and helpers for table-based providers.
|
||||
|
||||
Provides a small `TableProviderMixin` that handles HTTP fetch + table extraction
|
||||
(using `SYS.html_table.extract_records`) and converts records into
|
||||
`ProviderCore.base.SearchResult` rows with sane default column ordering.
|
||||
|
||||
Providers can subclass this mixin to implement search quickly:
|
||||
|
||||
class MyProvider(TableProviderMixin, Provider):
|
||||
URL = ("https://example.org/search",)
|
||||
|
||||
def search(self, query, limit=50, **kwargs):
|
||||
url = f"{self.URL[0]}?q={quote_plus(query)}"
|
||||
return self.search_table_from_url(url, limit=limit, xpaths=self.DEFAULT_XPATHS)
|
||||
|
||||
The mixin deliberately avoids adding heavy dependencies (uses our lxml helper)
|
||||
so authors don't have to install pandas/bs4 unless they want to.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
from API.HTTP import HTTPClient
|
||||
from ProviderCore.base import SearchResult
|
||||
from SYS.html_table import extract_records
|
||||
import lxml.html as lxml_html
|
||||
|
||||
|
||||
class TableProviderMixin:
    """Mixin to simplify providers that scrape table/list results from HTML.

    Members:
    - search_table_from_url(url, limit, xpaths): fetches HTML, extracts records, returns SearchResults
    - DEFAULT_XPATHS: default xpath list used when none is provided

    The provider name used for the result table and tag is read from
    ``self.name`` when present, otherwise "provider".
    """

    # Same expressions, in the same order, as the html_table helper's defaults.
    DEFAULT_XPATHS: List[str] = [
        "//table//tbody/tr",
        "//table//tr[td]",
        "//div[contains(@class,'list-item')]",
        "//div[contains(@class,'result')]",
        "//li[contains(@class,'item')]",
    ]

    def search_table_from_url(self, url: str, limit: int = 50, xpaths: Optional[List[str]] = None, timeout: float = 15.0) -> List[SearchResult]:
        """Fetch `url`, extract table/list records, and return SearchResult list.

        `xpaths` is passed to `extract_records` (falls back to DEFAULT_XPATHS).
        At most `limit` records are converted; a negative limit yields none.
        An empty `url`, a failed fetch or an empty response body all return
        an empty list.
        """
        if not url:
            return []

        try:
            with HTTPClient(timeout=timeout) as client:
                resp = client.get(url)
                content = resp.content
        except Exception:
            # Best effort: any fetch problem is reported as "no results".
            return []

        if not content:
            # Nothing to extract, and the HTML parsers raise on empty input.
            return []

        # Prefer handing over a parsed document; if that fails, pass text so
        # extract_records can still try its own strategies.
        try:
            doc = lxml_html.fromstring(content)
        except Exception:
            if isinstance(content, (bytes, bytearray)):
                # Decode leniently instead of falling back to a b'...' repr.
                doc = bytes(content).decode("utf-8", errors="replace")
            else:
                doc = str(content)

        records, _chosen = extract_records(doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS)

        # One fallback rule for both the table name and the tag.
        provider_name = getattr(self, "name", "provider") or "provider"
        # Clamp so a negative limit cannot slice from the end of the list.
        max_rows = max(0, int(limit))

        results: List[SearchResult] = []
        for rec in (records or [])[:max_rows]:
            title = rec.get("title") or ""
            path = rec.get("path") or ""

            # Optional columns, in display order; empty values are omitted.
            optional = (
                ("Platform", rec.get("system") or rec.get("platform") or ""),
                ("Size", rec.get("size") or ""),
                ("Region", rec.get("region") or ""),
                ("Version", rec.get("version") or ""),
                ("Languages", rec.get("languages") or ""),
            )
            cols = [("Title", title)]
            cols.extend((header, value) for header, value in optional if value)

            results.append(
                SearchResult(
                    table=provider_name,
                    title=title,
                    path=path,
                    detail="",
                    annotations=[],
                    media_kind="file",
                    size_bytes=None,
                    tag={provider_name},
                    columns=cols,
                    full_metadata={"raw_record": rec},
                )
            )

        return results
|
||||
@@ -359,6 +359,8 @@ class ResultRow:
|
||||
columns: List[ResultColumn] = field(default_factory=list)
|
||||
selection_args: Optional[List[str]] = None
|
||||
"""Arguments to use for this row when selected via @N syntax (e.g., ['-item', '3'])"""
|
||||
selection_action: Optional[List[str]] = None
|
||||
"""Full expanded stage tokens that should run when this row is selected."""
|
||||
source_index: Optional[int] = None
|
||||
"""Original insertion order index (used to map sorted views back to source items)."""
|
||||
payload: Optional[Any] = None
|
||||
@@ -648,6 +650,11 @@ class ResultTable:
|
||||
if 0 <= row_index < len(self.rows):
|
||||
self.rows[row_index].selection_args = selection_args
|
||||
|
||||
def set_row_selection_action(self, row_index: int, selection_action: List[str]) -> None:
    """Store the complete stage tokens to run when row *row_index* is chosen via @N.

    An index outside the current rows is ignored.
    """
    if row_index < 0 or row_index >= len(self.rows):
        return
    target_row = self.rows[row_index]
    target_row.selection_action = selection_action
|
||||
|
||||
def set_header_lines(self, lines: List[str]) -> "ResultTable":
|
||||
"""Attach metadata lines that render beneath the title."""
|
||||
self.header_lines = [line for line in lines if line]
|
||||
@@ -827,6 +834,30 @@ class ResultTable:
|
||||
if hasattr(result, "annotations") and result.annotations:
|
||||
row.add_column("Annotations", ", ".join(str(a) for a in result.annotations))
|
||||
|
||||
try:
|
||||
md = getattr(result, "full_metadata", None)
|
||||
md_dict = dict(md) if isinstance(md, dict) else {}
|
||||
except Exception:
|
||||
md_dict = {}
|
||||
|
||||
try:
|
||||
selection_args = getattr(result, "selection_args", None)
|
||||
except Exception:
|
||||
selection_args = None
|
||||
if selection_args is None:
|
||||
selection_args = md_dict.get("_selection_args") or md_dict.get("selection_args")
|
||||
if selection_args:
|
||||
row.selection_args = [str(a) for a in selection_args if a is not None]
|
||||
|
||||
try:
|
||||
selection_action = getattr(result, "selection_action", None)
|
||||
except Exception:
|
||||
selection_action = None
|
||||
if selection_action is None:
|
||||
selection_action = md_dict.get("_selection_action") or md_dict.get("selection_action")
|
||||
if selection_action:
|
||||
row.selection_action = [str(a) for a in selection_action if a is not None]
|
||||
|
||||
def _add_result_item(self, row: ResultRow, item: Any) -> None:
|
||||
"""Extract and add ResultItem fields to row (compact display for search results).
|
||||
|
||||
|
||||
@@ -10,10 +10,10 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
||||
|
||||
from SYS.result_table_api import ColumnSpec, ProviderAdapter, ResultModel
|
||||
from SYS.result_table_api import ColumnSpec, ProviderAdapter, ResultModel, ResultTable, ensure_result_model
|
||||
|
||||
|
||||
ColumnFactory = Callable[[Iterable[ResultModel]], List[ColumnSpec]]
|
||||
ColumnFactory = Callable[[List[ResultModel]], List[ColumnSpec]]
|
||||
SelectionFn = Callable[[ResultModel], List[str]]
|
||||
|
||||
|
||||
@@ -22,33 +22,57 @@ class Provider:
|
||||
name: str
|
||||
adapter: ProviderAdapter
|
||||
# columns can be a static list or a factory that derives columns from sample rows
|
||||
columns: Optional[Union[List[ColumnSpec], ColumnFactory]] = None
|
||||
selection_fn: Optional[SelectionFn] = None
|
||||
columns: Union[List[ColumnSpec], ColumnFactory]
|
||||
selection_fn: SelectionFn
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
def get_columns(self, rows: Optional[Iterable[ResultModel]] = None) -> List[ColumnSpec]:
|
||||
if self.columns is None:
|
||||
raise ValueError(f"provider '{self.name}' must define columns")
|
||||
|
||||
if callable(self.columns):
|
||||
try:
|
||||
rows_list = list(rows) if rows is not None else []
|
||||
return list(self.columns(rows_list))
|
||||
except Exception:
|
||||
# Fall back to a minimal Title column on errors
|
||||
return [ColumnSpec("title", "Title", lambda r: r.title)]
|
||||
if self.columns is not None:
|
||||
return list(self.columns)
|
||||
# Default minimal column set
|
||||
return [ColumnSpec("title", "Title", lambda r: r.title)]
|
||||
rows_list = list(rows) if rows is not None else []
|
||||
cols = list(self.columns(rows_list))
|
||||
else:
|
||||
cols = list(self.columns)
|
||||
|
||||
if not cols:
|
||||
raise ValueError(f"provider '{self.name}' produced no columns")
|
||||
|
||||
return cols
|
||||
|
||||
def selection_args(self, row: ResultModel) -> List[str]:
|
||||
if callable(self.selection_fn):
|
||||
try:
|
||||
return list(self.selection_fn(row))
|
||||
except Exception:
|
||||
return []
|
||||
# Default selector: prefer path flag, then title
|
||||
if getattr(row, "path", None):
|
||||
return ["-path", str(row.path)]
|
||||
return ["-title", str(row.title)]
|
||||
if not callable(self.selection_fn):
|
||||
raise ValueError(f"provider '{self.name}' must define a selection function")
|
||||
|
||||
sel = list(self.selection_fn(ensure_result_model(row)))
|
||||
return sel
|
||||
|
||||
def build_table(self, items: Iterable[Any]) -> ResultTable:
    """Run the adapter over *items* and wrap its output in a ResultTable.

    Each value the adapter produces is passed through ``ensure_result_model``.
    Any exception raised while doing so is re-raised as ``RuntimeError``
    chained to the original. Columns come from ``get_columns`` and the
    provider's metadata (or an empty dict) becomes the table meta.
    """
    normalized = []
    try:
        for produced in self.adapter(items):
            normalized.append(ensure_result_model(produced))
    except Exception as exc:
        raise RuntimeError(f"provider '{self.name}' adapter failed") from exc

    column_specs = self.get_columns(normalized)
    return ResultTable(
        provider=self.name,
        rows=normalized,
        columns=column_specs,
        meta=self.metadata or {},
    )
|
||||
|
||||
def serialize_row(self, row: ResultModel) -> Dict[str, Any]:
    """Convert *row* into a plain dict, including its selection args.

    The row is first passed through ``ensure_result_model``. Missing
    metadata becomes an empty dict, a missing source falls back to this
    provider's name, and ``_selection_args`` is computed with
    ``self.selection_args``.
    """
    model = ensure_result_model(row)
    payload: Dict[str, Any] = dict(
        title=model.title,
        path=model.path,
        ext=model.ext,
        size_bytes=model.size_bytes,
    )
    payload["metadata"] = model.metadata or {}
    payload["source"] = model.source or self.name
    payload["_selection_args"] = self.selection_args(model)
    return payload
|
||||
|
||||
def serialize_rows(self, rows: Iterable[ResultModel]) -> List[Dict[str, Any]]:
    """Serialize every row with ``serialize_row``, keeping the input order."""
    serialized: List[Dict[str, Any]] = []
    for row in rows:
        serialized.append(self.serialize_row(row))
    return serialized
|
||||
|
||||
|
||||
_PROVIDERS: Dict[str, Provider] = {}
|
||||
@@ -58,8 +82,8 @@ def register_provider(
|
||||
name: str,
|
||||
adapter: ProviderAdapter,
|
||||
*,
|
||||
columns: Optional[Union[List[ColumnSpec], ColumnFactory]] = None,
|
||||
selection_fn: Optional[SelectionFn] = None,
|
||||
columns: Union[List[ColumnSpec], ColumnFactory],
|
||||
selection_fn: SelectionFn,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
) -> Provider:
|
||||
name = str(name or "").strip().lower()
|
||||
@@ -67,13 +91,20 @@ def register_provider(
|
||||
raise ValueError("provider name required")
|
||||
if name in _PROVIDERS:
|
||||
raise ValueError(f"provider already registered: {name}")
|
||||
if columns is None:
|
||||
raise ValueError("provider registration requires columns")
|
||||
if selection_fn is None:
|
||||
raise ValueError("provider registration requires selection_fn")
|
||||
p = Provider(name=name, adapter=adapter, columns=columns, selection_fn=selection_fn, metadata=metadata)
|
||||
_PROVIDERS[name] = p
|
||||
return p
|
||||
|
||||
|
||||
def get_provider(name: str) -> Provider:
|
||||
return _PROVIDERS[name.lower()]
|
||||
normalized = str(name or "").lower()
|
||||
if normalized not in _PROVIDERS:
|
||||
raise KeyError(f"provider not registered: {name}")
|
||||
return _PROVIDERS[normalized]
|
||||
|
||||
|
||||
def list_providers() -> List[str]:
|
||||
|
||||
@@ -7,7 +7,7 @@ renderers must use. It intentionally refuses to accept legacy dicts/strings/objs
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Dict, Iterable, Optional, Protocol
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Protocol
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -33,6 +33,48 @@ class ResultModel:
|
||||
source: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResultTable:
    """Concrete, provider-owned table of rows/columns.

    Holds nothing beyond the rows, the column specs and an optional metadata
    dict for renderers. Rows are validated with ``ensure_result_model`` on
    construction; columns are never inferred and must be supplied.
    """

    provider: str
    rows: List[ResultModel]
    columns: List[ColumnSpec]
    meta: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        # The dataclass is frozen, so normalized values have to be written
        # through object.__setattr__.
        assign = object.__setattr__

        if str(self.provider or "").strip() == "":
            raise ValueError("provider required for ResultTable")
        assign(self, "rows", [ensure_result_model(r) for r in self.rows])

        if not self.columns:
            raise ValueError("columns are required for ResultTable")
        assign(self, "columns", list(self.columns))
        assign(self, "meta", dict(self.meta or {}))

    def serialize_row(self, row: ResultModel, selection: Optional[List[str]] = None) -> Dict[str, Any]:
        """Convert a row into a pipeline-friendly dict (with selection args).

        *selection* must already have been computed by the provider; it is
        only copied into ``_selection_args`` (an empty list when omitted).
        """
        model = ensure_result_model(row)
        serialized: Dict[str, Any] = dict(
            title=model.title,
            path=model.path,
            ext=model.ext,
            size_bytes=model.size_bytes,
        )
        serialized["metadata"] = model.metadata or {}
        serialized["source"] = model.source or self.provider
        serialized["_selection_args"] = list(selection or [])
        return serialized
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ColumnSpec:
|
||||
"""Specification for a column that renderers will use.
|
||||
@@ -100,6 +142,7 @@ def metadata_column(key: str, header: Optional[str] = None, format_fn: Optional[
|
||||
|
||||
__all__ = [
|
||||
"ResultModel",
|
||||
"ResultTable",
|
||||
"ColumnSpec",
|
||||
"ProviderAdapter",
|
||||
"Renderer",
|
||||
|
||||
@@ -9,7 +9,7 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Iterable, Optional
|
||||
|
||||
from SYS.result_table_api import ColumnSpec, ResultModel, Renderer
|
||||
from SYS.result_table_api import ColumnSpec, ResultModel, ResultTable, Renderer
|
||||
|
||||
|
||||
class RichRenderer(Renderer):
|
||||
@@ -65,3 +65,22 @@ def render_to_console(rows: Iterable[ResultModel], columns: Iterable[ColumnSpec]
|
||||
|
||||
table = RichRenderer().render(rows, columns, meta)
|
||||
Console().print(table)
|
||||
|
||||
|
||||
def render_result_table(table: ResultTable, renderer: Optional[Renderer] = None) -> Any:
    """Render *table* and return whatever the renderer produces.

    A ``RichRenderer`` is created when no (truthy) renderer is supplied.
    """
    active = RichRenderer() if not renderer else renderer
    return active.render(table.rows, table.columns, table.meta)
|
||||
|
||||
|
||||
def render_result_table_to_console(table: ResultTable, renderer: Optional[Renderer] = None) -> None:
    """Print *table* to the console.

    When ``rich`` can be imported the table is rendered with
    ``render_result_table`` and printed through a rich ``Console``. When the
    import fails, each row is printed as one line of space-separated column
    values, with falsy values shown as empty strings.
    """
    try:
        from rich.console import Console
    except Exception:
        for record in table.rows:
            cells = [str(spec.extractor(record) or "") for spec in table.columns]
            print(" ".join(cells))
        return

    Console().print(render_result_table(table, renderer))
|
||||
|
||||
18
SYS/utils.py
18
SYS/utils.py
@@ -66,6 +66,24 @@ def sanitize_metadata_value(value: Any) -> str | None:
|
||||
return value
|
||||
|
||||
|
||||
def sanitize_filename(name: str, *, max_len: int = 150) -> str:
    """Return a filesystem-safe filename derived from *name*.

    Characters that are invalid on Windows (``<>:"/\\|?*`` and control
    characters) are replaced with underscores, runs of whitespace collapse
    to a single space, and leading/trailing spaces and periods are removed.
    The result is cut to *max_len* characters and trimmed again so that
    truncation cannot leave a trailing space or period behind.

    Returns "download" when nothing usable remains.
    """
    text = str(name or "").strip()
    if not text:
        return "download"

    forbidden = set('<>:"/\\|?*')
    cleaned = "".join("_" if c in forbidden else c for c in text)
    # Whitespace (including tabs/newlines) collapses to single spaces first...
    cleaned = " ".join(cleaned.split())
    # ...so only non-whitespace control characters (e.g. NUL) are left to replace.
    cleaned = "".join("_" if ord(c) < 32 else c for c in cleaned)
    # Strip spaces and periods together: "a ." must not end up as "a ".
    cleaned = cleaned.strip(" .") or "download"
    # Cutting can expose a new trailing space/period; trim once more.
    return cleaned[:max_len].rstrip(" .")
|
||||
|
||||
|
||||
def unique_preserve_order(values: Iterable[str]) -> list[str]:
|
||||
seen: set[str] = set()
|
||||
ordered: list[str] = []
|
||||
|
||||
Reference in New Issue
Block a user