This commit is contained in:
2026-01-31 19:00:04 -08:00
parent dcf16e0cc4
commit 6513a3ad04
25 changed files with 617 additions and 397 deletions

View File

@@ -25,6 +25,9 @@ from ProviderCore.base import SearchResult
from SYS.html_table import extract_records
import lxml.html as lxml_html
import logging
logger = logging.getLogger(__name__)
class TableProviderMixin:
"""Mixin to simplify providers that scrape table/list results from HTML.
@@ -56,15 +59,18 @@ class TableProviderMixin:
resp = client.get(url)
content = resp.content
except Exception:
logger.exception("Failed to fetch URL %s for provider %s", url, getattr(self, 'name', '<provider>'))
return []
# Ensure we pass an lxml document or string (httpx returns bytes)
try:
doc = lxml_html.fromstring(content)
except Exception:
logger.debug("Failed to parse content with lxml; attempting to decode as utf-8", exc_info=True)
try:
doc = content.decode("utf-8")
except Exception:
logger.debug("Failed to decode content as utf-8; falling back to str()", exc_info=True)
doc = str(content)
records, chosen = extract_records(doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS)