h
This commit is contained in:
@@ -25,6 +25,9 @@ from ProviderCore.base import SearchResult
|
||||
from SYS.html_table import extract_records
|
||||
import lxml.html as lxml_html
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TableProviderMixin:
|
||||
"""Mixin to simplify providers that scrape table/list results from HTML.
|
||||
@@ -56,15 +59,18 @@ class TableProviderMixin:
|
||||
resp = client.get(url)
|
||||
content = resp.content
|
||||
except Exception:
|
||||
logger.exception("Failed to fetch URL %s for provider %s", url, getattr(self, 'name', '<provider>'))
|
||||
return []
|
||||
|
||||
# httpx returns bytes: parse them into an lxml document first, falling back to a decoded string if parsing fails
|
||||
try:
|
||||
doc = lxml_html.fromstring(content)
|
||||
except Exception:
|
||||
logger.debug("Failed to parse content with lxml; attempting to decode as utf-8", exc_info=True)
|
||||
try:
|
||||
doc = content.decode("utf-8")
|
||||
except Exception:
|
||||
logger.debug("Failed to decode content as utf-8; falling back to str()", exc_info=True)
|
||||
doc = str(content)
|
||||
|
||||
records, chosen = extract_records(doc, base_url=url, xpaths=xpaths or self.DEFAULT_XPATHS)
|
||||
|
||||
Reference in New Issue
Block a user