This commit is contained in:
2026-01-31 19:57:09 -08:00
parent 6513a3ad04
commit 1dbaabac73
7 changed files with 125 additions and 88 deletions

View File

@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Tuple
from lxml import html as lxml_html
from urllib.parse import urljoin
import re
from SYS.logger import debug
# Default xpaths for candidate result containers
_DEFAULT_XPATHS = [
@@ -50,8 +51,8 @@ def _text_or_img_title(el) -> str:
imgs = el.xpath('.//img/@title')
if imgs:
return str(imgs[0]).strip()
except Exception:
pass
except Exception as exc:
debug("Failed to retrieve img title from element: %s", exc, exc_info=True)
return (el.text_content() or "").strip()
@@ -66,7 +67,8 @@ def find_candidate_nodes(doc_or_html: Any, xpaths: Optional[List[str]] = None) -
found = doc.xpath(xp)
if found:
return list(found), xp
except Exception:
except Exception as exc:
debug("Failed to execute xpath %s: %s", xp, exc, exc_info=True)
continue
return [], None
@@ -214,7 +216,8 @@ def extract_records(doc_or_html: Any, base_url: Optional[str] = None, xpaths: Op
df = max(dfs, key=lambda d: getattr(d, "shape", (len(getattr(d, 'index', [])), 0))[0])
try:
rows = df.to_dict("records")
except Exception:
except Exception as exc:
debug("pandas HTML table parse: df.to_dict('records') failed: %s", exc, exc_info=True)
# Some DataFrame-like objects may have slightly different APIs
rows = [dict(r) for r in df]
@@ -240,13 +243,13 @@ def extract_records(doc_or_html: Any, base_url: Optional[str] = None, xpaths: Op
href = anchors.get(rec["title"])
if href:
rec["path"] = urljoin(base_url, href) if base_url else href
except Exception:
pass
except Exception as exc:
debug("pandas: failed to recover anchor hrefs for table rows: %s", exc, exc_info=True)
return records, "pandas"
except Exception:
except Exception as exc:
# Pandas not present or parsing failed; fall back to node parsing
pass
debug("pandas: not available or parsing failed: %s", exc, exc_info=True)
# Fallback to node-based parsing
nodes, chosen = find_candidate_nodes(doc_or_html, xpaths=xpaths)