h
This commit is contained in:
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
||||
from lxml import html as lxml_html
|
||||
from urllib.parse import urljoin
|
||||
import re
|
||||
from SYS.logger import debug
|
||||
|
||||
# Default xpaths for candidate result containers
|
||||
_DEFAULT_XPATHS = [
|
||||
@@ -50,8 +51,8 @@ def _text_or_img_title(el) -> str:
|
||||
imgs = el.xpath('.//img/@title')
|
||||
if imgs:
|
||||
return str(imgs[0]).strip()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as exc:
|
||||
debug("Failed to retrieve img title from element: %s", exc, exc_info=True)
|
||||
return (el.text_content() or "").strip()
|
||||
|
||||
|
||||
@@ -66,7 +67,8 @@ def find_candidate_nodes(doc_or_html: Any, xpaths: Optional[List[str]] = None) -
|
||||
found = doc.xpath(xp)
|
||||
if found:
|
||||
return list(found), xp
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
debug("Failed to execute xpath %s: %s", xp, exc, exc_info=True)
|
||||
continue
|
||||
return [], None
|
||||
|
||||
@@ -214,7 +216,8 @@ def extract_records(doc_or_html: Any, base_url: Optional[str] = None, xpaths: Op
|
||||
df = max(dfs, key=lambda d: getattr(d, "shape", (len(getattr(d, 'index', [])), 0))[0])
|
||||
try:
|
||||
rows = df.to_dict("records")
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
debug("pandas HTML table parse: df.to_dict('records') failed: %s", exc, exc_info=True)
|
||||
# Some DataFrame-like objects may have slightly different APIs
|
||||
rows = [dict(r) for r in df]
|
||||
|
||||
@@ -240,13 +243,13 @@ def extract_records(doc_or_html: Any, base_url: Optional[str] = None, xpaths: Op
|
||||
href = anchors.get(rec["title"])
|
||||
if href:
|
||||
rec["path"] = urljoin(base_url, href) if base_url else href
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as exc:
|
||||
debug("pandas: failed to recover anchor hrefs for table rows: %s", exc, exc_info=True)
|
||||
|
||||
return records, "pandas"
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
# Pandas not present or parsing failed; fall back to node parsing
|
||||
pass
|
||||
debug("pandas: not available or parsing failed: %s", exc, exc_info=True)
|
||||
|
||||
# Fallback to node-based parsing
|
||||
nodes, chosen = find_candidate_nodes(doc_or_html, xpaths=xpaths)
|
||||
|
||||
Reference in New Issue
Block a user