144 lines
4.7 KiB
Python
144 lines
4.7 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from API.loc import LOCClient
|
|
from ProviderCore.base import Provider, SearchResult
|
|
from SYS.cli_syntax import get_free_text, parse_query
|
|
from SYS.logger import log
|
|
|
|
|
|
class LOC(Provider):
    """LoC search provider.

    Currently implements Chronicling America collection search via the LoC JSON API.
    """

    URL_DOMAINS = ["www.loc.gov"]
    URL = URL_DOMAINS

    def validate(self) -> bool:
        """Return True unconditionally; no configuration is checked here."""
        return True

    @staticmethod
    def _item_to_result(it: Dict[str, Any]) -> SearchResult:
        """Convert one raw result dict from the search payload into a SearchResult."""
        title = str(it.get("title") or "").strip() or "(untitled)"
        date = str(it.get("date") or "").strip()

        # Prefer the item's own URL; fall back to the first "aka" entry.
        url = str(it.get("url") or "").strip()
        aka = it.get("aka")
        if not url and isinstance(aka, list) and aka:
            url = str(aka[0] or "").strip()

        formats = it.get("online_format")
        if isinstance(formats, list):
            fmt_text = ", ".join(str(x) for x in formats if x)
        else:
            fmt_text = str(formats or "").strip()

        # The last "partof" entry is used as the source label when present.
        partof = it.get("partof")
        if isinstance(partof, list) and partof:
            source = str(partof[-1] or "").strip()
        else:
            source = "Chronicling America"

        detail = " — ".join(part for part in (date, source) if part)
        annotations: List[str] = [part for part in (date, fmt_text) if part]

        return SearchResult(
            table="loc",
            title=title,
            path=url or title,
            detail=detail,
            annotations=annotations,
            media_kind="document",
            columns=[
                ("Title", title),
                ("Date", date),
                ("Format", fmt_text),
                ("URL", url),
            ],
            full_metadata=it,
        )

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search Chronicling America and return up to ``limit`` results.

        Args:
            query: Raw query string. It is parsed with ``parse_query``; an
                explicit ``q`` field overrides the free text, and every other
                parsed field is forwarded to the client as an extra parameter.
            limit: Maximum number of results to return. ``None`` falls back to
                the default of 50; zero or negative values yield no results.
            filters: Optional extra parameters forwarded to the client. Parsed
                query fields with the same key take precedence.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The collected results, or an empty list when the query is empty,
            the limit is unusable, or the client call / result conversion
            raises (the error is logged, not propagated).
        """
        _ = kwargs

        parsed = parse_query(query or "")
        text = str(get_free_text(parsed) or "").strip()

        # Normalise ``fields`` up front so the ``q`` lookup below cannot fail
        # on a non-dict value coming back from the parser.
        fields = parsed.get("fields") if isinstance(parsed, dict) else None
        if not isinstance(fields, dict):
            fields = {}

        # Allow explicit q: override.
        q = str(fields.get("q") or text or "").strip()
        if not q:
            return []

        # Resolve the limit once instead of re-converting it inside the loop.
        if limit is None:
            limit = 50
        try:
            max_results = int(limit)
        except (TypeError, ValueError, OverflowError) as exc:
            log(f"[loc] search failed: {exc}")
            return []
        if max_results <= 0:
            return []

        # Pass through any extra filters supported by the LoC API.
        # Parsed query fields are applied last so they win over ``filters``.
        extra: Dict[str, Any] = dict(filters) if isinstance(filters, dict) else {}
        for key, value in fields.items():
            if key == "q":
                continue
            extra[str(key)] = value

        client = LOCClient()

        results: List[SearchResult] = []
        start = 1
        page_size = max(1, min(max_results, 50))
        try:
            while len(results) < max_results:
                payload = client.search_chronicling_america(
                    q,
                    start=start,
                    count=page_size,
                    extra_params=extra,
                )
                items = payload.get("results")
                if not isinstance(items, list) or not items:
                    break

                for it in items:
                    if not isinstance(it, dict):
                        continue
                    results.append(self._item_to_result(it))
                    if len(results) >= max_results:
                        break

                # A short page means there is nothing further to fetch.
                if len(items) < page_size:
                    break

                # LoC API pagination uses sp (1-based start index).
                # NOTE(review): loc.gov documents ``sp`` as a page number;
                # confirm LOCClient maps ``start`` to the right parameter.
                start += len(items)
        except Exception as exc:
            # Best-effort provider: any failure yields no results at all,
            # including results already gathered from earlier pages.
            log(f"[loc] search failed: {exc}")
            return []

        return results
|