Files
Medios-Macina/Provider/loc.py

143 lines
4.6 KiB
Python
Raw Normal View History

2025-12-25 16:02:46 -08:00
from __future__ import annotations
from typing import Any, Dict, List, Optional
from API.loc import LOCClient
from ProviderCore.base import Provider, SearchResult
from SYS.cli_syntax import get_free_text, parse_query
2025-12-25 16:02:46 -08:00
from SYS.logger import log
class LOC(Provider):
    """LoC search provider.

    Currently implements Chronicling America collection search via the LoC
    JSON API.
    """

    URL_DOMAINS = ["www.loc.gov"]

    def validate(self) -> bool:
        """Report the provider as usable; nothing is checked here."""
        return True

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search Chronicling America and return up to ``limit`` results.

        Args:
            query: Raw query string. It is run through ``parse_query``; an
                explicit ``q`` field takes precedence over the free text.
            limit: Maximum number of results to return. Also used (capped at
                50) as the page size requested from the client.
            filters: Optional extra request parameters. Parsed query fields
                other than ``q`` are merged on top and win on key clashes.
            **kwargs: Accepted and ignored.

        Returns:
            A list of ``SearchResult`` rows. The list is empty when the query
            has no search text, when ``limit`` is not positive, or when any
            exception is raised while searching (the error is logged).
        """
        _ = kwargs
        parsed = parse_query(query or "")
        text = get_free_text(parsed).strip()
        fields = parsed.get("fields", {}) if isinstance(parsed, dict) else {}
        # Guard against a parser that hands back a non-dict "fields" value;
        # everything below treats it as a mapping.
        if not isinstance(fields, dict):
            fields = {}

        # Allow explicit q: override.
        q = str(fields.get("q") or text or "").strip()
        if not q:
            return []

        # Pass through any extra filters supported by the LoC API. Caller
        # filters go in first so that fields typed in the query override them.
        extra: Dict[str, Any] = {}
        if isinstance(filters, dict):
            extra.update(filters)
        extra.update({str(k): v for k, v in fields.items() if k != "q"})

        client = LOCClient()
        results: List[SearchResult] = []
        start = 1
        try:
            # Normalise the limit once; a value int() cannot convert raises
            # here and is reported by the handler below.
            wanted = max(0, int(limit))
            page_size = max(1, min(wanted, 50))
            while len(results) < wanted:
                payload = client.search_chronicling_america(
                    q,
                    start=start,
                    count=page_size,
                    extra_params=extra,
                )
                items = payload.get("results")
                if not isinstance(items, list) or not items:
                    break
                for it in items:
                    if not isinstance(it, dict):
                        continue
                    results.append(self._item_to_result(it))
                    if len(results) >= wanted:
                        break
                # A short page means there is nothing further to fetch.
                if len(items) < page_size:
                    break
                # LoC API pagination uses sp (1-based start index).
                # NOTE(review): confirm LOCClient treats ``start`` as an item
                # index rather than a page number.
                start += len(items)
        except Exception as exc:
            log(f"[loc] search failed: {exc}")
            return []
        return results

    @staticmethod
    def _item_to_result(it: Dict[str, Any]) -> SearchResult:
        """Convert one raw result dict from the API payload into a SearchResult."""
        title = str(it.get("title") or "").strip() or "(untitled)"
        date = str(it.get("date") or "").strip()

        # Fall back to the first "aka" entry when the item carries no url.
        url = str(it.get("url") or "").strip()
        aka = it.get("aka")
        if not url and isinstance(aka, list) and aka:
            url = str(aka[0] or "").strip()

        formats = it.get("online_format")
        if isinstance(formats, list):
            fmt_text = ", ".join(str(x) for x in formats if x)
        else:
            fmt_text = str(formats or "").strip()

        # The last "partof" entry is used as the source label.
        partof = it.get("partof")
        if isinstance(partof, list) and partof:
            source = str(partof[-1] or "").strip()
        else:
            source = "Chronicling America"

        # Join with a visible separator so date and source do not run together.
        detail = " - ".join(part for part in (date, source) if part)
        annotations: List[str] = [part for part in (date, fmt_text) if part]

        return SearchResult(
            table="loc",
            title=title,
            path=url or title,
            detail=detail,
            annotations=annotations,
            media_kind="document",
            columns=[
                ("Title", title),
                ("Date", date),
                ("Format", fmt_text),
                ("URL", url),
            ],
            full_metadata=it,
        )