rdf

2025-12-25 16:02:46 -08:00
parent 6a592f873c
commit 1ec2b313df
5 changed files with 224 additions and 5 deletions
--- a/API/loc.py
+++ b/API/loc.py
@@ -0,0 +1,80 @@
+"""Library of Congress (LoC) API helpers.
+
+This module currently focuses on the LoC JSON API endpoint for the
+Chronicling America collection.
+
+Docs:
+- https://www.loc.gov/apis/
+- https://www.loc.gov/apis/json-and-yaml/
+
+The LoC JSON API does not require an API key.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, Optional
+
+from API.HTTP import HTTPClient
+
+
+class LOCError(Exception):
+    pass
+
+
+class LOCClient:
+    """Minimal client for the public LoC JSON API."""
+
+    BASE_URL = "https://www.loc.gov"
+
+    def __init__(self, *, timeout: float = 20.0):
+        self.timeout = float(timeout)
+
+    def _get_json(self, path: str, params: Dict[str, Any]) -> Dict[str, Any]:
+        url = self.BASE_URL.rstrip("/") + "/" + str(path or "").lstrip("/")
+        try:
+            with HTTPClient(timeout=self.timeout) as client:
+                resp = client.get(url, params=params)
+                resp.raise_for_status()
+                # httpx.Response.json() exists but keep decoding consistent
+                return json.loads(resp.content.decode("utf-8"))
+        except Exception as exc:
+            raise LOCError(str(exc)) from exc
+
+    def search_chronicling_america(
+        self,
+        query: str,
+        *,
+        start: int = 1,
+        count: int = 25,
+        extra_params: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Search the Chronicling America collection via LoC JSON API.
+
+        Args:
+            query: Free-text query.
+            start: 1-based start index (LoC uses `sp`).
+            count: Results per page (LoC uses `c`).
+            extra_params: Additional LoC API params (facets, filters, etc.).
+
+        Returns:
+            Parsed JSON response.
+        """
+
+        q = str(query or "").strip()
+        if not q:
+            return {"results": []}
+
+        params: Dict[str, Any] = {
+            "q": q,
+            "fo": "json",
+            "c": int(count) if int(count) > 0 else 25,
+            "sp": int(start) if int(start) > 0 else 1,
+        }
+        if extra_params:
+            for k, v in extra_params.items():
+                if v is None:
+                    continue
+                params[str(k)] = v
+
+        return self._get_json("/collections/chronicling-america/", params)
--- a/Provider/loc.py
+++ b/Provider/loc.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from API.loc import LOCClient
+from ProviderCore.base import Provider, SearchResult
+from cli_syntax import get_free_text, parse_query
+from SYS.logger import log
+
+
+class LOC(Provider):
+    """LoC search provider.
+
+    Currently implements Chronicling America collection search via the LoC JSON API.
+    """
+
+    URL_DOMAINS = ["www.loc.gov"]
+
+    def validate(self) -> bool:
+        return True
+
+    def search(
+        self,
+        query: str,
+        limit: int = 50,
+        filters: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> List[SearchResult]:
+        _ = kwargs
+        parsed = parse_query(query or "")
+        text = get_free_text(parsed).strip()
+        fields = parsed.get("fields", {}) if isinstance(parsed, dict) else {}
+
+        # Allow explicit q: override.
+        q = str(fields.get("q") or text or "").strip()
+        if not q:
+            return []
+
+        # Pass through any extra filters supported by the LoC API.
+        extra: Dict[str, Any] = {}
+        if isinstance(filters, dict):
+            extra.update(filters)
+        if isinstance(fields, dict):
+            for k, v in fields.items():
+                if k == "q":
+                    continue
+                extra[str(k)] = v
+
+        client = LOCClient()
+
+        results: List[SearchResult] = []
+        start = 1
+        page_size = 25
+        try:
+            if limit and limit > 0:
+                page_size = max(1, min(int(limit), 50))
+
+            while len(results) < max(0, int(limit)):
+                payload = client.search_chronicling_america(q, start=start, count=page_size, extra_params=extra)
+                items = payload.get("results")
+                if not isinstance(items, list) or not items:
+                    break
+
+                for it in items:
+                    if not isinstance(it, dict):
+                        continue
+
+                    title = str(it.get("title") or "").strip() or "(untitled)"
+                    date = str(it.get("date") or "").strip()
+                    url = str(it.get("url") or "").strip()
+                    aka = it.get("aka")
+                    if (not url) and isinstance(aka, list) and aka:
+                        url = str(aka[0] or "").strip()
+
+                    formats = it.get("online_format")
+                    if isinstance(formats, list):
+                        fmt_text = ", ".join([str(x) for x in formats if x])
+                    else:
+                        fmt_text = str(formats or "").strip()
+
+                    partof = it.get("partof")
+                    if isinstance(partof, list) and partof:
+                        source = str(partof[-1] or "").strip()
+                    else:
+                        source = "Chronicling America"
+
+                    detail_parts = []
+                    if date:
+                        detail_parts.append(date)
+                    if source:
+                        detail_parts.append(source)
+                    detail = " — ".join(detail_parts)
+
+                    annotations: List[str] = []
+                    if date:
+                        annotations.append(date)
+                    if fmt_text:
+                        annotations.append(fmt_text)
+
+                    results.append(
+                        SearchResult(
+                            table="loc",
+                            title=title,
+                            path=url or title,
+                            detail=detail,
+                            annotations=annotations,
+                            media_kind="document",
+                            columns=[
+                                ("Title", title),
+                                ("Date", date),
+                                ("Format", fmt_text),
+                                ("URL", url),
+                            ],
+                            full_metadata=it,
+                        )
+                    )
+                    if len(results) >= int(limit):
+                        break
+
+                # LoC API pagination uses sp (1-based start index).
+                if len(items) < page_size:
+                    break
+                start += len(items)
+
+        except Exception as exc:
+            log(f"[loc] search failed: {exc}")
+            return []
+
+        return results
--- a/ProviderCore/registry.py
+++ b/ProviderCore/registry.py
@@ -23,6 +23,7 @@ from Provider.telegram import Telegram
 from Provider.youtube import YouTube
 from Provider.fileio import FileIO
 from Provider.zeroxzero import ZeroXZero
+from Provider.loc import LOC


 _PROVIDERS: Dict[str, Type[Provider]] = {
@@ -34,6 +35,7 @@ _PROVIDERS: Dict[str, Type[Provider]] = {
    "bandcamp": Bandcamp,
    "youtube": YouTube,
    "telegram": Telegram,
+    "loc": LOC,
    # Upload-capable providers
    "0x0": ZeroXZero,
    "file.io": FileIO,
--- a/cmdlet/search_provider.py
+++ b/cmdlet/search_provider.py
@@ -31,10 +31,10 @@ class Search_Provider(Cmdlet):
    def __init__(self):
        super().__init__(
            name="search-provider",
-            summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid)",
+            summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid, loc)",
            usage="search-provider -provider <provider> <query> [-limit N] [-open ID]",
            arg=[
-                CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid"),
+                CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid, loc"),
                CmdletArg("query", type="string", required=True, description="Search query (supports provider-specific syntax)"),
                CmdletArg("limit", type="int", description="Maximum results to return (default: 50)"),
                CmdletArg("open", type="int", description="(alldebrid) Open folder/magnet by ID and list its files"),
@@ -48,6 +48,8 @@ class Search_Provider(Cmdlet):
                "  Example: search-provider -provider bandcamp \"artist:altrusian grace\"",
                "- libgen: Search Library Genesis for books",
                "  Example: search-provider -provider libgen \"python programming\"",
+                "- loc: Search Library of Congress (Chronicling America)",
+                "  Example: search-provider -provider loc \"lincoln\"",
                "- soulseek: Search P2P network for music",
                "  Example: search-provider -provider soulseek \"pink floyd\"",
                "- youtube: Search YouTube for videos",
@@ -182,6 +184,8 @@ class Search_Provider(Cmdlet):
                provider_label = "Youtube"
            elif provider_lower == "openlibrary":
                provider_label = "OpenLibrary"
+            elif provider_lower == "loc":
+                provider_label = "LoC"
            else:
                provider_label = provider_text[:1].upper() + provider_text[1:] if provider_text else "Provider"

@@ -189,7 +193,7 @@ class Search_Provider(Cmdlet):
                table_title = f"{provider_label} Files: {open_id}".strip().rstrip(":")
            else:
                table_title = f"{provider_label}: {query}".strip().rstrip(":")
-            preserve_order = provider_name.lower() in ("youtube", "openlibrary")
+            preserve_order = provider_name.lower() in ("youtube", "openlibrary", "loc")
            table = ResultTable(table_title).set_preserve_order(preserve_order)
            table.set_table(provider_name)
            table.set_source_command("search-provider", list(args))
--- a/docs/hydrusnetwork.md
+++ b/docs/hydrusnetwork.md
@@ -31,7 +31,11 @@ click apply
 --------------------------------------------
 edit the below and place in your config.conf

-[store=hydrusnetwork]
+<figure>
+  <figcaption>config.conf</figcaption>
+  <pre><code class="language-powershell">[store=hydrusnetwork]
 NAME="shortnamenospacesorsymbols"
 API="apiaccesskeygoeshere"
 URL="apibaseurlgoeshere"
+</code></pre>
+</figure>