rdf

2025-12-25 16:02:46 -08:00
parent 6a592f873c
commit 1ec2b313df
5 changed files with 224 additions and 5 deletions
--- a/API/loc.py
+++ b/API/loc.py
@@ -0,0 +1,80 @@
 """Library of Congress (LoC) API helpers.
 This module currently focuses on the LoC JSON API endpoint for the
 Chronicling America collection.
 Docs:
 - https://www.loc.gov/apis/
 - https://www.loc.gov/apis/json-and-yaml/
 The LoC JSON API does not require an API key.
 """
 from __future__ import annotations
 import json
 from typing import Any, Dict, Optional
 from API.HTTP import HTTPClient
 class LOCError(Exception):
    """Error raised by the LoC API helpers when a request cannot be completed."""
 class LOCClient:
    """Minimal client for the public LoC JSON API.

    Each request opens a short-lived ``HTTPClient`` configured with the
    timeout given at construction time. Failures are reported as
    ``LOCError``.
    """

    BASE_URL = "https://www.loc.gov"

    def __init__(self, *, timeout: float = 20.0):
        """Store the per-request timeout (coerced to ``float``)."""
        self.timeout = float(timeout)

    def _get_json(self, path: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET ``path`` relative to ``BASE_URL`` and return the decoded JSON object.

        Args:
            path: URL path; leading slashes are normalised.
            params: Query-string parameters passed to the HTTP client.

        Returns:
            The parsed JSON object.

        Raises:
            LOCError: If the request, status check, UTF-8 decoding or JSON
                parsing fails, or if the payload is not a JSON object.
        """
        url = self.BASE_URL.rstrip("/") + "/" + str(path or "").lstrip("/")
        try:
            with HTTPClient(timeout=self.timeout) as client:
                resp = client.get(url, params=params)
                resp.raise_for_status()
                # httpx.Response.json() exists but keep decoding consistent
                payload = json.loads(resp.content.decode("utf-8"))
        except Exception as exc:
            raise LOCError(str(exc)) from exc
        # Callers use dict methods on the result, so reject valid JSON that
        # is not an object (e.g. a bare list) with the client's own error type.
        if not isinstance(payload, dict):
            raise LOCError(
                f"unexpected JSON payload type: {type(payload).__name__}"
            )
        return payload

    def search_chronicling_america(
        self,
        query: str,
        *,
        start: int = 1,
        count: int = 25,
        extra_params: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Search the Chronicling America collection via LoC JSON API.

        Args:
            query: Free-text query. A blank query short-circuits to an empty
                result set without issuing a request.
            start: 1-based page number, sent as the LoC ``sp`` parameter.
                Non-positive values fall back to 1.
            count: Results per page, sent as the LoC ``c`` parameter.
                Non-positive values fall back to 25.
            extra_params: Additional LoC API params (facets, filters, etc.).
                Entries whose value is ``None`` are dropped. The response
                format (``fo``) cannot be overridden.

        Returns:
            Parsed JSON response.

        Raises:
            LOCError: If the underlying request fails.
        """
        q = str(query or "").strip()
        if not q:
            return {"results": []}
        # Convert once, then apply the defaults for non-positive values.
        per_page = int(count)
        page = int(start)
        params: Dict[str, Any] = {
            "q": q,
            "fo": "json",
            "c": per_page if per_page > 0 else 25,
            "sp": page if page > 0 else 1,
        }
        if extra_params:
            for k, v in extra_params.items():
                if v is None:
                    continue
                params[str(k)] = v
        # This client only parses JSON, so a caller-supplied "fo" must not
        # switch the response to another format.
        params["fo"] = "json"
        return self._get_json("/collections/chronicling-america/", params)
--- a/Provider/loc.py
+++ b/Provider/loc.py
@@ -0,0 +1,129 @@
 from __future__ import annotations
 from typing import Any, Dict, List, Optional
 from API.loc import LOCClient
 from ProviderCore.base import Provider, SearchResult
 from cli_syntax import get_free_text, parse_query
 from SYS.logger import log
 class LOC(Provider):
    """LoC search provider.

    Currently implements Chronicling America collection search via the LoC JSON API.
    """

    URL_DOMAINS = ["www.loc.gov"]

    def validate(self) -> bool:
        """Report the provider as usable; no configuration is checked."""
        return True

    @staticmethod
    def _item_to_result(it: Dict[str, Any]) -> SearchResult:
        """Convert one LoC result dict into a ``SearchResult`` row."""
        title = str(it.get("title") or "").strip() or "(untitled)"
        date = str(it.get("date") or "").strip()
        url = str(it.get("url") or "").strip()
        # Fall back to the first "aka" entry when the item has no "url".
        aka = it.get("aka")
        if (not url) and isinstance(aka, list) and aka:
            url = str(aka[0] or "").strip()
        formats = it.get("online_format")
        if isinstance(formats, list):
            fmt_text = ", ".join([str(x) for x in formats if x])
        else:
            fmt_text = str(formats or "").strip()
        # Use the last "partof" entry as the source label when present.
        partof = it.get("partof")
        if isinstance(partof, list) and partof:
            source = str(partof[-1] or "").strip()
        else:
            source = "Chronicling America"
        detail = " — ".join([part for part in (date, source) if part])
        annotations: List[str] = [part for part in (date, fmt_text) if part]
        return SearchResult(
            table="loc",
            title=title,
            path=url or title,
            detail=detail,
            annotations=annotations,
            media_kind="document",
            columns=[
                ("Title", title),
                ("Date", date),
                ("Format", fmt_text),
                ("URL", url),
            ],
            full_metadata=it,
        )

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search Chronicling America and return up to ``limit`` results.

        Args:
            query: Query string, parsed with ``parse_query``. A ``q`` field
                overrides the free text; every other parsed field is passed
                to the LoC API as an extra parameter.
            limit: Maximum number of results. ``None`` falls back to 50;
                zero or negative values yield no results.
            filters: Extra LoC API parameters. Parsed query fields win when
                a key appears in both.
            **kwargs: Accepted and ignored.

        Returns:
            The collected results, or an empty list when the query is blank
            or the search fails (the failure is logged).
        """
        _ = kwargs
        parsed = parse_query(query or "")
        text = get_free_text(parsed).strip()
        fields = parsed.get("fields", {}) if isinstance(parsed, dict) else {}
        # Normalise before the first use so a non-dict "fields" value cannot
        # break the "q" lookup below.
        if not isinstance(fields, dict):
            fields = {}
        # Allow explicit q: override.
        q = str(fields.get("q") or text or "").strip()
        if not q:
            return []
        # Pass through any extra filters supported by the LoC API.
        extra: Dict[str, Any] = {}
        if isinstance(filters, dict):
            extra.update(filters)
        for k, v in fields.items():
            if k == "q":
                continue
            extra[str(k)] = v
        client = LOCClient()
        results: List[SearchResult] = []
        try:
            # A missing limit falls back to the signature default instead of
            # failing in int() and discarding the whole search.
            max_results = 50 if limit is None else max(0, int(limit))
            page_size = max(1, min(max_results, 50)) if max_results else 25
            page = 1
            while len(results) < max_results:
                payload = client.search_chronicling_america(
                    q,
                    start=page,
                    count=page_size,
                    extra_params=extra,
                )
                items = payload.get("results")
                if not isinstance(items, list) or not items:
                    break
                for it in items:
                    if not isinstance(it, dict):
                        continue
                    results.append(self._item_to_result(it))
                    if len(results) >= max_results:
                        break
                # A short page means there is nothing further to fetch.
                if len(items) < page_size:
                    break
                # LoC's `sp` parameter is a 1-based page number, not an item
                # offset, so advance one page at a time.
                page += 1
        except Exception as exc:
            log(f"[loc] search failed: {exc}")
            return []
        return results
--- a/ProviderCore/registry.py
+++ b/ProviderCore/registry.py
@@ -23,6 +23,7 @@ from Provider.telegram import Telegram
 from Provider.youtube import YouTube
 from Provider.fileio import FileIO
 from Provider.zeroxzero import ZeroXZero
 from Provider.loc import LOC
 _PROVIDERS: Dict[str, Type[Provider]] = {
@@ -34,6 +35,7 @@ _PROVIDERS: Dict[str, Type[Provider]] = {
    "bandcamp": Bandcamp,
    "youtube": YouTube,
    "telegram": Telegram,
    "loc": LOC,
    # Upload-capable providers
    "0x0": ZeroXZero,
    "file.io": FileIO,
--- a/cmdlet/search_provider.py
+++ b/cmdlet/search_provider.py
@@ -31,10 +31,10 @@ class Search_Provider(Cmdlet):
    def __init__(self):
        super().__init__(
            name="search-provider",
-            summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid)",
+            summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid, loc)",
            usage="search-provider -provider <provider> <query> [-limit N] [-open ID]",
            arg=[
-                CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid"),
+                CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid, loc"),
                CmdletArg("query", type="string", required=True, description="Search query (supports provider-specific syntax)"),
                CmdletArg("limit", type="int", description="Maximum results to return (default: 50)"),
                CmdletArg("open", type="int", description="(alldebrid) Open folder/magnet by ID and list its files"),
@@ -48,6 +48,8 @@ class Search_Provider(Cmdlet):
                "  Example: search-provider -provider bandcamp \"artist:altrusian grace\"",
                "- libgen: Search Library Genesis for books",
                "  Example: search-provider -provider libgen \"python programming\"",
                "- loc: Search Library of Congress (Chronicling America)",
                "  Example: search-provider -provider loc \"lincoln\"",
                "- soulseek: Search P2P network for music",
                "  Example: search-provider -provider soulseek \"pink floyd\"",
                "- youtube: Search YouTube for videos",
@@ -182,6 +184,8 @@ class Search_Provider(Cmdlet):
                provider_label = "Youtube"
            elif provider_lower == "openlibrary":
                provider_label = "OpenLibrary"
            elif provider_lower == "loc":
                provider_label = "LoC"
            else:
                provider_label = provider_text[:1].upper() + provider_text[1:] if provider_text else "Provider"
@@ -189,7 +193,7 @@ class Search_Provider(Cmdlet):
                table_title = f"{provider_label} Files: {open_id}".strip().rstrip(":")
            else:
                table_title = f"{provider_label}: {query}".strip().rstrip(":")
-            preserve_order = provider_name.lower() in ("youtube", "openlibrary")
+            preserve_order = provider_name.lower() in ("youtube", "openlibrary", "loc")
            table = ResultTable(table_title).set_preserve_order(preserve_order)
            table.set_table(provider_name)
            table.set_source_command("search-provider", list(args))
--- a/docs/hydrusnetwork.md
+++ b/docs/hydrusnetwork.md
@@ -31,7 +31,11 @@ click apply
 --------------------------------------------
 edit the below and place in your config.conf
-[store=hydrusnetwork]
+<figure>
  <figcaption>config.conf</figcaption>
  <pre><code class="language-powershell">[store=hydrusnetwork]
 NAME="shortnamenospacesorsymbols"
 API="apiaccesskeygoeshere"
-URL="apibaseurlgoeshere"
+URL="apibaseurlgoeshere"
 </code></pre>
 </figure>