From 1ec2b313dff155fece6d7ff6bc388a24e52df11e Mon Sep 17 00:00:00 2001 From: goyim nose Date: Thu, 25 Dec 2025 16:02:46 -0800 Subject: [PATCH] rdf --- API/loc.py | 80 +++++++++++++++++++++++ Provider/loc.py | 129 ++++++++++++++++++++++++++++++++++++++ ProviderCore/registry.py | 2 + cmdlet/search_provider.py | 10 ++- docs/hydrusnetwork.md | 8 ++- 5 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 API/loc.py create mode 100644 Provider/loc.py diff --git a/API/loc.py b/API/loc.py new file mode 100644 index 0000000..381658c --- /dev/null +++ b/API/loc.py @@ -0,0 +1,80 @@ +"""Library of Congress (LoC) API helpers. + +This module currently focuses on the LoC JSON API endpoint for the +Chronicling America collection. + +Docs: +- https://www.loc.gov/apis/ +- https://www.loc.gov/apis/json-and-yaml/ + +The LoC JSON API does not require an API key. +""" + +from __future__ import annotations + +import json +from typing import Any, Dict, Optional + +from API.HTTP import HTTPClient + + +class LOCError(Exception): + pass + + +class LOCClient: + """Minimal client for the public LoC JSON API.""" + + BASE_URL = "https://www.loc.gov" + + def __init__(self, *, timeout: float = 20.0): + self.timeout = float(timeout) + + def _get_json(self, path: str, params: Dict[str, Any]) -> Dict[str, Any]: + url = self.BASE_URL.rstrip("/") + "/" + str(path or "").lstrip("/") + try: + with HTTPClient(timeout=self.timeout) as client: + resp = client.get(url, params=params) + resp.raise_for_status() + # httpx.Response.json() exists but keep decoding consistent + return json.loads(resp.content.decode("utf-8")) + except Exception as exc: + raise LOCError(str(exc)) from exc + + def search_chronicling_america( + self, + query: str, + *, + start: int = 1, + count: int = 25, + extra_params: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """Search the Chronicling America collection via LoC JSON API. + + Args: + query: Free-text query. + start: 1-based start index (LoC uses `sp`). + count: Results per page (LoC uses `c`). + extra_params: Additional LoC API params (facets, filters, etc.). + + Returns: + Parsed JSON response. + """ + + q = str(query or "").strip() + if not q: + return {"results": []} + + params: Dict[str, Any] = { + "q": q, + "fo": "json", + "c": int(count) if int(count) > 0 else 25, + "sp": int(start) if int(start) > 0 else 1, + } + if extra_params: + for k, v in extra_params.items(): + if v is None: + continue + params[str(k)] = v + + return self._get_json("/collections/chronicling-america/", params) diff --git a/Provider/loc.py b/Provider/loc.py new file mode 100644 index 0000000..a1b9e5f --- /dev/null +++ b/Provider/loc.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +from API.loc import LOCClient +from ProviderCore.base import Provider, SearchResult +from cli_syntax import get_free_text, parse_query +from SYS.logger import log + + +class LOC(Provider): + """LoC search provider. + + Currently implements Chronicling America collection search via the LoC JSON API. + """ + + URL_DOMAINS = ["www.loc.gov"] + + def validate(self) -> bool: + return True + + def search( + self, + query: str, + limit: int = 50, + filters: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[SearchResult]: + _ = kwargs + parsed = parse_query(query or "") + text = get_free_text(parsed).strip() + fields = parsed.get("fields", {}) if isinstance(parsed, dict) else {} + + # Allow explicit q: override. + q = str(fields.get("q") or text or "").strip() + if not q: + return [] + + # Pass through any extra filters supported by the LoC API. + extra: Dict[str, Any] = {} + if isinstance(filters, dict): + extra.update(filters) + if isinstance(fields, dict): + for k, v in fields.items(): + if k == "q": + continue + extra[str(k)] = v + + client = LOCClient() + + results: List[SearchResult] = [] + start = 1 + page_size = 25 + try: + if limit and limit > 0: + page_size = max(1, min(int(limit), 50)) + + while len(results) < max(0, int(limit)): + payload = client.search_chronicling_america(q, start=start, count=page_size, extra_params=extra) + items = payload.get("results") + if not isinstance(items, list) or not items: + break + + for it in items: + if not isinstance(it, dict): + continue + + title = str(it.get("title") or "").strip() or "(untitled)" + date = str(it.get("date") or "").strip() + url = str(it.get("url") or "").strip() + aka = it.get("aka") + if (not url) and isinstance(aka, list) and aka: + url = str(aka[0] or "").strip() + + formats = it.get("online_format") + if isinstance(formats, list): + fmt_text = ", ".join([str(x) for x in formats if x]) + else: + fmt_text = str(formats or "").strip() + + partof = it.get("partof") + if isinstance(partof, list) and partof: + source = str(partof[-1] or "").strip() + else: + source = "Chronicling America" + + detail_parts = [] + if date: + detail_parts.append(date) + if source: + detail_parts.append(source) + detail = " — ".join(detail_parts) + + annotations: List[str] = [] + if date: + annotations.append(date) + if fmt_text: + annotations.append(fmt_text) + + results.append( + SearchResult( + table="loc", + title=title, + path=url or title, + detail=detail, + annotations=annotations, + media_kind="document", + columns=[ + ("Title", title), + ("Date", date), + ("Format", fmt_text), + ("URL", url), + ], + full_metadata=it, + ) + ) + if len(results) >= int(limit): + break + + # LoC API pagination uses sp (1-based start index). + if len(items) < page_size: + break + start += len(items) + + except Exception as exc: + log(f"[loc] search failed: {exc}") + return [] + + return results diff --git a/ProviderCore/registry.py b/ProviderCore/registry.py index 7f491dc..2640ecb 100644 --- a/ProviderCore/registry.py +++ b/ProviderCore/registry.py @@ -23,6 +23,7 @@ from Provider.telegram import Telegram from Provider.youtube import YouTube from Provider.fileio import FileIO from Provider.zeroxzero import ZeroXZero +from Provider.loc import LOC _PROVIDERS: Dict[str, Type[Provider]] = { @@ -34,6 +35,7 @@ _PROVIDERS: Dict[str, Type[Provider]] = { "bandcamp": Bandcamp, "youtube": YouTube, "telegram": Telegram, + "loc": LOC, # Upload-capable providers "0x0": ZeroXZero, "file.io": FileIO, diff --git a/cmdlet/search_provider.py b/cmdlet/search_provider.py index 80359fb..e8e91d4 100644 --- a/cmdlet/search_provider.py +++ b/cmdlet/search_provider.py @@ -31,10 +31,10 @@ class Search_Provider(Cmdlet): def __init__(self): super().__init__( name="search-provider", - summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid)", + summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid, loc)", usage="search-provider -provider [-limit N] [-open ID]", arg=[ - CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid"), + CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid, loc"), CmdletArg("query", type="string", required=True, description="Search query (supports provider-specific syntax)"), CmdletArg("limit", type="int", description="Maximum results to return (default: 50)"), CmdletArg("open", type="int", description="(alldebrid) Open folder/magnet by ID and list its files"), @@ -48,6 +48,8 @@ class Search_Provider(Cmdlet): " Example: search-provider -provider bandcamp \"artist:altrusian grace\"", "- libgen: Search Library Genesis for books", " Example: search-provider -provider libgen \"python programming\"", + "- loc: Search Library of Congress (Chronicling America)", + " Example: search-provider -provider loc \"lincoln\"", "- soulseek: Search P2P network for music", " Example: search-provider -provider soulseek \"pink floyd\"", "- youtube: Search YouTube for videos", @@ -182,6 +184,8 @@ class Search_Provider(Cmdlet): provider_label = "Youtube" elif provider_lower == "openlibrary": provider_label = "OpenLibrary" + elif provider_lower == "loc": + provider_label = "LoC" else: provider_label = provider_text[:1].upper() + provider_text[1:] if provider_text else "Provider" @@ -189,7 +193,7 @@ class Search_Provider(Cmdlet): table_title = f"{provider_label} Files: {open_id}".strip().rstrip(":") else: table_title = f"{provider_label}: {query}".strip().rstrip(":") - preserve_order = provider_name.lower() in ("youtube", "openlibrary") + preserve_order = provider_name.lower() in ("youtube", "openlibrary", "loc") table = ResultTable(table_title).set_preserve_order(preserve_order) table.set_table(provider_name) table.set_source_command("search-provider", list(args)) diff --git a/docs/hydrusnetwork.md b/docs/hydrusnetwork.md index d1e4e91..ca901ca 100644 --- a/docs/hydrusnetwork.md +++ b/docs/hydrusnetwork.md @@ -31,7 +31,11 @@ click apply -------------------------------------------- edit the below and place in your config.conf -[store=hydrusnetwork] +
+
config.conf
+
[store=hydrusnetwork]
 NAME="shortnamenospacesorsymbols"
 API="apiaccesskeygoeshere"
-URL="apibaseurlgoeshere"
\ No newline at end of file
+URL="apibaseurlgoeshere"
+
+