"""Smart downloader front-door.
|
||
|
|
|
||
|
|
Currently focused on Internet Archive item pages:
|
||
|
|
- Takes a piped InternetArchive search-provider row (table=internetarchive) or an archive.org details URL
|
||
|
|
- Displays a selectable table of available files/formats (PDF/ZIP/OCR/etc)
|
||
|
|
- Selecting a row via @N expands to download-file <direct-url>
|
||
|
|
|
||
|
|
This enables:
|
||
|
|
search-provider -provider internetarchive "..."
|
||
|
|
@3 # shows formats table
|
||
|
|
@2 | add-file ... # downloads selected file then pipes to add-file
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations

import re
import sys
from typing import Any, Dict, List, Sequence, cast
from urllib.parse import quote

from SYS.logger import log, debug
import pipeline as pipeline_context
from result_table import ResultTable

from . import _shared as sh

Cmdlet = sh.Cmdlet
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field


def _extract_ia_identifier(text: str) -> str:
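    """Return the Internet Archive identifier found in *text*, or "" if none.

    Accepts archive.org details/download URLs and "internetarchive:<identifier>" strings.
    """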
    s = str(text or "").strip()
    if not s:
        return ""

    # https://archive.org/details/<identifier>
    m = re.search(r"archive\.org/(?:details|download)/([^/?#\s]+)", s, flags=re.IGNORECASE)
    if m:
        return str(m.group(1) or "").strip()

    # internetarchive:<identifier>
    if s.lower().startswith("internetarchive:"):
        return s.split(":", 1)[-1].strip()

    return ""


class Download_Data(Cmdlet):
    def __init__(self) -> None:
        super().__init__(
            name="download-data",
            summary="List downloadable files/formats for provider items (e.g., Internet Archive)",
            usage="download-data <url> OR @N | download-data (provider item), then select a file with @N",
            alias=[],
            arg=[SharedArgs.URL],
            detail=[
                "For Internet Archive item pages, shows a selectable list of available files (PDF/ZIP/OCR/etc).",
                "Select a file row with @N to run download-file on that direct URL.",
            ],
            exec=self.run,
        )
        self.register()

    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
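        """Resolve the target from piped rows or the url argument and dispatch to a provider handler."""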
        try:
            # parse_cmdlet_args typing varies across cmdlets; keep runtime behavior.
            parsed = parse_cmdlet_args(args, cast(Any, self))
        except Exception:
            parsed = {}

        raw_urls = parsed.get("url", [])
        if isinstance(raw_urls, str):
            raw_urls = [raw_urls]
        url_arg = str(raw_urls[0]).strip() if raw_urls else ""

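        # Normalize piped input into a list of items.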
        piped_items: List[Any] = []
        if isinstance(result, list):
            piped_items = list(result)
        elif result is not None:
            piped_items = [result]

        # Prefer piped item target if present.
        target = ""
        if piped_items:
            target = str(get_field(piped_items[0], "path") or get_field(piped_items[0], "url") or "").strip()
        if not target:
            target = url_arg

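        # Provider rows carry the table they came from (e.g. "internetarchive").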
        table_name = ""
        try:
            table_name = str(get_field(piped_items[0], "table") or "").strip().lower() if piped_items else ""
        except Exception:
            table_name = ""

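        # Resolve the IA identifier from row metadata, falling back to parsing the target URL.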
        identifier = ""
        if piped_items:
            md = get_field(piped_items[0], "full_metadata")
            if isinstance(md, dict):
                identifier = str(md.get("identifier") or "").strip()
        if not identifier:
            identifier = _extract_ia_identifier(target)

        if table_name == "internetarchive" or ("archive.org" in target.lower() and identifier):
            return self._run_internetarchive(piped_items[0] if piped_items else None, identifier=identifier)

        log("download-data: unsupported target (currently only Internet Archive item pages are supported)", file=sys.stderr)
        return 1

    @staticmethod
    def _run_internetarchive(item: Any, *, identifier: str) -> int:
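        """List an Internet Archive item's downloadable files as a selectable table; return 0 on success, 1 on failure."""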
        try:
            from Provider.internetarchive import _ia as _ia_loader
        except Exception as exc:
            log(f"download-data: Internet Archive provider unavailable: {exc}", file=sys.stderr)
            return 1

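        # Filter out IA bookkeeping entries (item metadata, torrents, thumbnails) so the picker only shows real content files.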
        def _is_ia_metadata_file(f: Dict[str, Any]) -> bool:
            try:
                source = str(f.get("source") or "").strip().lower()
                fmt = str(f.get("format") or "").strip().lower()
            except Exception:
                source = ""
                fmt = ""

            if source == "metadata":
                return True
            if fmt in {"metadata", "archive bittorrent"}:
                return True
            if fmt.startswith("thumbnail"):
                return True
            return False

        ia = None
        try:
            ia = _ia_loader()
        except Exception as exc:
            log(f"download-data: Internet Archive module unavailable: {exc}", file=sys.stderr)
            return 1

        try:
            get_item = getattr(ia, "get_item", None)
            if not callable(get_item):
                raise Exception("internetarchive.get_item is not available")
            ia_item = cast(Any, get_item(str(identifier)))
        except Exception as exc:
            log(f"download-data: Internet Archive item lookup failed: {exc}", file=sys.stderr)
            return 1

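        # Gather file entries: prefer the item's raw "files" dicts, then fall back to get_files() objects.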
        files: List[Dict[str, Any]] = []
        try:
            raw_files = getattr(ia_item, "files", None)
            if isinstance(raw_files, list):
                for f in raw_files:
                    if isinstance(f, dict):
                        files.append(f)
        except Exception:
            files = []

        if not files:
            try:
                for f in ia_item.get_files():
                    name = getattr(f, "name", None)
                    if not name and isinstance(f, dict):
                        name = f.get("name")
                    if not name:
                        continue
                    files.append(
                        {
                            "name": str(name),
                            "size": getattr(f, "size", None),
                            "format": getattr(f, "format", None),
                            "source": getattr(f, "source", None),
                        }
                    )
            except Exception:
                files = []

        if not files:
            log("download-data: Internet Archive item has no files", file=sys.stderr)
            return 1

        # Prefer non-metadata files for the picker.
        candidates = [f for f in files if not _is_ia_metadata_file(f)]
        if not candidates:
            candidates = list(files)

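        # Stable ordering for the picker: sort case-insensitively by (format, name).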
        def _key(f: Dict[str, Any]) -> tuple[str, str]:
            fmt = str(f.get("format") or "").strip().lower()
            name = str(f.get("name") or "").strip().lower()
            return (fmt, name)

        candidates.sort(key=_key)

        title = ""
        try:
            title = str(get_field(item, "title") or "").strip()
        except Exception:
            title = ""

        table_title = f"Internet Archive: {title}".strip().rstrip(":")
        if not title:
            table_title = f"Internet Archive: {identifier}".strip().rstrip(":")

        table = ResultTable(table_title).set_preserve_order(True)
        table.set_table("internetarchive.formats")
        # Selecting a row should expand to `download-file <direct-url>`.
        table.set_source_command("download-file", [])

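        # Build one selectable row per file, each carrying a direct archive.org download URL.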
        rows: List[Dict[str, Any]] = []
        for f in candidates:
            name = str(f.get("name") or "").strip()
            if not name:
                continue

            fmt = str(f.get("format") or "").strip()
            src = str(f.get("source") or "").strip()

            size_val: Any = f.get("size")
            try:
                size_val = int(size_val) if size_val not in (None, "") else ""
            except Exception:
                # Keep as-is; ResultTable will stringify.
                pass

            direct_url = f"https://archive.org/download/{identifier}/{quote(name, safe='')}"

            row_item: Dict[str, Any] = {
                "table": "internetarchive",
                "title": fmt or name,
                "path": direct_url,
                "url": direct_url,
                "columns": [
                    ("Format", fmt),
                    ("Name", name),
                    ("Size", size_val),
                    ("Source", src),
                ],
                # Used by @N expansion: download-file <direct-url>
                "_selection_args": [direct_url],
                "full_metadata": {
                    "identifier": identifier,
                    "name": name,
                    "format": fmt,
                    "source": src,
                    "size": f.get("size"),
                },
            }

            rows.append(row_item)
            table.add_result(row_item)

        if not rows:
            log("download-data: no downloadable files found for this item", file=sys.stderr)
            return 1

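        # Register the table with the pipeline so @N selections expand to `download-file <direct-url>`.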
        try:
            pipeline_context.set_last_result_table(table, rows, subject=item)
            pipeline_context.set_current_stage_table(table)
        except Exception as exc:
            debug(f"[download-data] Failed to register result table: {exc}")

        return 0


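# Instantiate the cmdlet at import time; __init__ registers it via self.register().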
CMDLET = Download_Data()