# Medios-Macina/cmdlet/download_data.py
# Repository viewer header: 268 lines, 9.1 KiB, Python; timestamp shown: 2025-12-27 14:50:59 -08:00
"""Smart downloader front-door.

Currently focused on Internet Archive item pages:
- Takes a piped InternetArchive search-provider row (table=internetarchive) or an archive.org details URL
- Displays a selectable table of available files/formats (PDF/ZIP/OCR/etc)
- Selecting a row via @N expands to download-file <direct-url>

This enables:

    search-provider -provider internetarchive "..."
    @3                  # shows formats table
    @2 | add-file ...   # downloads selected file then pipes to add-file
"""
from __future__ import annotations
import re
import sys
from typing import Any, Dict, List, Sequence, cast
from urllib.parse import quote
from SYS.logger import log, debug
import pipeline as pipeline_context
from result_table import ResultTable
from . import _shared as sh
# Short module-level aliases for the helpers this cmdlet uses from the
# package's `_shared` module (imported above as `sh`).
Cmdlet = sh.Cmdlet
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
def _extract_ia_identifier(text: str) -> str:
s = str(text or "").strip()
if not s:
return ""
# https://archive.org/details/<identifier>
m = re.search(r"archive\.org/(?:details|download)/([^/?#\s]+)", s, flags=re.IGNORECASE)
if m:
return str(m.group(1) or "").strip()
# internetarchive:<identifier>
if s.lower().startswith("internetarchive:"):
return s.split(":", 1)[-1].strip()
return ""
class Download_Data(Cmdlet):
    """Cmdlet that lists the downloadable files/formats of a provider item.

    Only Internet Archive items are handled. The item's files are shown in a
    result table; each row carries the file's direct download URL as its
    selection argument so that picking a row expands to ``download-file``.
    """

    def __init__(self) -> None:
        """Declare the cmdlet metadata and register the cmdlet."""
        super().__init__(
            name="download-data",
            summary="List downloadable files/formats for provider items (e.g., Internet Archive)",
            usage="download-data <url> OR @N | download-data (provider item), then select a file with @N",
            alias=[],
            arg=[SharedArgs.URL],
            detail=[
                "For Internet Archive item pages, shows a selectable list of available files (PDF/ZIP/OCR/etc).",
                "Select a file row with @N to run download-file on that direct URL.",
            ],
            exec=self.run,
        )
        self.register()

    def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
        """Resolve the target item and dispatch to the matching provider lister.

        The target is taken from the first piped item's ``path``/``url``
        field when present, otherwise from the first ``url`` argument.

        Args:
            result: Piped input; a list of items, a single item, or None.
            args: Raw command-line arguments for this cmdlet.
            config: Application configuration (not used by this cmdlet).

        Returns:
            0 when a file table was produced, 1 on any failure or when the
            target is not a supported provider item.
        """
        try:
            # parse_cmdlet_args typing varies across cmdlets; keep runtime behavior.
            parsed = parse_cmdlet_args(args, cast(Any, self))
        except Exception as exc:
            # Best effort: fall back to "no arguments" but leave a trace.
            debug(f"[download-data] Failed to parse arguments: {exc}")
            parsed = {}

        raw_urls = parsed.get("url", [])
        if isinstance(raw_urls, str):
            raw_urls = [raw_urls]
        url_arg = str(raw_urls[0]).strip() if raw_urls else ""

        piped_items: List[Any] = []
        if isinstance(result, list):
            piped_items = list(result)
        elif result is not None:
            piped_items = [result]
        first_item = piped_items[0] if piped_items else None

        # Prefer piped item target if present.
        target = ""
        if piped_items:
            target = str(get_field(first_item, "path") or get_field(first_item, "url") or "").strip()
        if not target:
            target = url_arg

        table_name = ""
        try:
            table_name = str(get_field(first_item, "table") or "").strip().lower() if piped_items else ""
        except Exception as exc:
            debug(f"[download-data] Failed to read table name from piped item: {exc}")
            table_name = ""

        # The piped row's metadata wins; otherwise parse the target text.
        identifier = ""
        if piped_items:
            md = get_field(first_item, "full_metadata")
            if isinstance(md, dict):
                identifier = str(md.get("identifier") or "").strip()
        if not identifier:
            identifier = _extract_ia_identifier(target)

        # Accept archive.org URLs and the "internetarchive:<identifier>"
        # shorthand that _extract_ia_identifier understands.
        lowered_target = target.lower()
        is_ia_target = "archive.org" in lowered_target or lowered_target.startswith("internetarchive:")
        if table_name == "internetarchive" or (is_ia_target and identifier):
            return self._run_internetarchive(first_item, identifier=identifier)

        log("download-data: unsupported target (currently only Internet Archive item pages are supported)", file=sys.stderr)
        return 1

    @staticmethod
    def _is_ia_metadata_file(f: Dict[str, Any]) -> bool:
        """Return True for entries that are item bookkeeping, not content.

        An entry counts as bookkeeping when its ``source`` is "metadata", its
        ``format`` is "metadata" or "archive bittorrent", or its ``format``
        starts with "thumbnail" (all compared case-insensitively).
        """
        try:
            source = str(f.get("source") or "").strip().lower()
            fmt = str(f.get("format") or "").strip().lower()
        except Exception:
            source = ""
            fmt = ""
        if source == "metadata":
            return True
        if fmt in {"metadata", "archive bittorrent"}:
            return True
        return fmt.startswith("thumbnail")

    @staticmethod
    def _collect_ia_files(ia_item: Any) -> List[Dict[str, Any]]:
        """Return the item's file entries as plain dicts.

        Reads the ``files`` attribute first (keeping only dict entries). When
        that yields nothing, falls back to ``get_files()`` and copies the
        ``name``/``size``/``format``/``source`` of every named entry.
        Failures are logged via ``debug`` and produce an empty list.
        """
        files: List[Dict[str, Any]] = []
        try:
            raw_files = getattr(ia_item, "files", None)
            if isinstance(raw_files, list):
                files.extend(entry for entry in raw_files if isinstance(entry, dict))
        except Exception as exc:
            debug(f"[download-data] Failed to read Internet Archive file list: {exc}")
            files = []
        if files:
            return files

        def _field(entry: Any, key: str) -> Any:
            # Entries may be objects or dicts; read every field the same way
            # so dict entries keep their size/format/source too.
            value = getattr(entry, key, None)
            if value is None and isinstance(entry, dict):
                value = entry.get(key)
            return value

        try:
            for entry in ia_item.get_files():
                name = _field(entry, "name")
                if not name:
                    continue
                files.append(
                    {
                        "name": str(name),
                        "size": _field(entry, "size"),
                        "format": _field(entry, "format"),
                        "source": _field(entry, "source"),
                    }
                )
        except Exception as exc:
            debug(f"[download-data] Failed to enumerate Internet Archive files: {exc}")
            files = []
        return files

    @staticmethod
    def _run_internetarchive(item: Any, *, identifier: str) -> int:
        """Build and register the file table for one Internet Archive item.

        Args:
            item: The piped item the request came from (may be None); used
                for the table title and as the table's subject.
            identifier: Internet Archive item identifier.

        Returns:
            0 when the table was built, 1 when the provider is unavailable,
            the lookup fails, or the item has no usable files.
        """
        if not str(identifier or "").strip():
            log("download-data: could not determine an Internet Archive identifier", file=sys.stderr)
            return 1

        try:
            from Provider.internetarchive import _ia as _ia_loader
        except Exception as exc:
            log(f"download-data: Internet Archive provider unavailable: {exc}", file=sys.stderr)
            return 1

        try:
            ia = _ia_loader()
        except Exception as exc:
            log(f"download-data: Internet Archive module unavailable: {exc}", file=sys.stderr)
            return 1

        try:
            get_item = getattr(ia, "get_item", None)
            if not callable(get_item):
                raise RuntimeError("internetarchive.get_item is not available")
            ia_item = cast(Any, get_item(str(identifier)))
        except Exception as exc:
            log(f"download-data: Internet Archive item lookup failed: {exc}", file=sys.stderr)
            return 1

        files = Download_Data._collect_ia_files(ia_item)
        if not files:
            log("download-data: Internet Archive item has no files", file=sys.stderr)
            return 1

        # Prefer non-metadata files for the picker.
        candidates = [f for f in files if not Download_Data._is_ia_metadata_file(f)]
        if not candidates:
            candidates = list(files)

        def _key(f: Dict[str, Any]) -> tuple[str, str]:
            # Group by format, then order by file name (case-insensitive).
            fmt = str(f.get("format") or "").strip().lower()
            name = str(f.get("name") or "").strip().lower()
            return (fmt, name)

        candidates.sort(key=_key)

        title = ""
        try:
            title = str(get_field(item, "title") or "").strip()
        except Exception:
            title = ""
        # Fall back to the identifier when the piped item has no title.
        table_title = f"Internet Archive: {title or identifier}".strip().rstrip(":")

        table = ResultTable(table_title).set_preserve_order(True)
        table.set_table("internetarchive.formats")
        # Selecting a row should expand to `download-file <direct-url>`.
        table.set_source_command("download-file", [])

        rows: List[Dict[str, Any]] = []
        for f in candidates:
            name = str(f.get("name") or "").strip()
            if not name:
                continue
            fmt = str(f.get("format") or "").strip()
            src = str(f.get("source") or "").strip()
            size_val: Any = f.get("size")
            try:
                size_val = int(size_val) if size_val not in (None, "") else ""
            except (TypeError, ValueError, OverflowError):
                # Keep as-is; ResultTable will stringify.
                pass
            direct_url = f"https://archive.org/download/{identifier}/{quote(name, safe='')}"
            row_item: Dict[str, Any] = {
                "table": "internetarchive",
                "title": fmt or name,
                "path": direct_url,
                "url": direct_url,
                "columns": [
                    ("Format", fmt),
                    ("Name", name),
                    ("Size", size_val),
                    ("Source", src),
                ],
                # Used by @N expansion: download-file <direct-url>
                "_selection_args": [direct_url],
                "full_metadata": {
                    "identifier": identifier,
                    "name": name,
                    "format": fmt,
                    "source": src,
                    "size": f.get("size"),
                },
            }
            rows.append(row_item)
            table.add_result(row_item)

        if not rows:
            log("download-data: no downloadable files found for this item", file=sys.stderr)
            return 1

        try:
            pipeline_context.set_last_result_table(table, rows, subject=item)
            pipeline_context.set_current_stage_table(table)
        except Exception as exc:
            debug(f"[download-data] Failed to register result table: {exc}")
        return 0
# Module-level cmdlet instance; constructing it runs Download_Data.__init__,
# which calls self.register().
CMDLET = Download_Data()