2026-01-06 16:19:29 -08:00
parent 41c11d39fd
commit edc33f4528
10 changed files with 1192 additions and 881 deletions

View File

@@ -30,6 +30,7 @@ from SYS.result_table import ResultTable
from SYS.rich_display import stderr_console as get_stderr_console
from SYS import pipeline as pipeline_context
from SYS.utils import sha256_file
from SYS.metadata import normalize_urls as normalize_url_list
from rich.prompt import Confirm
from tool.ytdlp import (
@@ -125,243 +126,6 @@ class Download_File(Cmdlet):
debug(f"[download-file] run invoked with args: {list(args)}")
return self._run_impl(result, args, config)
@staticmethod
def _normalize_urls(parsed: Dict[str, Any]) -> List[str]:
raw_url = parsed.get("url", [])
if isinstance(raw_url, str):
raw_url = [raw_url]
expanded_urls: List[str] = []
for u in raw_url or []:
if u is None:
continue
s = str(u).strip()
if not s:
continue
if "," in s:
parts = [p.strip() for p in s.split(",")]
expanded_urls.extend([p for p in parts if p])
else:
expanded_urls.append(s)
return expanded_urls
@staticmethod
def _rewrite_archive_org_urls(raw_urls: Sequence[str]) -> List[str]:
"""Rewrite Archive.org URLs using metadata JSON to pick the right flow.
- /metadata/<id>:
- if lendable (collection contains inlibrary/printdisabled/lendinglibrary) -> /borrow/<id>
- else -> /details/<id>
- /details/<id>:
- if lendable -> /borrow/<id>
This makes `download-file` do the right thing for borrow-only items.
"""
out: List[str] = []
for u in list(raw_urls or []):
s = str(u or "").strip()
if not s:
continue
try:
p = urlparse(s)
host = (p.hostname or "").strip().lower()
path = (p.path or "").strip()
except Exception:
out.append(s)
continue
if not host or (host != "archive.org" and not host.endswith(".archive.org")):
out.append(s)
continue
low_path = path.lower().strip()
if not (low_path.startswith("/metadata/") or low_path.startswith("/details/")):
out.append(s)
continue
parts = [x for x in path.split("/") if x]
if len(parts) < 2:
out.append(s)
continue
head = str(parts[0] or "").strip().lower()
archive_id = str(parts[1] or "").strip()
if head not in {"metadata", "details"} or not archive_id:
out.append(s)
continue
lendable = False
try:
meta_url = f"https://archive.org/metadata/{archive_id}"
resp = requests.get(meta_url, timeout=8)
resp.raise_for_status()
data = resp.json() if resp is not None else {}
meta = data.get("metadata", {}) if isinstance(data, dict) else {}
collection = meta.get("collection") if isinstance(meta, dict) else None
values: List[str] = []
if isinstance(collection, list):
values = [str(x).strip().lower() for x in collection if str(x).strip()]
elif isinstance(collection, str):
values = [collection.strip().lower()] if collection.strip() else []
lendable = any(v in {"inlibrary", "lendinglibrary"} for v in values)
except Exception:
lendable = False
if lendable:
debug(f"[download-file] archive.org item '{archive_id}' looks lendable; using borrow flow")
out.append(f"https://archive.org/borrow/{archive_id}")
continue
# Non-lendable: turn metadata URLs into details URLs so the IA picker can show files.
if head == "metadata":
out.append(f"https://archive.org/details/{archive_id}")
continue
out.append(s)
return out
@staticmethod
def _collect_piped_items_if_no_urls(result: Any, raw_urls: Sequence[str]) -> List[Any]:
if raw_urls:
return []
if isinstance(result, list):
return list(result)
if result:
return [result]
return []
@staticmethod
def _safe_total_items(raw_urls: Sequence[str], piped_items: Sequence[Any]) -> int:
try:
return int(len(raw_urls or []) + len(piped_items or []))
except Exception:
return 1
@staticmethod
def _build_preview(
raw_urls: Sequence[str],
piped_items: Sequence[Any],
total_items: int
) -> List[Any]:
try:
preview: List[Any] = []
preview.extend(list(raw_urls or [])[:max(0, total_items)])
if len(preview) < total_items:
preview.extend(list(piped_items or [])[:max(0, total_items - len(preview))])
return preview
except Exception:
return []
@staticmethod
def _load_provider_registry() -> Dict[str, Any]:
try:
from ProviderCore.registry import (
get_search_provider as _get_search_provider,
get_provider as _get_provider,
match_provider_name_for_url as _match_provider_name_for_url,
SearchResult as _SearchResult,
)
return {
"get_search_provider": _get_search_provider,
"get_provider": _get_provider,
"match_provider_name_for_url": _match_provider_name_for_url,
"SearchResult": _SearchResult,
}
except Exception:
return {
"get_search_provider": None,
"get_provider": None,
"match_provider_name_for_url": None,
"SearchResult": None,
}
@staticmethod
def _path_from_download_result(result_obj: Any) -> Path:
file_path = None
if hasattr(result_obj, "path"):
file_path = getattr(result_obj, "path")
elif isinstance(result_obj, dict):
file_path = result_obj.get("path")
if not file_path:
file_path = str(result_obj)
return Path(str(file_path))
def _emit_local_file(
self,
*,
downloaded_path: Path,
source: Optional[str],
title_hint: Optional[str],
tags_hint: Optional[List[str]],
media_kind_hint: Optional[str],
full_metadata: Optional[Dict[str, Any]],
progress: PipelineProgress,
config: Dict[str, Any],
provider_hint: Optional[str] = None,
) -> None:
title_val = (title_hint or downloaded_path.stem or "Unknown").strip() or downloaded_path.stem
hash_value = self._compute_file_hash(downloaded_path)
notes: Optional[Dict[str, str]] = None
try:
if isinstance(full_metadata, dict):
subtitles = full_metadata.get("_tidal_lyrics_subtitles")
if isinstance(subtitles, str) and subtitles.strip():
notes = {"lyric": subtitles}
except Exception:
notes = None
tag: List[str] = []
if tags_hint:
tag.extend([str(t) for t in tags_hint if t])
if not any(str(t).lower().startswith("title:") for t in tag):
tag.insert(0, f"title:{title_val}")
payload: Dict[str, Any] = {
"path": str(downloaded_path),
"hash": hash_value,
"title": title_val,
"action": "cmdlet:download-file",
"download_mode": "file",
"store": "local",
"media_kind": media_kind_hint or "file",
"tag": tag,
}
if provider_hint:
payload["provider"] = str(provider_hint)
if full_metadata:
payload["full_metadata"] = full_metadata
if notes:
payload["notes"] = notes
if source and str(source).startswith("http"):
payload["url"] = source
elif source:
payload["source_url"] = source
pipeline_context.emit(payload)
# When running with a local progress UI (standalone cmdlet), ensure
# the pipe advances on emit.
progress.on_emit(payload)
# Automatically register the URL with the local library
if payload.get("url"):
pipe_obj = coerce_to_pipe_object(payload)
register_url_with_local_library(pipe_obj, config)
def _process_explicit_urls(
self,
*,
@@ -373,6 +137,7 @@ class Download_File(Cmdlet):
registry: Dict[str, Any],
progress: PipelineProgress,
context_items: Sequence[Any] = (),
) -> tuple[int, Optional[int]]:
downloaded_count = 0
@@ -381,6 +146,12 @@ class Download_File(Cmdlet):
get_provider = registry.get("get_provider")
match_provider_name_for_url = registry.get("match_provider_name_for_url")
context_items_list: List[Any]
try:
context_items_list = list(context_items) if context_items else []
except Exception:
context_items_list = []
for url in raw_urls:
try:
debug(f"Processing URL: {url}")
@@ -521,14 +292,15 @@ class Download_File(Cmdlet):
if provider_name and get_provider is not None and SearchResult is not None:
# OpenLibrary URLs should be handled by the OpenLibrary provider.
if provider_name == "openlibrary":
url_str = str(url).strip()
provider = get_provider("openlibrary", config)
if provider is None:
raise DownloadError(
"OpenLibrary provider not configured or not available"
)
edition_id = ol_provider.edition_id_from_url(str(url))
title_hint = ol_provider.title_hint_from_url_slug(str(url))
edition_id = ol_provider.edition_id_from_url(url_str)
title_hint = ol_provider.title_hint_from_url_slug(url_str)
download_payload: Optional[Dict[str, Any]] = None
try:
@@ -596,9 +368,95 @@ class Download_File(Cmdlet):
progress_cb = _progress
if hasattr(provider, "download_url"):
# Prefer piped OpenLibrary context (selection row) when present so we keep
# resolved metadata like archive_id and availability.
ctx_item = None
ctx_md: Dict[str, Any] = {}
ctx_title: Optional[str] = None
ctx_tags: Optional[List[str]] = None
ctx_media_kind: Optional[str] = None
for candidate in context_items_list:
try:
table_val = get_field(candidate, "table")
except Exception:
table_val = None
if str(table_val or "").lower() != "openlibrary":
continue
md_val = get_field(candidate, "full_metadata")
md_dict = md_val if isinstance(md_val, dict) else {}
cand_olid = str(md_dict.get("openlibrary_id") or md_dict.get("olid") or "").strip()
cand_archive = str(md_dict.get("archive_id") or "").strip()
cand_url = str(
get_field(candidate, "path")
or get_field(candidate, "url")
or md_dict.get("selection_url")
or ""
).strip()
matched = False
if edition_id and cand_olid and cand_olid == edition_id:
matched = True
elif cand_url and url_str and cand_url == url_str:
matched = True
elif (not edition_id) and cand_archive and cand_archive in url_str:
matched = True
if matched:
ctx_item = candidate
ctx_md = md_dict
ctx_title = get_field(candidate, "title")
ctx_media_kind = get_field(candidate, "media_kind")
tags_val = get_field(candidate, "tag")
if isinstance(tags_val, list):
ctx_tags = [str(t) for t in tags_val if t]
break
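# A matching piped row might be shaped like this (hypothetical values):
#   {"table": "openlibrary", "title": "Example Book", "media_kind": "book",
#    "full_metadata": {"openlibrary_id": "OL123M", "archive_id": "examplebook"},
#    "path": "https://openlibrary.org/books/OL123M/Example_Book"}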
if ctx_item is not None and SearchResult is not None:
sr_meta = dict(ctx_md) if isinstance(ctx_md, dict) else {}
if edition_id and not sr_meta.get("openlibrary_id"):
sr_meta["openlibrary_id"] = edition_id
sr_title = str(ctx_title or title_hint or "").strip() or title_hint
sr_media_kind = str(ctx_media_kind or "book")
sr_obj = (
ctx_item
if isinstance(ctx_item, SearchResult)
else SearchResult(
table="openlibrary",
title=sr_title,
path=url_str,
media_kind=sr_media_kind,
full_metadata=sr_meta,
)
)
try:
sr_obj.path = url_str # type: ignore[attr-defined]
except Exception:
pass
try:
if ctx_tags:
sr_obj.tag = set(ctx_tags) # type: ignore[attr-defined]
except Exception:
pass
downloaded_path = provider.download(
sr_obj,
final_output_dir,
progress_callback=progress_cb
) # type: ignore[call-arg]
if downloaded_path:
download_payload = {
"path": Path(downloaded_path),
"search_result": sr_obj,
}
if download_payload is None and hasattr(provider, "download_url"):
download_payload = provider.download_url( # type: ignore[attr-defined]
str(url),
url_str,
final_output_dir,
progress_cb,
)
@@ -606,12 +464,12 @@ class Download_File(Cmdlet):
if download_payload is None:
sr = None
if hasattr(provider, "search_result_from_url"):
sr = provider.search_result_from_url(str(url)) # type: ignore[attr-defined]
sr = provider.search_result_from_url(url_str) # type: ignore[attr-defined]
if sr is None:
sr = SearchResult(
table="openlibrary",
title=title_hint,
path=str(url),
path=url_str,
media_kind="book",
full_metadata={
"openlibrary_id": edition_id,
@@ -811,6 +669,97 @@ class Download_File(Cmdlet):
downloaded_count += 1
continue
if provider_name and get_provider is not None and SearchResult is not None:
provider = get_provider(provider_name, config)
if provider is not None and hasattr(provider, "download_url"):
try:
downloaded_path = provider.download_url(
str(url),
final_output_dir
) # type: ignore[attr-defined]
except Exception as exc:
raise DownloadError(str(exc))
if downloaded_path:
self._emit_local_file(
downloaded_path=Path(downloaded_path),
source=str(url),
title_hint=Path(str(downloaded_path)).stem,
tags_hint=None,
media_kind_hint="file",
full_metadata=None,
provider_hint=str(provider_name),
progress=progress,
config=config,
)
downloaded_count += 1
continue
if provider is not None:
sr_obj = None
try:
sr_obj = SearchResult(
table=str(provider_name),
title=str(url),
path=str(url),
full_metadata={},
)
downloaded_path = provider.download(
sr_obj,
final_output_dir
) # type: ignore[call-arg]
except Exception:
downloaded_path = None
if not downloaded_path and str(provider_name).lower() == "libgen":
raise DownloadError(
"LibGen URL did not resolve to a downloadable file"
)
if downloaded_path:
emit_tags: Optional[List[str]] = None
full_md: Optional[Dict[str, Any]] = None
title_hint = Path(str(downloaded_path)).stem
media_kind_hint = "file"
if str(provider_name).lower() == "libgen" and sr_obj is not None:
media_kind_hint = "book"
try:
sr_tags = getattr(sr_obj, "tag", None)
if isinstance(sr_tags, set) and sr_tags:
emit_tags = sorted(
[str(t) for t in sr_tags if t]
)
except Exception:
emit_tags = None
try:
sr_full_md = getattr(sr_obj, "full_metadata", None)
if isinstance(sr_full_md, dict):
full_md = sr_full_md
t = str(sr_full_md.get("title") or "").strip()
if t:
title_hint = t
except Exception:
full_md = None
self._emit_local_file(
downloaded_path=Path(downloaded_path),
source=str(url),
title_hint=title_hint,
tags_hint=emit_tags,
media_kind_hint=media_kind_hint,
full_metadata=full_md,
provider_hint=str(provider_name),
progress=progress,
config=config,
)
downloaded_count += 1
continue
result_obj = _download_direct_file(
str(url),
final_output_dir,
@@ -1237,6 +1186,170 @@ class Download_File(Cmdlet):
return downloaded_count, queued_magnet_submissions
@staticmethod
def _path_from_download_result(result_obj: Any) -> Path:
file_path = None
if hasattr(result_obj, "path"):
file_path = getattr(result_obj, "path")
elif isinstance(result_obj, dict):
file_path = result_obj.get("path")
if not file_path:
file_path = str(result_obj)
return Path(str(file_path))
def _emit_local_file(
self,
*,
downloaded_path: Path,
source: Optional[str],
title_hint: Optional[str],
tags_hint: Optional[List[str]],
media_kind_hint: Optional[str],
full_metadata: Optional[Dict[str, Any]],
progress: PipelineProgress,
config: Dict[str, Any],
provider_hint: Optional[str] = None,
) -> None:
title_val = (title_hint or downloaded_path.stem or "Unknown").strip() or downloaded_path.stem
hash_value = self._compute_file_hash(downloaded_path)
notes: Optional[Dict[str, str]] = None
try:
if isinstance(full_metadata, dict):
subtitles = full_metadata.get("_tidal_lyrics_subtitles")
if isinstance(subtitles, str) and subtitles.strip():
notes = {"lyric": subtitles}
except Exception:
notes = None
tag: List[str] = []
if tags_hint:
tag.extend([str(t) for t in tags_hint if t])
if not any(str(t).lower().startswith("title:") for t in tag):
tag.insert(0, f"title:{title_val}")
payload: Dict[str, Any] = {
"path": str(downloaded_path),
"hash": hash_value,
"title": title_val,
"action": "cmdlet:download-file",
"download_mode": "file",
"store": "local",
"media_kind": media_kind_hint or "file",
"tag": tag,
}
if provider_hint:
payload["provider"] = str(provider_hint)
if full_metadata:
payload["full_metadata"] = full_metadata
if notes:
payload["notes"] = notes
if source and str(source).startswith("http"):
payload["url"] = source
elif source:
payload["source_url"] = source
pipeline_context.emit(payload)
@staticmethod
def _normalize_urls(parsed: Dict[str, Any]) -> List[str]:
urls: List[str] = []
url_value: Any = None
if isinstance(parsed, dict):
url_value = parsed.get("url")
try:
urls = normalize_url_list(url_value)
except Exception:
urls = []
if not urls and isinstance(parsed, dict):
query_val = parsed.get("query")
try:
if isinstance(query_val, str) and query_val.strip().lower().startswith("url:"):
urls = normalize_url_list(query_val)
except Exception:
pass
return urls
@staticmethod
def _collect_piped_items_if_no_urls(result: Any, raw_url: Sequence[str]) -> List[Any]:
if raw_url:
return []
if result is None:
return []
if isinstance(result, list):
return list(result)
return [result]
@staticmethod
def _load_provider_registry() -> Dict[str, Any]:
"""Lightweight accessor for provider helpers without hard dependencies."""
try:
from ProviderCore import registry as provider_registry # type: ignore
from ProviderCore.base import SearchResult # type: ignore
return {
"get_provider": getattr(provider_registry, "get_provider", None),
"get_search_provider": getattr(provider_registry, "get_search_provider", None),
"match_provider_name_for_url": getattr(provider_registry, "match_provider_name_for_url", None),
"SearchResult": SearchResult,
}
except Exception:
return {
"get_provider": None,
"get_search_provider": None,
"match_provider_name_for_url": None,
"SearchResult": None,
}
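# Hypothetical usage sketch: callers guard every entry, since each value may
# be None when ProviderCore is unavailable:
#   registry = Download_File._load_provider_registry()
#   get_provider = registry.get("get_provider")
#   if get_provider is not None:
#       provider = get_provider("openlibrary", config)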
@staticmethod
def _safe_total_items(raw_url: Sequence[str], piped_items: Sequence[Any]) -> int:
"""Return a sane item count for progress display."""
try:
url_count = len(raw_url or [])
except Exception:
url_count = 0
try:
piped_count = len(piped_items or [])
except Exception:
piped_count = 0
total = url_count + piped_count
return total if total > 0 else 1
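# For example: two URLs plus one piped item -> 3, while no input at all -> 1,
# so the progress display never reports a zero-item pipeline.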
@staticmethod
def _build_preview(raw_url: Sequence[str], piped_items: Sequence[Any], total_items: int) -> List[str]:
"""Construct a short preview list for the local progress UI."""
preview: List[str] = []
try:
for url in raw_url or []:
if len(preview) >= 5:
break
preview.append(str(url))
except Exception:
pass
if len(preview) < 5:
try:
items = piped_items if isinstance(piped_items, list) else list(piped_items or [])
except Exception:
items = []
for item in items:
if len(preview) >= 5:
break
try:
label = get_field(item, "title") or get_field(item, "path") or get_field(item, "url")
except Exception:
label = None
if label:
preview.append(str(label))
# If we still have nothing, supply a generic placeholder to avoid empty previews.
if not preview and total_items:
preview.append(f"{total_items} item(s)")
return preview
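# Sketch of a resulting preview (hypothetical values), capped at five labels:
#   ["https://a.example/1", "https://b.example/2", "Some Title", "/tmp/file.bin"]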
# === Streaming helpers (yt-dlp) ===
@staticmethod
@@ -3531,7 +3644,6 @@ class Download_File(Cmdlet):
parsed = parse_cmdlet_args(args, self)
raw_url = self._normalize_urls(parsed)
raw_url = self._rewrite_archive_org_urls(raw_url)
piped_items = self._collect_piped_items_if_no_urls(result, raw_url)
had_piped_input = False
@@ -3660,6 +3772,7 @@ class Download_File(Cmdlet):
quiet_mode=quiet_mode,
registry=registry,
progress=progress,
context_items=(result if isinstance(result, list) else ([result] if result else [])),
)
downloaded_count += int(urls_downloaded)
if early_exit is not None:

View File

@@ -14,15 +14,12 @@ import sys
from SYS.logger import log, debug
try:
from Provider.openlibrary import OpenLibrary
_ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata
_ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata
except Exception:
_ol_scrape_isbn_metadata = None # type: ignore[assignment]
_ol_scrape_openlibrary_metadata = None # type: ignore[assignment]
from Provider.metadata_provider import get_metadata_provider, list_metadata_providers
from Provider.metadata_provider import (
get_metadata_provider,
list_metadata_providers,
scrape_isbn_metadata,
scrape_openlibrary_metadata,
)
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple
@@ -270,9 +267,6 @@ def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]:
return candidates[0] if candidates else None
_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment]
_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment]
# Tag item for ResultTable display and piping
from dataclasses import dataclass
@@ -1039,22 +1033,16 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]:
def _scrape_isbn_metadata(isbn: str) -> List[str]:
if _ol_scrape_isbn_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
return list(_ol_scrape_isbn_metadata(isbn))
return list(scrape_isbn_metadata(isbn))
except Exception as e:
log(f"ISBN scraping error: {e}", file=sys.stderr)
return []
def _scrape_openlibrary_metadata(olid: str) -> List[str]:
if _ol_scrape_openlibrary_metadata is None:
log("OpenLibrary scraper unavailable", file=sys.stderr)
return []
try:
return list(_ol_scrape_openlibrary_metadata(olid))
return list(scrape_openlibrary_metadata(olid))
except Exception as e:
log(f"OpenLibrary scraping error: {e}", file=sys.stderr)
return []
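# Hypothetical usage: both wrappers swallow provider errors and return [], so
# callers can extend tag lists without their own try/except:
#   tags = _scrape_isbn_metadata("9780140328721")       # example ISBN
#   tags += _scrape_openlibrary_metadata("OL7353617M")  # example OLID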