This commit is contained in:
nose
2025-12-16 23:23:43 -08:00
parent 9873280f0e
commit 86918f2ae2
46 changed files with 2277 additions and 1347 deletions

View File

@@ -244,6 +244,8 @@ class HTTPClient:
self,
method: str,
url: str,
raise_for_status: bool = True,
log_http_errors: bool = True,
**kwargs
) -> httpx.Response:
"""
@@ -273,6 +275,7 @@ class HTTPClient:
for attempt in range(self.retries):
try:
response = self._client.request(method, url, **kwargs)
if raise_for_status:
response.raise_for_status()
return response
except httpx.TimeoutException as e:
@@ -287,6 +290,7 @@ class HTTPClient:
response_text = e.response.text[:500]
except:
response_text = "<unable to read response>"
if log_http_errors:
logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
raise
last_exception = e

View File

@@ -71,6 +71,7 @@ class HydrusNetwork:
url: str
access_key: str = ""
timeout: float = 60.0
instance_name: str = "" # Optional store name (e.g., 'home') for namespaced logs
scheme: str = field(init=False)
hostname: str = field(init=False)
@@ -90,6 +91,12 @@ class HydrusNetwork:
self.port = parsed.port or (443 if self.scheme == "https" else 80)
self.base_path = parsed.path.rstrip("/")
self.access_key = self.access_key or ""
self.instance_name = str(self.instance_name or "").strip()
def _log_prefix(self) -> str:
if self.instance_name:
return f"[hydrusnetwork:{self.instance_name}]"
return f"[hydrusnetwork:{self.hostname}:{self.port}]"
# ------------------------------------------------------------------
# low-level helpers
@@ -120,7 +127,7 @@ class HydrusNetwork:
url = f"{self.scheme}://{self.hostname}:{self.port}{path}"
# Log request details
logger.debug(f"[Hydrus] {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")
logger.debug(f"{self._log_prefix()} {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")
status = 0
reason = ""
@@ -135,14 +142,14 @@ class HydrusNetwork:
file_path = Path(spec.file_path)
if not file_path.is_file():
error_msg = f"Upload file not found: {file_path}"
logger.error(f"[Hydrus] {error_msg}")
logger.error(f"{self._log_prefix()} {error_msg}")
raise FileNotFoundError(error_msg)
file_size = file_path.stat().st_size
headers["Content-Type"] = spec.content_type or "application/octet-stream"
headers["Content-Length"] = str(file_size)
logger.debug(f"[Hydrus] Uploading file {file_path.name} ({file_size} bytes)")
logger.debug(f"{self._log_prefix()} Uploading file {file_path.name} ({file_size} bytes)")
def file_gen():
with file_path.open("rb") as handle:
@@ -153,7 +160,9 @@ class HydrusNetwork:
spec.method,
url,
content=file_gen(),
headers=headers
headers=headers,
raise_for_status=False,
log_http_errors=False,
)
else:
content = None
@@ -163,14 +172,16 @@ class HydrusNetwork:
content = spec.data
else:
json_data = spec.data
logger.debug(f"[Hydrus] Request body size: {len(content) if content else 'json'}")
logger.debug(f"{self._log_prefix()} Request body size: {len(content) if content else 'json'}")
response = client.request(
spec.method,
url,
content=content,
json=json_data,
headers=headers
headers=headers,
raise_for_status=False,
log_http_errors=False,
)
status = response.status_code
@@ -178,20 +189,14 @@ class HydrusNetwork:
body = response.content
content_type = response.headers.get("Content-Type", "") or ""
logger.debug(f"[Hydrus] Response {status} {reason} ({len(body)} bytes)")
logger.debug(f"{self._log_prefix()} Response {status} {reason} ({len(body)} bytes)")
except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError) as exc:
msg = f"Hydrus unavailable: {exc}"
logger.warning(f"[Hydrus] {msg}")
logger.warning(f"{self._log_prefix()} {msg}")
raise HydrusConnectionError(msg) from exc
except httpx.HTTPStatusError as exc:
response = exc.response
status = response.status_code
reason = response.reason_phrase
body = response.content
content_type = response.headers.get("Content-Type", "") or ""
except Exception as exc:
logger.error(f"[Hydrus] Connection error: {exc}", exc_info=True)
logger.error(f"{self._log_prefix()} Connection error: {exc}", exc_info=True)
raise
payload: Any
@@ -220,18 +225,22 @@ class HydrusNetwork:
else:
message = reason or "HTTP error"
logger.error(f"[Hydrus] HTTP {status}: {message}")
# Some endpoints are naturally "missing" sometimes and should not spam logs.
if status == 404 and spec.endpoint.rstrip("/") == "/get_files/file_path":
return {}
logger.error(f"{self._log_prefix()} HTTP {status}: {message}")
# Handle expired session key (419) by clearing cache and retrying once
if status == 419 and self._session_key and "session" in message.lower():
logger.warning(f"[Hydrus] Session key expired, acquiring new one and retrying...")
logger.warning(f"{self._log_prefix()} Session key expired, acquiring new one and retrying...")
self._session_key = "" # Clear expired session key
try:
self._acquire_session_key()
# Retry the request with new session key
return self._perform_request(spec)
except Exception as retry_error:
logger.error(f"[Hydrus] Retry failed: {retry_error}", exc_info=True)
logger.error(f"{self._log_prefix()} Retry failed: {retry_error}", exc_info=True)
# If retry fails, raise the original error
raise HydrusRequestError(status, message, payload) from retry_error
@@ -316,6 +325,16 @@ class HydrusNetwork:
def add_file(self, file_path: Path) -> dict[str, Any]:
return self._post("/add_files/add_file", file_path=file_path)
def undelete_files(self, hashes: Union[str, Iterable[str]]) -> dict[str, Any]:
"""Restore files from Hydrus trash back into 'my files'.
Hydrus Client API: POST /add_files/undelete_files
Required JSON args: {"hashes": [<sha256 hex>, ...]}
"""
hash_list = self._ensure_hashes(hashes)
body = {"hashes": hash_list}
return self._post("/add_files/undelete_files", data=body)
def add_tag(self, hash: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]:
hash = self._ensure_hashes(hash)
body = {"hashes": hash, "service_names_to_tags": {service_name: list(tags)}}

39
CLI.py
View File

@@ -68,7 +68,7 @@ from typing import Callable
from config import get_local_storage_path, load_config
from cmdlet.catalog import (
from cmdlet_catalog import (
import_cmd_module as _catalog_import_cmd_module,
list_cmdlet_metadata as _catalog_list_cmdlet_metadata,
list_cmdlet_names as _catalog_list_cmdlet_names,
@@ -305,8 +305,6 @@ def _get_table_title_for_command(
'add_file': 'Results',
'delete-file': 'Results',
'delete_file': 'Results',
'check-file-status': 'Status',
'check_file_status': 'Status',
'get-metadata': None,
'get_metadata': None,
}
@@ -843,10 +841,6 @@ def _create_cmdlet_cli():
# Load config
config = _load_cli_config()
# Initialize cookies check for yt-dlp
from hydrus_health_check import initialize_cookies_check
initialize_cookies_check(config, emit_debug=False)
# Initialize debug logging if enabled
if config:
from SYS.logger import set_debug
@@ -991,8 +985,6 @@ def _create_cmdlet_cli():
# Run startup checks and render table
try:
from hydrus_health_check import initialize_cookies_check
# MPV availability is validated by MPV.MPV.__init__.
try:
from MPV.mpv_ipc import MPV
@@ -1294,8 +1286,13 @@ def _create_cmdlet_cli():
# Cookies are used by yt-dlp; keep this centralized utility.
try:
ok, detail = initialize_cookies_check(config, emit_debug=False)
_add_startup_check("FOUND" if ok else "MISSING", "Cookies", "N/A", detail or "Not found")
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config).resolve_cookiefile()
if cookiefile is not None:
_add_startup_check("FOUND", "Cookies", "N/A", str(cookiefile))
else:
_add_startup_check("MISSING", "Cookies", "N/A", "Not found")
except Exception as exc:
_add_startup_check("ERROR", "Cookies", "N/A", str(exc))
@@ -1580,10 +1577,11 @@ def _execute_pipeline(tokens: list):
hash_val = getattr(item, 'hash', getattr(item, 'hash_hex', 'N/A'))
title_val = getattr(item, 'title', 'N/A')
if hash_val != 'N/A':
hash_display = hash_val[:8] + '...' if len(str(hash_val)) > 8 else hash_val
print(f" -> hash={hash_display}, title={title_val}")
hash_display = str(hash_val)
title_display = str(title_val)
print(f" -> hash:{hash_display}, title:{title_display}")
else:
print(f" -> title={title_val}")
print(f" -> title:{title_val}")
else:
print(" -> [source_index out of range]")
if resolved_list is not None:
@@ -2143,14 +2141,14 @@ def _execute_pipeline(tokens: list):
display_only_commands = {
'get-note', 'get_note',
'get-relationship', 'get_relationship', 'get-file', 'get_file',
'check-file-status', 'check_file_status'
}
# Commands that manage their own table/history state (e.g. get-tag)
self_managing_commands = {
'get-tag', 'get_tag', 'tags',
'get-url', 'get_url',
'search-file', 'search_file',
'search-provider', 'search_provider'
'search-provider', 'search_provider',
'search-store', 'search_store'
}
overlay_table = ctx.get_display_table() if hasattr(ctx, 'get_display_table') else None
@@ -2382,7 +2380,7 @@ def _execute_cmdlet(cmd_name: str, args: list):
# Ensure native commands (cmdnat) are loaded
try:
from cmdlet.catalog import ensure_registry_loaded as _ensure_registry_loaded
from cmdlet_catalog import ensure_registry_loaded as _ensure_registry_loaded
_ensure_registry_loaded()
except Exception:
pass
@@ -2391,7 +2389,7 @@ def _execute_cmdlet(cmd_name: str, args: list):
cmd_fn = REGISTRY.get(cmd_name)
if not cmd_fn:
# Attempt lazy import of the module and retry
from cmdlet.catalog import import_cmd_module as _catalog_import
from cmdlet_catalog import import_cmd_module as _catalog_import
try:
mod = _catalog_import(cmd_name)
data = getattr(mod, "CMDLET", None) if mod else None
@@ -2537,13 +2535,13 @@ def _execute_cmdlet(cmd_name: str, args: list):
display_only_commands = {
'get-url', 'get_url', 'get-note', 'get_note',
'get-relationship', 'get_relationship', 'get-file', 'get_file',
'check-file-status', 'check_file_status'
}
# Commands that manage their own table/history state (e.g. get-tag)
self_managing_commands = {
'get-tag', 'get_tag', 'tags',
'search-file', 'search_file',
'search-provider', 'search_provider'
'search-provider', 'search_provider',
'search-store', 'search_store'
}
if cmd_name in self_managing_commands:
@@ -2596,7 +2594,6 @@ def _execute_cmdlet(cmd_name: str, args: list):
display_only_commands = {
'get-url', 'get_url', 'get-note', 'get_note',
'get-relationship', 'get_relationship', 'get-file', 'get_file',
'check-file-status', 'check_file_status'
}
self_managing_commands = {
'get-tag', 'get_tag', 'tags',

View File

@@ -15,11 +15,11 @@ from SYS.logger import log
from models import ProgressBar
# Optional dependencies
# Optional dependency for HTML scraping fallbacks
try:
from bs4 import BeautifulSoup
from lxml import html as lxml_html
except ImportError:
BeautifulSoup = None
lxml_html = None
class Libgen(SearchProvider):
@@ -116,7 +116,7 @@ class Libgen(SearchProvider):
return []
def validate(self) -> bool:
# JSON-based searching can work without BeautifulSoup; HTML parsing is a fallback.
# JSON-based searching can work without lxml; HTML parsing is a fallback.
return True
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
@@ -342,8 +342,8 @@ class LibgenSearch:
Uses a total time budget across mirrors to avoid long hangs.
"""
# Prefer JSON API (no BeautifulSoup needed); HTML scraping is a fallback.
has_bs4 = BeautifulSoup is not None
# Prefer JSON API (no lxml needed); HTML scraping is a fallback.
has_lxml = lxml_html is not None
started = time.monotonic()
@@ -372,7 +372,7 @@ class LibgenSearch:
results = []
if not results:
if not has_bs4:
if not has_lxml:
continue
if "libgen.li" in mirror or "libgen.gl" in mirror:
@@ -417,57 +417,73 @@ class LibgenSearch:
resp = self.session.get(url, params=params, timeout=timeout)
resp.raise_for_status()
if BeautifulSoup is None:
if lxml_html is None:
return []
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"class": "c"})
if not table:
tables = soup.find_all("table")
for t in tables:
if len(t.find_all("tr")) > 5:
def _text(el: Any) -> str:
return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip()
try:
doc = lxml_html.fromstring(resp.content)
except Exception:
return []
table_nodes = doc.xpath(
"//table[contains(concat(' ', normalize-space(@class), ' '), ' c ')]"
)
table = table_nodes[0] if table_nodes else None
if table is None:
for t in doc.xpath("//table"):
if len(t.xpath(".//tr")) > 5:
table = t
break
if not table:
if table is None:
return []
results: List[Dict[str, Any]] = []
rows = table.find_all("tr")[1:]
rows = table.xpath(".//tr")[1:]
for row in rows:
cols = row.find_all("td")
cols = row.xpath("./td")
if len(cols) < 9:
continue
try:
libgen_id = cols[0].get_text(strip=True)
authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
if not authors:
authors = [cols[1].get_text(strip=True)]
libgen_id = _text(cols[0])
title_tag = cols[2].find("a")
title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)
author_links = cols[1].xpath(".//a")
authors = [_text(a) for a in author_links if _text(a)]
if not authors:
authors = [_text(cols[1])]
title_tag = None
title_links = cols[2].xpath(".//a")
if title_links:
title_tag = title_links[0]
title = _text(title_tag) if title_tag is not None else _text(cols[2])
md5 = ""
if title_tag and title_tag.has_attr("href"):
if title_tag is not None:
href = str(title_tag.get("href") or "")
match = re.search(r"md5=([a-fA-F0-9]{32})", href)
if match:
md5 = match.group(1)
publisher = cols[3].get_text(strip=True)
year = cols[4].get_text(strip=True)
pages = cols[5].get_text(strip=True)
language = cols[6].get_text(strip=True)
size = cols[7].get_text(strip=True)
extension = cols[8].get_text(strip=True)
publisher = _text(cols[3])
year = _text(cols[4])
pages = _text(cols[5])
language = _text(cols[6])
size = _text(cols[7])
extension = _text(cols[8])
mirror_links = []
mirror_links: List[str] = []
for i in range(9, len(cols)):
a = cols[i].find("a")
if a and a.has_attr("href"):
mirror_links.append(a["href"])
a_nodes = cols[i].xpath(".//a[@href]")
if a_nodes:
href = str(a_nodes[0].get("href") or "").strip()
if href:
mirror_links.append(href)
if md5:
download_link = f"http://library.lol/main/{md5}"
@@ -476,10 +492,11 @@ class LibgenSearch:
else:
download_link = ""
results.append({
results.append(
{
"id": libgen_id,
"title": title,
"author": ", ".join(authors),
"author": ", ".join([a for a in authors if a]) or "Unknown",
"publisher": publisher,
"year": year,
"pages": pages,
@@ -489,11 +506,11 @@ class LibgenSearch:
"md5": md5,
"mirror_url": download_link,
"cover": "",
})
}
)
if len(results) >= limit:
break
except Exception as e:
logging.debug(f"Error parsing row: {e}")
continue
@@ -521,21 +538,35 @@ class LibgenSearch:
resp = self.session.get(url, params=params, timeout=timeout)
resp.raise_for_status()
if BeautifulSoup is None:
if lxml_html is None:
return []
soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"id": "tablelibgen"})
if not table:
table = soup.find("table", {"class": "table table-striped"})
if not table:
def _text(el: Any) -> str:
return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip()
try:
doc = lxml_html.fromstring(resp.content)
except Exception:
return []
table_nodes = doc.xpath("//table[@id='tablelibgen']")
table = table_nodes[0] if table_nodes else None
if table is None:
# Common libgen.li/gl fallback
table_nodes = doc.xpath(
"//table[contains(concat(' ', normalize-space(@class), ' '), ' table ') and "
"contains(concat(' ', normalize-space(@class), ' '), ' table-striped ')]"
)
table = table_nodes[0] if table_nodes else None
if table is None:
return []
results: List[Dict[str, Any]] = []
rows = table.find_all("tr")[1:]
rows = table.xpath(".//tr")[1:]
for row in rows:
cols = row.find_all("td")
cols = row.xpath("./td")
if len(cols) < 9:
continue
@@ -543,26 +574,30 @@ class LibgenSearch:
# Extract md5 (libgen.gl exposes /ads.php?md5=... in mirror column)
md5 = ""
mirror_url = ""
for a in row.find_all("a"):
href = a.get("href")
for a in row.xpath(".//a[@href]"):
href = str(a.get("href") or "")
if not href:
continue
m = re.search(r"md5=([a-fA-F0-9]{32})", str(href))
m = re.search(r"md5=([a-fA-F0-9]{32})", href)
if m:
md5 = m.group(1)
if "ads.php" in str(href):
mirror_url = urljoin(mirror, str(href))
if "ads.php" in href:
mirror_url = urljoin(mirror, href)
break
if not mirror_url and md5:
mirror_url = urljoin(mirror, f"/ads.php?md5={md5}")
# Extract numeric file id from /file.php?id=...
libgen_id = ""
file_link = row.find("a", href=re.compile(r"/file\.php\?id=\d+"))
if file_link and file_link.get("href"):
m = re.search(r"id=(\d+)", str(file_link.get("href")))
for a in row.xpath(".//a[@href]"):
href = str(a.get("href") or "")
if not href:
continue
if re.search(r"/file\.php\?id=\d+", href):
m = re.search(r"id=(\d+)", href)
if m:
libgen_id = m.group(1)
break
title = ""
authors = ""
@@ -585,7 +620,7 @@ class LibgenSearch:
if offset is not None:
meta_cell = cols[offset]
meta_text = " ".join([str(s).strip() for s in meta_cell.stripped_strings if str(s).strip()])
meta_text = _text(meta_cell)
# Extract ISBNs from meta cell (avoid using them as title)
# Matches 10 or 13-digit ISBN with optional leading 978/979.
@@ -601,11 +636,11 @@ class LibgenSearch:
# Choose a "real" title from meta cell.
# libgen.gl meta can include series/edition/isbn blobs; prefer text with letters.
raw_candidates: List[str] = []
for a in meta_cell.find_all("a"):
t = a.get_text(" ", strip=True)
for a in meta_cell.xpath(".//a"):
t = _text(a)
if t:
raw_candidates.append(t)
for s in meta_cell.stripped_strings:
for s in meta_cell.itertext():
t = str(s).strip()
if t:
raw_candidates.append(t)
@@ -645,27 +680,27 @@ class LibgenSearch:
best_score = score
best_title = cand
title = best_title or meta_cell.get_text(" ", strip=True)
title = best_title or _text(meta_cell)
authors = cols[offset + 1].get_text(" ", strip=True)
publisher = cols[offset + 2].get_text(" ", strip=True)
year = cols[offset + 3].get_text(" ", strip=True)
language = cols[offset + 4].get_text(" ", strip=True)
pages = cols[offset + 5].get_text(" ", strip=True)
size = cols[offset + 6].get_text(" ", strip=True)
extension = cols[offset + 7].get_text(" ", strip=True)
authors = _text(cols[offset + 1])
publisher = _text(cols[offset + 2])
year = _text(cols[offset + 3])
language = _text(cols[offset + 4])
pages = _text(cols[offset + 5])
size = _text(cols[offset + 6])
extension = _text(cols[offset + 7])
else:
# Older fallback structure
title_col = cols[1]
title_link = title_col.find("a")
title = title_link.get_text(" ", strip=True) if title_link else title_col.get_text(" ", strip=True)
authors = cols[2].get_text(" ", strip=True)
publisher = cols[3].get_text(" ", strip=True)
year = cols[4].get_text(" ", strip=True)
language = cols[5].get_text(" ", strip=True)
pages = cols[6].get_text(" ", strip=True)
size = cols[7].get_text(" ", strip=True)
extension = cols[8].get_text(" ", strip=True)
title_links = title_col.xpath(".//a")
title = _text(title_links[0]) if title_links else _text(title_col)
authors = _text(cols[2])
publisher = _text(cols[3])
year = _text(cols[4])
language = _text(cols[5])
pages = _text(cols[6])
size = _text(cols[7])
extension = _text(cols[8])
title = (title or "").strip() or "Unknown"
authors = (authors or "").strip() or "Unknown"
@@ -729,15 +764,49 @@ def _resolve_download_url(
current_url = url
visited = set()
if BeautifulSoup is None:
_call(log_info, "[resolve] BeautifulSoup not available; cannot resolve HTML download chain")
def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]:
"""Best-effort HTML link resolver without lxml.
This is intentionally minimal: it primarily targets LibGen landing pages like
`/ads.php?md5=...` which contain a `get.php?md5=...` link.
"""
if not html:
return None
def _find_a_by_text(pattern: str) -> Optional[Any]:
for a in soup.find_all("a"):
t = a.get_text(" ", strip=True)
# Prefer explicit get.php md5 links (most common successful chain).
m = re.search(r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', html, flags=re.IGNORECASE)
if m:
href = str(m.group(1) or "").strip()
if href and not href.lower().startswith("javascript:"):
return urljoin(base_url, href)
# Next: library.lol main links.
m = re.search(r'href=["\']([^"\']*library\.lol[^"\']*)["\']', html, flags=re.IGNORECASE)
if m:
href = str(m.group(1) or "").strip()
if href and not href.lower().startswith("javascript:"):
return urljoin(base_url, href)
# Finally: any direct file extension link.
m = re.search(
r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']',
html,
flags=re.IGNORECASE,
)
if m:
href = str(m.group(1) or "").strip()
if href and not href.lower().startswith("javascript:"):
return urljoin(base_url, href)
return None
def _find_href_by_text(doc: Any, pattern: str) -> Optional[str]:
for a in doc.xpath("//a[@href]"):
t = " ".join([s.strip() for s in a.itertext() if s and str(s).strip()]).strip()
if t and re.search(pattern, t, re.IGNORECASE):
return a
href = str(a.get("href") or "").strip()
if href and not href.lower().startswith("javascript:"):
return href
return None
for _ in range(6):
@@ -763,42 +832,58 @@ def _resolve_download_url(
_call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
return None
soup = BeautifulSoup(content, "html.parser")
doc = None
if lxml_html is not None:
try:
doc = lxml_html.fromstring(content)
except Exception:
doc = None
get_link = _find_a_by_text(r"^GET$")
if get_link and get_link.has_attr("href"):
return urljoin(current_url, str(get_link.get("href") or ""))
if doc is None:
next_url = _resolve_html_links_regex(current_url, content)
if next_url:
current_url = next_url
continue
_call(log_info, "[resolve] lxml not available and regex resolver found no links")
return None
get_href = _find_href_by_text(doc, r"^GET$")
if get_href:
return urljoin(current_url, get_href)
if "series.php" in current_url:
edition_link = soup.find("a", href=re.compile(r"edition\.php"))
if edition_link:
current_url = urljoin(current_url, str(edition_link.get("href") or ""))
hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
if hrefs:
current_url = urljoin(current_url, str(hrefs[0] or ""))
continue
if "edition.php" in current_url:
file_link = soup.find("a", href=re.compile(r"file\.php"))
if file_link:
current_url = urljoin(current_url, str(file_link.get("href") or ""))
hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
if hrefs:
current_url = urljoin(current_url, str(hrefs[0] or ""))
continue
if "file.php" in current_url:
libgen_link = soup.find("a", title="libgen")
if not libgen_link:
libgen_link = _find_a_by_text(r"Libgen")
if libgen_link and libgen_link.has_attr("href"):
current_url = urljoin(current_url, str(libgen_link.get("href") or ""))
libgen_href = None
for a in doc.xpath("//a[@href]"):
if str(a.get("title") or "").strip().lower() == "libgen":
libgen_href = str(a.get("href") or "").strip()
break
if not libgen_href:
libgen_href = _find_href_by_text(doc, r"Libgen")
if libgen_href:
current_url = urljoin(current_url, libgen_href)
continue
if "ads.php" in current_url:
get_php_link = soup.find("a", href=re.compile(r"get\.php"))
if get_php_link:
return urljoin(current_url, str(get_php_link.get("href") or ""))
hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
if hrefs:
return urljoin(current_url, str(hrefs[0] or ""))
for text in ["Cloudflare", "IPFS.io", "Infura"]:
link = _find_a_by_text(re.escape(text))
if link and link.has_attr("href"):
return urljoin(current_url, str(link.get("href") or ""))
href = _find_href_by_text(doc, re.escape(text))
if href:
return urljoin(current_url, href)
break

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import base64
import io
from concurrent import futures
import hashlib
import json as json_module
@@ -34,6 +35,53 @@ except ImportError:
tqdm = None # type: ignore
def _image_paths_to_pdf_bytes(images: List[str]) -> Optional[bytes]:
if not images:
return None
try:
from PIL import Image # type: ignore
except Exception:
return None
pil_images: List[Any] = []
try:
for p in images:
img_path = Path(p)
if not img_path.is_file():
continue
with Image.open(img_path) as im: # type: ignore[attr-defined]
# Ensure PDF-compatible mode.
if im.mode in {"RGBA", "LA", "P"}:
im = im.convert("RGB")
else:
im = im.convert("RGB")
pil_images.append(im.copy())
except Exception:
for im in pil_images:
try:
im.close()
except Exception:
pass
return None
if not pil_images:
return None
buf = io.BytesIO()
first, rest = pil_images[0], pil_images[1:]
try:
first.save(buf, format="PDF", save_all=True, append_images=rest)
return buf.getvalue()
except Exception:
return None
finally:
for im in pil_images:
try:
im.close()
except Exception:
pass
def _looks_like_isbn(text: str) -> bool:
t = (text or "").replace("-", "").strip()
return t.isdigit() and len(t) in (10, 13)
@@ -941,17 +989,11 @@ class OpenLibrary(SearchProvider):
try:
images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)
try:
import img2pdf # type: ignore
pdf_bytes = img2pdf.convert(images) if images else None
pdf_bytes = _image_paths_to_pdf_bytes(images)
if not pdf_bytes:
log("[openlibrary] PDF conversion failed", file=sys.stderr)
try:
shutil.rmtree(temp_dir)
except Exception:
pass
return None
# Keep images folder for manual conversion.
log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
return Path(temp_dir)
pdf_path = unique_path(output_dir / f"{title}.pdf")
with open(pdf_path, "wb") as f:
@@ -963,10 +1005,6 @@ class OpenLibrary(SearchProvider):
pass
return pdf_path
except ImportError:
# Keep images folder.
return Path(temp_dir)
except Exception:
try:
shutil.rmtree(temp_dir)

View File

@@ -281,13 +281,6 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
# Check global cookies file lazily to avoid import cycles
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
else:
# Fallback to browser cookies
base_options["cookiesfrombrowser"] = ("chrome",)
@@ -453,21 +446,40 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
# Try to find actual download link in the page
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
try:
from lxml import html as lxml_html
except ImportError:
lxml_html = None
# Look for download links - LibGen typically has forms with download buttons
# Look for all links and forms that might lead to download
for link in soup.find_all('a'):
href = link.get('href')
if href and isinstance(href, str):
# Look for direct file links or get.php redirects
if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
download_url = href if href.startswith('http') else urljoin(final_url, href)
if lxml_html is not None:
doc = lxml_html.fromstring(response.content)
for a in doc.xpath("//a[@href]"):
href = str(a.get("href") or "").strip()
if not href:
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except ImportError:
pass # BeautifulSoup not available
else:
# Regex fallback
for m in re.finditer(
r"href=[\"\']([^\"\']+)[\"\']",
response.text or "",
flags=re.IGNORECASE,
):
href = str(m.group(1) or "").strip()
if not href or href.lower().startswith("javascript:"):
continue
href_lower = href.lower()
if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
download_url = href if href.startswith("http") else urljoin(final_url, href)
debug(f"Found download link: {download_url}")
return download_url
except Exception:
pass
# If we followed redirects successfully, return the final URL
# This handles cases where libgen redirects to a direct download mirror
@@ -708,12 +720,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"noprogress": True, # No progress bars
}
# Add cookies if available (lazy import to avoid circular dependency)
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed.
# Add no_playlist option if specified
if no_playlist:

View File

@@ -23,6 +23,10 @@ class HydrusNetwork(Store):
Maintains its own HydrusClient.
"""
def _log_prefix(self) -> str:
store_name = getattr(self, "NAME", None) or "unknown"
return f"[hydrusnetwork:{store_name}]"
def __new__(cls, *args: Any, **kwargs: Any) -> "HydrusNetwork":
instance = super().__new__(cls)
name = kwargs.get("NAME")
@@ -109,7 +113,7 @@ class HydrusNetwork(Store):
raise RuntimeError(f"Hydrus '{self.NAME}' unavailable: {err}") from exc
# Create a persistent client for this instance (auth via access key by default).
self._client = HydrusClient(url=self.URL, access_key=self.API)
self._client = HydrusClient(url=self.URL, access_key=self.API, instance_name=self.NAME)
# Best-effort total count (fast on Hydrus side; does not fetch IDs/hashes).
try:
@@ -129,7 +133,7 @@ class HydrusNetwork(Store):
if isinstance(count_val, int):
self.total_count = count_val
except Exception as exc:
debug(f"Hydrus total count unavailable for '{self.NAME}': {exc}", file=sys.stderr)
debug(f"{self._log_prefix()} total count unavailable: {exc}", file=sys.stderr)
def name(self) -> str:
return self.NAME
@@ -167,7 +171,7 @@ class HydrusNetwork(Store):
try:
# Compute file hash
file_hash = sha256_file(file_path)
debug(f"File hash: {file_hash}")
debug(f"{self._log_prefix()} file hash: {file_hash}")
# Use persistent client with session key
client = self._client
@@ -177,11 +181,24 @@ class HydrusNetwork(Store):
# Check if file already exists in Hydrus
file_exists = False
try:
metadata = client.fetch_file_metadata(hashes=[file_hash])
metadata = client.fetch_file_metadata(
hashes=[file_hash],
include_service_keys_to_tags=False,
include_file_url=False,
include_duration=False,
include_size=False,
include_mime=False,
)
if metadata and isinstance(metadata, dict):
files = metadata.get("metadata", [])
if files:
metas = metadata.get("metadata", [])
if isinstance(metas, list) and metas:
# Hydrus returns placeholder rows for unknown hashes.
# Only treat as a real duplicate if it has a concrete file_id.
for meta in metas:
if isinstance(meta, dict) and meta.get("file_id") is not None:
file_exists = True
break
if file_exists:
log(
f" Duplicate detected - file already in Hydrus with hash: {file_hash}",
file=sys.stderr,
@@ -189,9 +206,17 @@ class HydrusNetwork(Store):
except Exception:
pass
# If Hydrus reports an existing file, it may be in trash. Best-effort restore it to 'my files'.
# This keeps behavior aligned with user expectation: "use API only" and ensure it lands in my files.
if file_exists:
try:
client.undelete_files([file_hash])
except Exception:
pass
# Upload file if not already present
if not file_exists:
log(f"Uploading to Hydrus: {file_path.name}", file=sys.stderr)
log(f"{self._log_prefix()} Uploading: {file_path.name}", file=sys.stderr)
response = client.add_file(file_path)
# Extract hash from response
@@ -207,7 +232,7 @@ class HydrusNetwork(Store):
raise Exception(f"Hydrus response missing file hash: {response}")
file_hash = hydrus_hash
log(f"Hydrus: {file_hash}", file=sys.stderr)
log(f"{self._log_prefix()} hash: {file_hash}", file=sys.stderr)
# Add tags if provided (both for new and existing files)
if tag_list:
@@ -218,27 +243,27 @@ class HydrusNetwork(Store):
service_name = "my tags"
try:
debug(f"Adding {len(tag_list)} tag(s) to Hydrus: {tag_list}")
debug(f"{self._log_prefix()} Adding {len(tag_list)} tag(s): {tag_list}")
client.add_tag(file_hash, tag_list, service_name)
log(f"Tags added via '{service_name}'", file=sys.stderr)
log(f"{self._log_prefix()} Tags added via '{service_name}'", file=sys.stderr)
except Exception as exc:
log(f"⚠️ Failed to add tags: {exc}", file=sys.stderr)
log(f"{self._log_prefix()} ⚠️ Failed to add tags: {exc}", file=sys.stderr)
# Associate url if provided (both for new and existing files)
if url:
log(f"Associating {len(url)} URL(s) with file", file=sys.stderr)
log(f"{self._log_prefix()} Associating {len(url)} URL(s) with file", file=sys.stderr)
for url in url:
if url:
try:
client.associate_url(file_hash, str(url))
debug(f"Associated URL: {url}")
debug(f"{self._log_prefix()} Associated URL: {url}")
except Exception as exc:
log(f"⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr)
log(f"{self._log_prefix()} ⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr)
return file_hash
except Exception as exc:
log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr)
log(f"{self._log_prefix()} upload failed: {exc}", file=sys.stderr)
raise
def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
@@ -262,7 +287,8 @@ class HydrusNetwork(Store):
if client is None:
raise Exception("Hydrus client unavailable")
debug(f"Searching Hydrus for: {query}")
prefix = self._log_prefix()
debug(f"{prefix} Searching for: {query}")
def _extract_urls(meta_obj: Any) -> list[str]:
if not isinstance(meta_obj, dict):
@@ -446,7 +472,7 @@ class HydrusNetwork(Store):
tags = [query_lower]
if not tags:
debug(f"Found 0 result(s)")
debug(f"{prefix} 0 result(s)")
return []
# Search files with the tags (unless url: search already produced metadata)
@@ -465,7 +491,7 @@ class HydrusNetwork(Store):
hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []
if not file_ids and not hashes:
debug(f"Found 0 result(s)")
debug(f"{prefix} 0 result(s)")
return []
if file_ids:
@@ -595,7 +621,7 @@ class HydrusNetwork(Store):
"ext": ext,
})
debug(f"Found {len(results)} result(s)")
debug(f"{prefix} {len(results)} result(s)")
return results[:limit]
except Exception as exc:
@@ -611,13 +637,13 @@ class HydrusNetwork(Store):
Only explicit user actions (e.g. the get-file cmdlet) should open files.
"""
debug(f"[HydrusNetwork.get_file] Starting for hash: {file_hash[:12]}...")
debug(f"{self._log_prefix()} get_file: start hash={file_hash[:12]}...")
# Build browser URL with access key
base_url = str(self.URL).rstrip('/')
access_key = str(self.API)
browser_url = f"{base_url}/get_files/file?hash={file_hash}&Hydrus-Client-API-Access-Key={access_key}"
debug(f"[HydrusNetwork.get_file] Returning URL: {browser_url}")
debug(f"{self._log_prefix()} get_file: url={browser_url}")
return browser_url
def get_metadata(self, file_hash: str, **kwargs: Any) -> Optional[Dict[str, Any]]:
@@ -632,17 +658,28 @@ class HydrusNetwork(Store):
try:
client = self._client
if not client:
debug("get_metadata: Hydrus client unavailable")
debug(f"{self._log_prefix()} get_metadata: client unavailable")
return None
# Fetch file metadata
payload = client.fetch_file_metadata(hashes=[file_hash], include_service_keys_to_tags=True)
# Fetch file metadata with the fields we need for CLI display.
payload = client.fetch_file_metadata(
hashes=[file_hash],
include_service_keys_to_tags=True,
include_file_url=True,
include_duration=True,
include_size=True,
include_mime=True,
)
if not payload or not payload.get("metadata"):
return None
meta = payload["metadata"][0]
# Hydrus can return placeholder metadata rows for unknown hashes.
if not isinstance(meta, dict) or meta.get("file_id") is None:
return None
# Extract title from tags
title = f"Hydrus_{file_hash[:12]}"
tags_payload = meta.get("tags", {})
@@ -660,33 +697,109 @@ class HydrusNetwork(Store):
if title != f"Hydrus_{file_hash[:12]}":
break
# Prefer Hydrus-provided extension (e.g. ".webm"); fall back to MIME map if needed.
mime_type = meta.get("mime", "")
ext_raw = meta.get("ext")
ext = str(ext_raw or "").strip().lstrip(".")
if not ext and mime_type:
# Hydrus may return mime as an int enum, or sometimes a human label.
mime_val = meta.get("mime")
filetype_human = meta.get("filetype_human") or meta.get("mime_human") or meta.get("mime_string")
# Determine ext: prefer Hydrus metadata ext, then filetype_human (when it looks like an ext),
# then title suffix, then file path suffix.
ext = str(meta.get("ext") or "").strip().lstrip(".")
if not ext:
ft = str(filetype_human or "").strip().lstrip(".").lower()
if ft and ft != "unknown filetype" and ft.isalnum() and len(ft) <= 8:
# Treat simple labels like "mp4", "m4a", "webm" as extensions.
ext = ft
if not ext and isinstance(title, str) and "." in title:
try:
from SYS.utils_constant import mime_maps
for category in mime_maps.values():
for _ext_key, info in category.items():
if mime_type in info.get("mimes", []):
ext = str(info.get("ext", "")).strip().lstrip(".")
break
if ext:
break
ext = Path(title).suffix.lstrip(".")
except Exception:
ext = ""
if not ext:
try:
path_payload = client.get_file_path(file_hash)
if isinstance(path_payload, dict):
p = path_payload.get("path")
if isinstance(p, str) and p.strip():
ext = Path(p.strip()).suffix.lstrip(".")
except Exception:
ext = ""
# If extension is still unknown, attempt a best-effort lookup from MIME.
def _mime_from_ext(ext_value: str) -> str:
    """Best-effort reverse lookup of a MIME type for *ext_value* via mime_maps.

    Returns the first MIME string registered for the (normalized, dot-less,
    lowercase) extension, or "" when the extension is empty or unknown.
    """
    normalized = str(ext_value or "").strip().lstrip(".").lower()
    if not normalized:
        return ""
    try:
        for category in mime_maps.values():
            entry = category.get(normalized)
            if not isinstance(entry, dict):
                continue
            mime_list = entry.get("mimes")
            if isinstance(mime_list, list) and mime_list:
                return str(mime_list[0])
    except Exception:
        # Malformed mime_maps structure: treat as "no match".
        pass
    return ""
# Normalize to a MIME string for CLI output.
# Avoid passing through human labels like "unknown filetype".
mime_type = ""
if isinstance(mime_val, str):
candidate = mime_val.strip()
if "/" in candidate and candidate.lower() != "unknown filetype":
mime_type = candidate
if not mime_type and isinstance(filetype_human, str):
candidate = filetype_human.strip()
if "/" in candidate and candidate.lower() != "unknown filetype":
mime_type = candidate
if not mime_type:
mime_type = _mime_from_ext(ext)
# Normalize size/duration to stable scalar types.
size_val = meta.get("size")
if size_val is None:
size_val = meta.get("size_bytes")
try:
size_int: int | None = int(size_val) if size_val is not None else None
except Exception:
size_int = None
dur_val = meta.get("duration")
if dur_val is None:
dur_val = meta.get("duration_ms")
try:
dur_int: int | None = int(dur_val) if dur_val is not None else None
except Exception:
dur_int = None
raw_urls = (
meta.get("known_urls")
or meta.get("urls")
or meta.get("url")
or []
)
url_list: list[str] = []
if isinstance(raw_urls, str):
s = raw_urls.strip()
url_list = [s] if s else []
elif isinstance(raw_urls, list):
url_list = [str(u).strip() for u in raw_urls if isinstance(u, str) and str(u).strip()]
return {
"hash": file_hash,
"title": title,
"ext": ext,
"size": meta.get("size"),
"size": size_int,
"mime": mime_type,
# Keep raw fields available for troubleshooting/other callers.
"hydrus_mime": mime_val,
"filetype_human": filetype_human,
"duration_ms": dur_int,
"url": url_list,
}
except Exception as exc:
debug(f"Failed to get metadata from Hydrus: {exc}")
debug(f"{self._log_prefix()} get_metadata failed: {exc}")
return None
def get_tag(self, file_identifier: str, **kwargs: Any) -> Tuple[List[str], str]:
@@ -705,13 +818,13 @@ class HydrusNetwork(Store):
file_hash = str(file_identifier or "").strip().lower()
if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
debug(f"get_tags: invalid file hash '{file_identifier}'")
debug(f"{self._log_prefix()} get_tags: invalid file hash '{file_identifier}'")
return [], "unknown"
# Get Hydrus client and service info
client = self._client
if not client:
debug("get_tags: Hydrus client unavailable")
debug(f"{self._log_prefix()} get_tags: client unavailable")
return [], "unknown"
# Fetch file metadata
@@ -723,12 +836,12 @@ class HydrusNetwork(Store):
items = payload.get("metadata") if isinstance(payload, dict) else None
if not isinstance(items, list) or not items:
debug(f"get_tags: No metadata returned for hash {file_hash}")
debug(f"{self._log_prefix()} get_tags: no metadata for hash {file_hash}")
return [], "unknown"
meta = items[0] if isinstance(items[0], dict) else None
if not isinstance(meta, dict) or meta.get("file_id") is None:
debug(f"get_tags: Invalid metadata for hash {file_hash}")
debug(f"{self._log_prefix()} get_tags: invalid metadata for hash {file_hash}")
return [], "unknown"
# Extract tags using service name
@@ -741,7 +854,7 @@ class HydrusNetwork(Store):
return tags, "hydrus"
except Exception as exc:
debug(f"get_tags failed for Hydrus file: {exc}")
debug(f"{self._log_prefix()} get_tags failed: {exc}")
return [], "unknown"
def add_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool:
@@ -750,12 +863,12 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("add_tag: Hydrus client unavailable")
debug(f"{self._log_prefix()} add_tag: client unavailable")
return False
file_hash = str(file_identifier or "").strip().lower()
if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
debug(f"add_tag: invalid file hash '{file_identifier}'")
debug(f"{self._log_prefix()} add_tag: invalid file hash '{file_identifier}'")
return False
service_name = kwargs.get("service_name") or "my tags"
# Ensure tags is a list
@@ -765,7 +878,7 @@ class HydrusNetwork(Store):
client.add_tag(file_hash, tag_list, service_name)
return True
except Exception as exc:
debug(f"Hydrus add_tag failed: {exc}")
debug(f"{self._log_prefix()} add_tag failed: {exc}")
return False
def delete_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool:
@@ -774,12 +887,12 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("delete_tag: Hydrus client unavailable")
debug(f"{self._log_prefix()} delete_tag: client unavailable")
return False
file_hash = str(file_identifier or "").strip().lower()
if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
debug(f"delete_tag: invalid file hash '{file_identifier}'")
debug(f"{self._log_prefix()} delete_tag: invalid file hash '{file_identifier}'")
return False
service_name = kwargs.get("service_name") or "my tags"
tag_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)]
@@ -788,7 +901,7 @@ class HydrusNetwork(Store):
client.delete_tag(file_hash, tag_list, service_name)
return True
except Exception as exc:
debug(f"Hydrus delete_tag failed: {exc}")
debug(f"{self._log_prefix()} delete_tag failed: {exc}")
return False
def get_url(self, file_identifier: str, **kwargs: Any) -> List[str]:
@@ -797,7 +910,7 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("get_url: Hydrus client unavailable")
debug(f"{self._log_prefix()} get_url: client unavailable")
return []
file_hash = str(file_identifier or "").strip().lower()
@@ -830,7 +943,7 @@ class HydrusNetwork(Store):
return out
return []
except Exception as exc:
debug(f"Hydrus get_url failed: {exc}")
debug(f"{self._log_prefix()} get_url failed: {exc}")
return []
def add_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool:
@@ -839,13 +952,13 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("add_url: Hydrus client unavailable")
debug(f"{self._log_prefix()} add_url: client unavailable")
return False
for u in url:
client.associate_url(file_identifier, u)
return True
except Exception as exc:
debug(f"Hydrus add_url failed: {exc}")
debug(f"{self._log_prefix()} add_url failed: {exc}")
return False
def delete_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool:
@@ -854,13 +967,13 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("delete_url: Hydrus client unavailable")
debug(f"{self._log_prefix()} delete_url: client unavailable")
return False
for u in url:
client.delete_url(file_identifier, u)
return True
except Exception as exc:
debug(f"Hydrus delete_url failed: {exc}")
debug(f"{self._log_prefix()} delete_url failed: {exc}")
return False
def get_note(self, file_identifier: str, **kwargs: Any) -> Dict[str, str]:
@@ -868,7 +981,7 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("get_note: Hydrus client unavailable")
debug(f"{self._log_prefix()} get_note: client unavailable")
return {}
file_hash = str(file_identifier or "").strip().lower()
@@ -889,7 +1002,7 @@ class HydrusNetwork(Store):
return {}
except Exception as exc:
debug(f"Hydrus get_note failed: {exc}")
debug(f"{self._log_prefix()} get_note failed: {exc}")
return {}
def set_note(self, file_identifier: str, name: str, text: str, **kwargs: Any) -> bool:
@@ -897,7 +1010,7 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("set_note: Hydrus client unavailable")
debug(f"{self._log_prefix()} set_note: client unavailable")
return False
file_hash = str(file_identifier or "").strip().lower()
@@ -912,7 +1025,7 @@ class HydrusNetwork(Store):
client.set_notes(file_hash, {note_name: note_text})
return True
except Exception as exc:
debug(f"Hydrus set_note failed: {exc}")
debug(f"{self._log_prefix()} set_note failed: {exc}")
return False
def delete_note(self, file_identifier: str, name: str, **kwargs: Any) -> bool:
@@ -920,7 +1033,7 @@ class HydrusNetwork(Store):
try:
client = self._client
if client is None:
debug("delete_note: Hydrus client unavailable")
debug(f"{self._log_prefix()} delete_note: client unavailable")
return False
file_hash = str(file_identifier or "").strip().lower()
@@ -934,7 +1047,7 @@ class HydrusNetwork(Store):
client.delete_notes(file_hash, [note_name])
return True
except Exception as exc:
debug(f"Hydrus delete_note failed: {exc}")
debug(f"{self._log_prefix()} delete_note failed: {exc}")
return False
@staticmethod

View File

@@ -6,6 +6,7 @@ import sys
import shutil
import tempfile
import re
from urllib.parse import urlsplit, parse_qs
import models
import pipeline as ctx
@@ -13,12 +14,20 @@ from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store
from ._shared import (
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
extract_tag_from_result, extract_title_from_result, extract_url_from_result,
merge_sequences, extract_relationships, extract_duration, coerce_to_pipe_object
)
from ._shared import collapse_namespace_tag
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
parse_cmdlet_args = sh.parse_cmdlet_args
SharedArgs = sh.SharedArgs
extract_tag_from_result = sh.extract_tag_from_result
extract_title_from_result = sh.extract_title_from_result
extract_url_from_result = sh.extract_url_from_result
merge_sequences = sh.merge_sequences
extract_relationships = sh.extract_relationships
extract_duration = sh.extract_duration
coerce_to_pipe_object = sh.coerce_to_pipe_object
collapse_namespace_tag = sh.collapse_namespace_tag
from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_store
from SYS.utils import sha256_file, unique_path
from metadata import write_metadata
@@ -181,7 +190,7 @@ class Add_File(Cmdlet):
downloaded_path = Path(downloaded)
if downloaded_path.exists() and downloaded_path.is_dir():
log(
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.",
"[add-file] OpenLibrary download produced a directory (PDF conversion failed). Cannot ingest.",
file=sys.stderr,
)
failures += 1
@@ -192,6 +201,26 @@ class Add_File(Cmdlet):
delete_after_item = True
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
# Hydrus file URLs are direct file downloads and may require Hydrus auth headers.
# If the user provided a destination (-provider or -store), download now and continue.
if (provider_name or location) and isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://")):
downloaded = self._try_download_hydrus_file_url(
file_url=str(media_path_or_url),
pipe_obj=pipe_obj,
config=config,
)
if downloaded is not None:
downloaded_path, downloaded_temp_dir = downloaded
temp_dir_to_cleanup = downloaded_temp_dir
media_path_or_url = str(downloaded_path)
pipe_obj.path = str(downloaded_path)
pipe_obj.is_temp = True
delete_after_item = True
# If it's still a URL target, fall back to the legacy delegate.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
@@ -767,6 +796,134 @@ class Add_File(Cmdlet):
return True
return False
@staticmethod
def _sanitize_filename(value: str) -> str:
# Minimal Windows-safe filename sanitization.
text = str(value or "").strip()
if not text:
return "file"
invalid = '<>:"/\\|?*'
text = "".join("_" if (ch in invalid or ord(ch) < 32) else ch for ch in text)
text = re.sub(r"\s+", " ", text).strip(" .")
return text or "file"
@staticmethod
def _parse_hydrus_file_url(file_url: str) -> Optional[str]:
"""Return the sha256 hash from a Hydrus /get_files/file URL, or None."""
try:
split = urlsplit(str(file_url))
if split.scheme.lower() not in {"http", "https"}:
return None
path_lower = (split.path or "").lower()
if "/get_files/file" not in path_lower:
return None
params = parse_qs(split.query or "")
raw = None
if "hash" in params and params["hash"]:
raw = params["hash"][0]
if not raw:
return None
hash_val = str(raw).strip().lower()
if not re.fullmatch(r"[0-9a-f]{64}", hash_val):
return None
return hash_val
except Exception:
return None
def _try_download_hydrus_file_url(
    self,
    *,
    file_url: str,
    pipe_obj: models.PipeObject,
    config: Dict[str, Any],
) -> Optional[tuple[Path, Path]]:
    """If *file_url* is a Hydrus /get_files/file URL, download it to a temp dir.

    Returns ``(downloaded_path, temp_dir)`` on success, or ``None`` when the
    URL is not a Hydrus file URL, no matching Hydrus backend with an API key
    can be resolved, or the download fails.  The created temp directory is
    removed (best-effort) on every failure path after it is created.
    """
    file_hash = self._parse_hydrus_file_url(file_url)
    if not file_hash:
        return None
    # Resolve the Hydrus backend whose access key can authenticate this URL.
    # First preference: the store named on the piped item; strip any
    # "prefix:" namespace (e.g. "hydrusnetwork:home" -> "home").
    store_name = str(getattr(pipe_obj, "store", "") or "").strip()
    if ":" in store_name:
        store_name = store_name.split(":", 1)[-1].strip()
    backend = None
    try:
        store_registry = Store(config)
        if store_name and store_registry.is_available(store_name):
            candidate = store_registry[store_name]
            # Match by class name to avoid importing the store class here.
            if type(candidate).__name__.lower() == "hydrusnetwork":
                backend = candidate
    except Exception:
        backend = None
    # Second preference: any configured HydrusNetwork backend whose base URL
    # is a prefix of the file URL.
    if backend is None:
        try:
            store_registry = Store(config)
            target_prefix = str(file_url).split("/get_files/file", 1)[0].rstrip("/")
            for backend_name in store_registry.list_backends():
                candidate = store_registry[backend_name]
                if type(candidate).__name__.lower() != "hydrusnetwork":
                    continue
                base_url = str(getattr(candidate, "URL", "") or "").rstrip("/")
                if base_url and (target_prefix.lower() == base_url.lower() or target_prefix.lower().startswith(base_url.lower())):
                    backend = candidate
                    break
        except Exception:
            backend = None
    if backend is None:
        debug("[add-file] Hydrus file URL detected but no Hydrus backend matched for auth")
        return None
    api_key = str(getattr(backend, "API", "") or "").strip()
    if not api_key:
        debug(f"[add-file] Hydrus backend '{getattr(backend, 'NAME', '') or store_name}' missing API key")
        return None
    # Best-effort filename: prefer the piped item's ext/title; otherwise a
    # hash-derived name with a generic ".bin" extension.
    ext = ""
    try:
        if isinstance(pipe_obj.extra, dict):
            ext = str(pipe_obj.extra.get("ext") or "").strip().lstrip(".")
    except Exception:
        ext = ""
    if not ext:
        ext = "bin"
    title_hint = str(getattr(pipe_obj, "title", "") or "").strip()
    base_name = self._sanitize_filename(title_hint) if title_hint else f"hydrus_{file_hash[:12]}"
    temp_dir = Path(tempfile.mkdtemp(prefix="medios_hydrus_"))
    destination = unique_path(temp_dir / f"{base_name}.{ext}")
    headers = {"Hydrus-Client-API-Access-Key": api_key}
    # Mirror the backend client's timeout when available; default to 60s.
    timeout = 60.0
    try:
        client = getattr(backend, "_client", None)
        timeout_val = getattr(client, "timeout", None)
        if timeout_val is not None:
            timeout = float(timeout_val)
    except Exception:
        timeout = 60.0
    try:
        log(
            f"[add-file] Downloading Hydrus file via API ({getattr(backend, 'NAME', '') or store_name})",
            file=sys.stderr,
        )
        downloaded_bytes = hydrus_wrapper.download_hydrus_file(str(file_url), headers, destination, timeout)
        if downloaded_bytes <= 0 and not destination.exists():
            # Bug fix: this empty-download path previously returned without
            # removing temp_dir, leaking one directory per failed download.
            # Clean up the same way the exception path below does.
            shutil.rmtree(temp_dir, ignore_errors=True)
            return None
        return destination, temp_dir
    except Exception as exc:
        log(f"[add-file] Hydrus download failed: {exc}", file=sys.stderr)
        try:
            shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception:
            pass
        return None
def _delegate_to_download_data(
self,
result: Any,
@@ -883,6 +1040,61 @@ class Add_File(Cmdlet):
except Exception:
return None
@staticmethod
def _get_note_text(result: Any, pipe_obj: models.PipeObject, note_name: str) -> Optional[str]:
    """Extract a named note text from a piped item.

    Lookup order:
    - pipe_obj.extra["notes"][note_name]
    - pipe_obj.extra[note_name]
    - result["notes"][note_name] for dict results
    - result[note_name] for dict results

    Returns the stripped text, or None when no value is found.
    """

    def _coerce(raw: Any) -> Optional[str]:
        # Normalize bytes/str/other values to a stripped, non-empty string.
        if raw is None:
            return None
        if isinstance(raw, bytes):
            try:
                raw = raw.decode("utf-8", errors="ignore")
            except Exception:
                raw = str(raw)
        try:
            text = raw.strip() if isinstance(raw, str) else str(raw).strip()
        except Exception:
            return None
        return text if text else None

    key = str(note_name or "").strip()
    if not key:
        return None

    # Candidate containers in priority order: PipeObject.extra first
    # (common for cmdlet-emitted dicts), then the raw result dict.
    containers = []
    try:
        if isinstance(pipe_obj.extra, dict):
            containers.append(pipe_obj.extra)
    except Exception:
        pass
    if isinstance(result, dict):
        containers.append(result)

    for container in containers:
        try:
            notes = container.get("notes")
            if isinstance(notes, dict) and key in notes:
                return _coerce(notes.get(key))
            if key in container:
                return _coerce(container.get(key))
        except Exception:
            continue
    return None
@staticmethod
def _update_pipe_object_destination(
pipe_obj: models.PipeObject,
@@ -1451,6 +1663,26 @@ class Add_File(Cmdlet):
except Exception:
pass
# If a subtitle note was provided upstream (e.g., download-media writes notes.sub),
# persist it automatically like add-note would.
sub_note = Add_File._get_note_text(result, pipe_obj, "sub")
if sub_note:
try:
setter = getattr(backend, "set_note", None)
if callable(setter):
setter(resolved_hash, "sub", sub_note)
except Exception:
pass
chapters_note = Add_File._get_note_text(result, pipe_obj, "chapters")
if chapters_note:
try:
setter = getattr(backend, "set_note", None)
if callable(setter):
setter(resolved_hash, "chapters", chapters_note)
except Exception:
pass
meta: Dict[str, Any] = {}
try:
meta = backend.get_metadata(resolved_hash) or {}

View File

@@ -7,15 +7,15 @@ import sys
from SYS.logger import log
import pipeline as ctx
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
normalize_hash,
parse_cmdlet_args,
normalize_result_input,
should_show_help,
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
from Store import Store
from SYS.utils import sha256_file
@@ -84,9 +84,9 @@ class Add_Note(Cmdlet):
else:
note_text = str(text_parts or "").strip()
if not note_text:
log("[add_note] Error: Empty note text", file=sys.stderr)
return 1
# Note text can be omitted when upstream stages provide it (e.g. download-media --write-sub
# attaches notes.sub). In that case we resolve per-item below.
user_provided_text = bool(note_text)
results = normalize_result_input(result)
if not results:
@@ -99,11 +99,56 @@ class Add_Note(Cmdlet):
store_registry = Store(config)
updated = 0
# Optional global fallback for note text from pipeline values.
# Allows patterns like: ... | add-note sub
pipeline_default_text = None
if not user_provided_text:
try:
pipeline_default_text = ctx.load_value(note_name)
except Exception:
pipeline_default_text = None
if isinstance(pipeline_default_text, list):
pipeline_default_text = " ".join([str(x) for x in pipeline_default_text]).strip()
elif pipeline_default_text is not None:
pipeline_default_text = str(pipeline_default_text).strip()
for res in results:
if not isinstance(res, dict):
ctx.emit(res)
continue
# Resolve note text for this item when not provided explicitly.
item_note_text = note_text
if not user_provided_text:
# Prefer item-scoped notes dict.
candidate = None
try:
notes = res.get("notes")
if isinstance(notes, dict):
candidate = notes.get(note_name)
except Exception:
candidate = None
# Also allow direct field fallback: res["sub"], etc.
if candidate is None:
try:
candidate = res.get(note_name)
except Exception:
candidate = None
if candidate is None:
candidate = pipeline_default_text
if isinstance(candidate, list):
item_note_text = " ".join([str(x) for x in candidate]).strip()
else:
item_note_text = str(candidate or "").strip()
if not item_note_text:
log(f"[add_note] Warning: No note text found for '{note_name}'; skipping", file=sys.stderr)
ctx.emit(res)
continue
store_name = str(store_override or res.get("store") or "").strip()
raw_hash = res.get("hash")
raw_path = res.get("path")
@@ -130,7 +175,7 @@ class Add_Note(Cmdlet):
ok = False
try:
ok = bool(backend.set_note(resolved_hash, note_name, note_text, config=config))
ok = bool(backend.set_note(resolved_hash, note_name, item_note_text, config=config))
except Exception as exc:
log(f"[add_note] Error: Failed to set note: {exc}", file=sys.stderr)
ok = False

View File

@@ -11,7 +11,15 @@ from SYS.logger import log
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_result_input, should_show_help, get_field
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
get_field = sh.get_field
from API.folder import read_sidecar, find_sidecar, API_folder_store
from Store import Store

View File

@@ -8,19 +8,20 @@ from SYS.logger import log
import models
import pipeline as ctx
from ._shared import normalize_result_input, filter_results_by_temp
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
normalize_hash,
parse_tag_arguments,
expand_tag_groups,
parse_cmdlet_args,
collapse_namespace_tag,
should_show_help,
get_field,
)
from . import _shared as sh
normalize_result_input = sh.normalize_result_input
filter_results_by_temp = sh.filter_results_by_temp
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_tag_arguments = sh.parse_tag_arguments
expand_tag_groups = sh.expand_tag_groups
parse_cmdlet_args = sh.parse_cmdlet_args
collapse_namespace_tag = sh.collapse_namespace_tag
should_show_help = sh.should_show_help
get_field = sh.get_field
from Store import Store
from SYS.utils import sha256_file

View File

@@ -8,19 +8,20 @@ from SYS.logger import log
import models
import pipeline as ctx
from ._shared import normalize_result_input, filter_results_by_temp
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
normalize_hash,
parse_tag_arguments,
expand_tag_groups,
parse_cmdlet_args,
collapse_namespace_tags,
should_show_help,
get_field,
)
from . import _shared as sh
normalize_result_input = sh.normalize_result_input
filter_results_by_temp = sh.filter_results_by_temp
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_tag_arguments = sh.parse_tag_arguments
expand_tag_groups = sh.expand_tag_groups
parse_cmdlet_args = sh.parse_cmdlet_args
collapse_namespace_tags = sh.collapse_namespace_tags
should_show_help = sh.should_show_help
get_field = sh.get_field
from Store import Store
from SYS.utils import sha256_file

View File

@@ -4,12 +4,12 @@ from typing import Any, Dict, Sequence
import sys
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from . import _shared as sh
from SYS.logger import log
from Store import Store
class Add_Url(Cmdlet):
class Add_Url(sh.Cmdlet):
"""Add URL associations to files via hash+store."""
def __init__(self) -> None:
@@ -18,9 +18,9 @@ class Add_Url(Cmdlet):
summary="Associate a URL with a file",
usage="@1 | add-url <url>",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"),
sh.SharedArgs.HASH,
sh.SharedArgs.STORE,
sh.CmdletArg("url", required=True, description="URL to associate"),
],
detail=[
"- Associates URL with file identified by hash+store",
@@ -32,11 +32,11 @@ class Add_Url(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Add URL to file via hash+store backend."""
parsed = parse_cmdlet_args(args, self)
parsed = sh.parse_cmdlet_args(args, self)
# Extract hash and store from result or args
file_hash = parsed.get("hash") or get_field(result, "hash")
store_name = parsed.get("store") or get_field(result, "store")
file_hash = parsed.get("hash") or sh.get_field(result, "hash")
store_name = parsed.get("store") or sh.get_field(result, "store")
url_arg = parsed.get("url")
if not file_hash:
@@ -52,7 +52,7 @@ class Add_Url(Cmdlet):
return 1
# Normalize hash
file_hash = normalize_hash(file_hash)
file_hash = sh.normalize_hash(file_hash)
if not file_hash:
log("Error: Invalid hash format")
return 1

View File

@@ -1,190 +0,0 @@
from __future__ import annotations
from typing import Any, Dict, Sequence
import json
import sys
from SYS.logger import log
from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, should_show_help
from Store import Store
# Module-level cmdlet metadata for the `check-file-status` command.
# NOTE(review): presumably discovered by a cmdlet registry/loader elsewhere
# via this `CMDLET` name — confirm against the loader convention.
CMDLET = Cmdlet(
    name="check-file-status",
    summary="Check if a file is active, deleted, or corrupted in Hydrus.",
    usage="check-file-status [-hash <sha256>] [-store <name>]",
    # Shared -hash / -store argument specs reused across cmdlets.
    arg=[
        SharedArgs.HASH,
        SharedArgs.STORE,
    ],
    # Help text lines shown in detailed usage output.
    detail=[
        "- Shows whether file is active in Hydrus or marked as deleted",
        "- Detects corrupted data (e.g., comma-separated url)",
        "- Displays file metadata and service locations",
        "- Note: Hydrus keeps deleted files for recovery. Use cleanup-corrupted for full removal.",
    ],
)
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Report whether a file is active, trashed, or deleted in Hydrus.

    The target hash and store come from -hash/-store overrides or from the
    piped result. Metadata is fetched via the named store's Hydrus client
    (no fallback to the global client when a store is named), and a status
    report — services, URLs (flagging comma-corrupted entries), and tags —
    is printed to stderr.

    Returns:
        0 on success; 1 on any failure (no hash, no client, file missing).
    """
    # Help
    if should_show_help(args):
        log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
        return 0

    # Parse arguments: accepts -hash/--hash/hash and -store/--store/store forms.
    override_hash: str | None = None
    override_store: str | None = None
    i = 0
    while i < len(args):
        token = args[i]
        low = str(token).lower()
        if low in {"-hash", "--hash", "hash"} and i + 1 < len(args):
            override_hash = str(args[i + 1]).strip()
            i += 2
            continue
        if low in {"-store", "--store", "store"} and i + 1 < len(args):
            override_store = str(args[i + 1]).strip()
            i += 2
            continue
        i += 1

    # Resolve store: explicit override wins, then the piped result's store.
    store_name: str | None = override_store
    if not store_name:
        if isinstance(result, dict):
            store_name = str(result.get("store") or "").strip() or None
        else:
            store_name = str(getattr(result, "store", "") or "").strip() or None

    # Resolve hash the same way.
    if override_hash:
        hash_hex = normalize_hash(override_hash)
    else:
        if isinstance(result, dict):
            hash_hex = normalize_hash(result.get("hash") or result.get("hash_hex"))
        else:
            hash_hex = normalize_hash(getattr(result, "hash", None) or getattr(result, "hash_hex", None))
    if not hash_hex:
        log("No hash provided and no result selected", file=sys.stderr)
        return 1

    # Acquire a Hydrus client.
    try:
        client = None
        if store_name:
            # Store specified: do not fall back to a global/default Hydrus client.
            try:
                store = Store(config)
                backend = store[str(store_name)]
                candidate = getattr(backend, "_client", None)
                if candidate is not None and hasattr(candidate, "fetch_file_metadata"):
                    client = candidate
            except Exception:
                client = None
            if client is None:
                log(f"Hydrus client unavailable for store '{store_name}'", file=sys.stderr)
                return 1
        else:
            client = hydrus_wrapper.get_client(config)
            if client is None:
                log("Hydrus client unavailable", file=sys.stderr)
                return 1
    except Exception as exc:
        log(f"Hydrus client unavailable: {exc}", file=sys.stderr)
        return 1

    try:
        result_data = client.fetch_file_metadata(hashes=[hash_hex])
        if not result_data.get("metadata"):
            log(f"File not found: {hash_hex[:16]}...", file=sys.stderr)
            return 1
        file_info = result_data["metadata"][0]

        # Status summary (deleted takes precedence over trashed).
        is_deleted = file_info.get("is_deleted", False)
        is_trashed = file_info.get("is_trashed", False)
        status_str = "DELETED" if is_deleted else ("TRASHED" if is_trashed else "ACTIVE")
        log(f"File status: {status_str}", file=sys.stderr)

        # File info
        log(f"\n📄 File Information:", file=sys.stderr)
        log(f" Hash: {file_info['hash'][:16]}...", file=sys.stderr)
        log(f" Size: {file_info['size']:,} bytes", file=sys.stderr)
        log(f" MIME: {file_info['mime']}", file=sys.stderr)
        log(f" Dimensions: {file_info.get('width', '?')}x{file_info.get('height', '?')}", file=sys.stderr)

        # Service membership.
        file_services = file_info.get("file_services", {})
        current_services = file_services.get("current", {})
        deleted_services = file_services.get("deleted", {})
        if current_services:
            log(f"\n✓ In services ({len(current_services)}):", file=sys.stderr)
            for service_key, service_info in current_services.items():
                sname = service_info.get("name", "unknown")
                stype = service_info.get("type_pretty", "unknown")
                log(f" - {sname} ({stype})", file=sys.stderr)
        if deleted_services:
            log(f"\n✗ Deleted from services ({len(deleted_services)}):", file=sys.stderr)
            for service_key, service_info in deleted_services.items():
                sname = service_info.get("name", "unknown")
                stype = service_info.get("type_pretty", "unknown")
                time_deleted = service_info.get("time_deleted", "?")
                log(f" - {sname} ({stype}) - deleted at {time_deleted}", file=sys.stderr)

        # URL check. Fix: the original loop was `for i, url in enumerate(url, 1)`,
        # shadowing both the URL list and the arg-parse counter `i`.
        urls = file_info.get("url", [])
        log(f"\n🔗 url ({len(urls)}):", file=sys.stderr)
        corrupted_count = 0
        for idx, url in enumerate(urls, 1):
            if "," in url:
                corrupted_count += 1
                log(f" [{idx}] ⚠️ CORRUPTED (comma-separated): {url[:50]}...", file=sys.stderr)
            else:
                log(f" [{idx}] {url[:70]}{'...' if len(url) > 70 else ''}", file=sys.stderr)
        if corrupted_count > 0:
            log(f"\n⚠️ WARNING: Found {corrupted_count} corrupted URL(s)", file=sys.stderr)

        # Tags: count across all services first, then show up to 5 per service.
        tags_dict = file_info.get("tags", {})
        total_tags = 0
        for service_key, service_data in tags_dict.items():
            display_tags = service_data.get("display_tags", {}).get("0", [])
            total_tags += len(display_tags)
        if total_tags > 0:
            log(f"\n🏷️ Tags ({total_tags}):", file=sys.stderr)
            for service_key, service_data in tags_dict.items():
                display_tags = service_data.get("display_tags", {}).get("0", [])
                if display_tags:
                    service_name = service_data.get("name", "unknown")
                    log(f" {service_name}:", file=sys.stderr)
                    for tag in display_tags[:5]:  # Show first 5
                        log(f" - {tag}", file=sys.stderr)
                    if len(display_tags) > 5:
                        log(f" ... and {len(display_tags) - 5} more", file=sys.stderr)

        log("\n", file=sys.stderr)
        return 0
    except Exception as exc:
        log(f"Error checking file status: {exc}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return 1
# Register cmdlet (no legacy decorator).
# Aliases allow invocation as check-status / file-status / status.
CMDLET.exec = _run
CMDLET.alias = ["check-status", "file-status", "status"]
CMDLET.register()

View File

@@ -1,105 +0,0 @@
"""Cleanup cmdlet for removing temporary artifacts from pipeline.
This cmdlet processes result lists and removes temporary files (marked with is_temp=True),
then emits the remaining non-temporary results for further pipeline stages.
"""
from __future__ import annotations
from typing import Any, Dict, Sequence
from pathlib import Path
import sys
import json
from SYS.logger import log
from ._shared import Cmdlet, CmdletArg, get_pipe_object_path, normalize_result_input, filter_results_by_temp, should_show_help
import models
import pipeline as pipeline_context
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
    """Remove temporary files from pipeline results.

    Accepts a single result object or a list of them. Results flagged
    is_temp=True are deleted from disk (together with their .tag and
    .metadata sidecars); the remaining permanent results are re-emitted
    for downstream pipeline stages.

    Returns:
        0 on success; 1 when there were no results to process.

    Typical pipeline usage:
        download-data url | screen-shot | add-tag -store local "tag" --all | cleanup
    """
    # Help
    if should_show_help(args):
        log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
        return 0
    # Normalize input to list
    results = normalize_result_input(result)
    if not results:
        log("[cleanup] No results to process", file=sys.stderr)
        return 1
    # Separate temporary and permanent results.
    # NOTE(review): filter_results_by_temp is imported from ._shared at the top
    # of this file, but the pipeline_context variant is called here — confirm
    # both exist and behave identically, or drop the unused import.
    temp_results = pipeline_context.filter_results_by_temp(results, include_temp=True)
    perm_results = pipeline_context.filter_results_by_temp(results, include_temp=False)
    # Delete temporary files (best-effort: failures are logged, not fatal).
    deleted_count = 0
    for temp_result in temp_results:
        try:
            file_path = get_pipe_object_path(temp_result)
            if file_path:
                path_obj = Path(file_path)
                if path_obj.exists():
                    # Delete the file
                    path_obj.unlink()
                    log(f"[cleanup] Deleted temporary file: {path_obj.name}", file=sys.stderr)
                    deleted_count += 1
                    # Clean up any associated sidecar files
                    for ext in ['.tag', '.metadata']:
                        sidecar = path_obj.parent / (path_obj.name + ext)
                        if sidecar.exists():
                            try:
                                sidecar.unlink()
                                log(f"[cleanup] Deleted sidecar: {sidecar.name}", file=sys.stderr)
                            except Exception as e:
                                log(f"[cleanup] Warning: Could not delete sidecar {sidecar.name}: {e}", file=sys.stderr)
                else:
                    log(f"[cleanup] File does not exist: {file_path}", file=sys.stderr)
        except Exception as e:
            log(f"[cleanup] Error deleting file: {e}", file=sys.stderr)
    # Log summary
    log(f"[cleanup] Deleted {deleted_count} temporary file(s), emitting {len(perm_results)} permanent result(s)", file=sys.stderr)
    # Emit permanent results for downstream processing
    for perm_result in perm_results:
        pipeline_context.emit(perm_result)
    return 0
# NOTE(review): CMDLET is bound to the return value of .register() — confirm
# register() returns the Cmdlet instance (fluent style); if it returns None,
# the help path in _run would fail on CMDLET.name.
CMDLET = Cmdlet(
    name="cleanup",
    summary="Remove temporary artifacts from pipeline (marked with is_temp=True).",
    usage="cleanup",
    arg=[],
    detail=[
        "- Accepts pipeline results that may contain temporary files (screenshots, intermediate artifacts)",
        "- Deletes files marked with is_temp=True from disk",
        "- Also cleans up associated sidecar files (.tag, .metadata)",
        "- Emits only non-temporary results for further processing",
        "- Typical usage at end of pipeline: ... | add-tag -store local \"tag\" --all | cleanup",
        "- Exit code 0 if cleanup successful, 1 if no results to process",
    ],
    exec=_run,
).register()

View File

@@ -8,12 +8,12 @@ from pathlib import Path
from SYS.logger import debug, log
from Store.Folder import Folder
from Store import Store
from ._shared import Cmdlet, CmdletArg, normalize_hash, looks_like_hash, get_field, should_show_help
from . import _shared as sh
from API import HydrusNetwork as hydrus_wrapper
import pipeline as ctx
class Delete_File(Cmdlet):
class Delete_File(sh.Cmdlet):
"""Class-based delete-file cmdlet with self-registration."""
def __init__(self) -> None:
@@ -23,10 +23,10 @@ class Delete_File(Cmdlet):
usage="delete-file [-hash <sha256>] [-conserve <local|hydrus>] [-lib-root <path>] [reason]",
alias=["del-file"],
arg=[
CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."),
CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."),
CmdletArg("lib-root", description="Path to local library root for database cleanup."),
CmdletArg("reason", description="Optional reason for deletion (free text)."),
sh.CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."),
sh.CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."),
sh.CmdletArg("lib-root", description="Path to local library root for database cleanup."),
sh.CmdletArg("reason", description="Optional reason for deletion (free text)."),
],
detail=[
"Default removes both the local file and Hydrus file.",
@@ -45,24 +45,28 @@ class Delete_File(Cmdlet):
if isinstance(item, dict):
hash_hex_raw = item.get("hash_hex") or item.get("hash")
target = item.get("target") or item.get("file_path") or item.get("path")
title_val = item.get("title") or item.get("name")
else:
hash_hex_raw = get_field(item, "hash_hex") or get_field(item, "hash")
target = get_field(item, "target") or get_field(item, "file_path") or get_field(item, "path")
hash_hex_raw = sh.get_field(item, "hash_hex") or sh.get_field(item, "hash")
target = sh.get_field(item, "target") or sh.get_field(item, "file_path") or sh.get_field(item, "path")
title_val = sh.get_field(item, "title") or sh.get_field(item, "name")
store = None
if isinstance(item, dict):
store = item.get("store")
else:
store = get_field(item, "store")
store = sh.get_field(item, "store")
store_lower = str(store).lower() if store else ""
is_hydrus_store = bool(store_lower) and ("hydrus" in store_lower or store_lower in {"home", "work"})
store_label = str(store) if store else "default"
hydrus_prefix = f"[hydrusnetwork:{store_label}]"
# For Hydrus files, the target IS the hash
if is_hydrus_store and not hash_hex_raw:
hash_hex_raw = target
hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(hash_hex_raw)
hash_hex = sh.normalize_hash(override_hash) if override_hash else sh.normalize_hash(hash_hex_raw)
local_deleted = False
local_target = isinstance(target, str) and target.strip() and not str(target).lower().startswith(("http://", "https://"))
@@ -156,19 +160,28 @@ class Delete_File(Cmdlet):
try:
client._post("/add_files/delete_files", data=payload) # type: ignore[attr-defined]
hydrus_deleted = True
preview = hash_hex[:12] + ('' if len(hash_hex) > 12 else '')
debug(f"Deleted from Hydrus: {preview}", file=sys.stderr)
title_str = str(title_val).strip() if title_val else ""
if title_str:
debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr)
else:
debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr)
except Exception:
# If it's not in Hydrus (e.g. 404 or similar), that's fine
if not local_deleted:
return False
if hydrus_deleted and hash_hex:
preview = hash_hex[:12] + ('' if len(hash_hex) > 12 else '')
title_str = str(title_val).strip() if title_val else ""
if reason:
ctx.emit(f"Deleted {preview} (reason: {reason}).")
if title_str:
ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex} (reason: {reason}).")
else:
ctx.emit(f"Deleted {preview}.")
ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex} (reason: {reason}).")
else:
if title_str:
ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}.")
else:
ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex}.")
if hydrus_deleted or local_deleted:
return True
@@ -178,7 +191,7 @@ class Delete_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Execute delete-file command."""
if should_show_help(args):
if sh.should_show_help(args):
log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}")
return 0

View File

@@ -7,16 +7,16 @@ import sys
from SYS.logger import log
import pipeline as ctx
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
normalize_hash,
parse_cmdlet_args,
normalize_result_input,
get_field,
should_show_help,
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
get_field = sh.get_field
should_show_help = sh.should_show_help
from Store import Store
from SYS.utils import sha256_file

View File

@@ -10,7 +10,16 @@ import sys
from SYS.logger import log
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_hash, normalize_result_input, get_field, should_show_help
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_hash = sh.normalize_hash
normalize_result_input = sh.normalize_result_input
get_field = sh.get_field
should_show_help = sh.should_show_help
from API.folder import API_folder_store
from Store import Store
from config import get_local_storage_path

View File

@@ -7,7 +7,15 @@ import sys
import models
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, parse_tag_arguments, should_show_help, get_field
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_tag_arguments = sh.parse_tag_arguments
should_show_help = sh.should_show_help
get_field = sh.get_field
from SYS.logger import debug, log
from Store import Store

View File

@@ -4,7 +4,16 @@ from typing import Any, Dict, Sequence
import sys
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from . import _shared as sh
Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
sh.Cmdlet,
sh.CmdletArg,
sh.SharedArgs,
sh.parse_cmdlet_args,
sh.get_field,
sh.normalize_hash,
)
from SYS.logger import log
from Store import Store

View File

@@ -17,15 +17,15 @@ from SYS.download import DownloadError, _download_direct_file
from SYS.logger import log, debug
import pipeline as pipeline_context
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
parse_cmdlet_args,
register_url_with_local_library,
coerce_to_pipe_object,
get_field,
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
register_url_with_local_library = sh.register_url_with_local_library
coerce_to_pipe_object = sh.coerce_to_pipe_object
get_field = sh.get_field
class Download_File(Cmdlet):
@@ -251,6 +251,13 @@ class Download_File(Cmdlet):
# Fallback: if we have a direct HTTP URL, download it directly
if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
# Guard: provider landing pages (e.g. LibGen ads.php) are HTML, not files.
# Never download these as "files".
if str(table or "").lower() == "libgen":
low = target.lower()
if ("/ads.php" in low) or ("/file.php" in low) or ("/index.php" in low):
log("[download-file] Refusing to download LibGen landing page (expected provider to resolve file link)", file=sys.stderr)
continue
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")
result_obj = _download_direct_file(target, final_output_dir, quiet=quiet_mode)
file_path = None

View File

@@ -38,7 +38,18 @@ from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLog
import pipeline as pipeline_context
from result_table import ResultTable
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, parse_cmdlet_args, register_url_with_local_library, coerce_to_pipe_object
from tool.ytdlp import YtDlpTool
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
create_pipe_object_result = sh.create_pipe_object_result
parse_cmdlet_args = sh.parse_cmdlet_args
register_url_with_local_library = sh.register_url_with_local_library
coerce_to_pipe_object = sh.coerce_to_pipe_object
get_field = sh.get_field
# Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats)
@@ -62,6 +73,136 @@ _EXTRACTOR_CACHE: List[Any] | None = None
_YTDLP_PROGRESS_BAR = ProgressBar()
_SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc")
def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]:
"""Format yt-dlp chapter metadata into a stable, note-friendly text.
Output is one chapter per line, e.g.:
00:00 Intro
01:23-02:10 Topic name
"""
try:
chapters = info.get("chapters")
except Exception:
chapters = None
if not isinstance(chapters, list) or not chapters:
return None
rows: List[tuple[int, Optional[int], str]] = []
max_t = 0
for ch in chapters:
if not isinstance(ch, dict):
continue
start_raw = ch.get("start_time")
end_raw = ch.get("end_time")
title_raw = ch.get("title") or ch.get("name") or ch.get("chapter")
try:
start_s = int(float(start_raw))
except Exception:
continue
end_s: Optional[int] = None
try:
if end_raw is not None:
end_s = int(float(end_raw))
except Exception:
end_s = None
title = str(title_raw).strip() if title_raw is not None else ""
rows.append((start_s, end_s, title))
try:
max_t = max(max_t, start_s, end_s or 0)
except Exception:
max_t = max(max_t, start_s)
if not rows:
return None
force_hours = bool(max_t >= 3600)
def _tc(seconds: int) -> str:
total = max(0, int(seconds))
minutes, secs = divmod(total, 60)
hours, minutes = divmod(minutes, 60)
if force_hours:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
return f"{minutes:02d}:{secs:02d}"
lines: List[str] = []
for start_s, end_s, title in sorted(rows, key=lambda r: (r[0], r[1] if r[1] is not None else 10**9, r[2])):
if end_s is not None and end_s > start_s:
prefix = f"{_tc(start_s)}-{_tc(end_s)}"
else:
prefix = _tc(start_s)
line = f"{prefix} {title}".strip()
if line:
lines.append(line)
text = "\n".join(lines).strip()
return text or None
def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]:
    """Locate the most likely subtitle sidecar next to a downloaded media file.

    Candidates share the media file's stem and carry a known subtitle
    extension. Preference order: English-tagged names first, then by
    extension (.vtt best), then most recently modified, then name.
    Returns None when nothing matches or on any filesystem error.
    """
    try:
        stem = media_path.stem
        if not stem:
            return None
        found: List[Path] = []
        for candidate in media_path.parent.glob(stem + ".*"):
            try:
                if not candidate.is_file():
                    continue
            except Exception:
                continue
            if candidate.suffix.lower() in _SUBTITLE_EXTS:
                found.append(candidate)
        if not found:
            return None

        ext_priority = {".vtt": 0, ".srt": 1, ".ass": 2, ".ssa": 3, ".lrc": 4}

        def _score(path: Path) -> tuple[int, int, float, str]:
            lower_name = path.name.lower()
            suffix = path.suffix.lower()
            english = 0 if (".en." in lower_name or lower_name.endswith(".en" + suffix)) else 1
            try:
                modified = float(path.stat().st_mtime)
            except Exception:
                modified = 0.0
            # Negate mtime so newer files rank earlier under ascending order.
            return (english, ext_priority.get(suffix, 9), -modified, lower_name)

        return min(found, key=_score)
    except Exception:
        return None
def _read_text_file(path: Path, *, max_bytes: int = 1_500_000) -> Optional[str]:
try:
data = path.read_bytes()
except Exception:
return None
if not data:
return None
if len(data) > max_bytes:
data = data[:max_bytes]
try:
return data.decode("utf-8", errors="replace")
except Exception:
try:
return data.decode(errors="replace")
except Exception:
return None
def _ensure_yt_dlp_ready() -> None:
if yt_dlp is not None:
return
@@ -100,16 +241,26 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
ydl_opts["noplaylist"] = True
if playlist_items:
ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
debug(f"Fetching format list for: {url}")
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False)
formats = info.get("formats", [])
if not formats:
if not isinstance(info, dict):
log("No formats available", file=sys.stderr)
return None
result_formats = []
formats = info.get("formats") or []
if not isinstance(formats, list) or not formats:
log("No formats available", file=sys.stderr)
return None
result_formats: List[Dict[str, Any]] = []
for fmt in formats:
result_formats.append({
if not isinstance(fmt, dict):
continue
result_formats.append(
{
"format_id": fmt.get("format_id", ""),
"format": fmt.get("format", ""),
"ext": fmt.get("ext", ""),
@@ -122,9 +273,11 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
"filesize": fmt.get("filesize"),
"abr": fmt.get("abr"),
"tbr": fmt.get("tbr"),
})
}
)
debug(f"Found {len(result_formats)} available formats")
return result_formats
return result_formats or None
except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None
@@ -215,6 +368,31 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
cmd = ["yt-dlp"]
if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]])
if ytdl_options.get("merge_output_format"):
cmd.extend(["--merge-output-format", str(ytdl_options["merge_output_format"])])
# For CLI downloads, infer chapter/metadata embedding from either legacy flags
# or explicit FFmpegMetadata postprocessor entries.
postprocessors = ytdl_options.get("postprocessors")
want_add_metadata = bool(ytdl_options.get("addmetadata"))
want_embed_chapters = bool(ytdl_options.get("embedchapters"))
if isinstance(postprocessors, list):
for pp in postprocessors:
if not isinstance(pp, dict):
continue
if str(pp.get("key") or "") == "FFmpegMetadata":
want_add_metadata = True
if bool(pp.get("add_chapters", True)):
want_embed_chapters = True
if want_add_metadata:
cmd.append("--add-metadata")
if want_embed_chapters:
cmd.append("--embed-chapters")
if ytdl_options.get("writesubtitles"):
cmd.append("--write-sub")
cmd.append("--write-auto-sub")
cmd.extend(["--sub-format", "vtt"])
if ytdl_options.get("force_keyframes_at_cuts"):
cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None
cmd.extend(["-o", section_outtmpl])
@@ -258,11 +436,6 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path)
else:
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
if opts.no_playlist:
base_options["noplaylist"] = True
@@ -274,6 +447,37 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = ["res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"]
# Optional yt-dlp features
if getattr(opts, "embed_chapters", False):
# Prefer explicit FFmpegMetadata PP so chapter embedding runs even when
# we already specified other postprocessors (e.g. FFmpegExtractAudio).
pps = base_options.get("postprocessors")
if not isinstance(pps, list):
pps = []
already_has_metadata = any(
isinstance(pp, dict) and str(pp.get("key") or "") == "FFmpegMetadata" for pp in pps
)
if not already_has_metadata:
pps.append(
{
"key": "FFmpegMetadata",
"add_metadata": True,
"add_chapters": True,
"add_infojson": "if_exists",
}
)
base_options["postprocessors"] = pps
# Chapter embedding is most reliable in mkv/mp4 containers.
# When merging separate video+audio streams, prefer mkv so mpv sees chapters.
if opts.mode != "audio":
base_options.setdefault("merge_output_format", "mkv")
if getattr(opts, "write_sub", False):
base_options["writesubtitles"] = True
base_options["writeautomaticsub"] = True
base_options["subtitlesformat"] = "vtt"
if opts.clip_sections:
sections: List[str] = []
@@ -410,13 +614,27 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
for link in soup.find_all('a'):
href = link.get('href')
if href and 'get.php' in href:
return urljoin(libgen_url, href)
try:
from lxml import html as lxml_html
except ImportError:
lxml_html = None
if lxml_html is not None:
doc = lxml_html.fromstring(response.content)
for a in doc.xpath("//a[@href]"):
href = str(a.get("href") or "").strip()
if href and "get.php" in href.lower():
return urljoin(final_url, href)
else:
for m in re.finditer(
r"href=[\"\']([^\"\']+)[\"\']",
response.text or "",
flags=re.IGNORECASE,
):
href = str(m.group(1) or "").strip()
if href and "get.php" in href.lower():
return urljoin(final_url, href)
except Exception:
pass
if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}")
@@ -648,7 +866,7 @@ def _download_direct_file(
raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]:
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15, *, cookiefile: Optional[str] = None) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading.
Args:
@@ -686,12 +904,8 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"noprogress": True, # No progress bars
}
# Add cookies if available (lazy import to avoid circular dependency)
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
if cookiefile:
ydl_opts["cookiefile"] = str(cookiefile)
# Add no_playlist option if specified
if no_playlist:
@@ -807,7 +1021,14 @@ def download_media(
debug(f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download")
probe_result = {"url": opts.url} # Minimal probe result
else:
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15)
probe_cookiefile = None
try:
if opts.cookies_path and opts.cookies_path.is_file():
probe_cookiefile = str(opts.cookies_path)
except Exception:
probe_cookiefile = None
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15, cookiefile=probe_cookiefile)
if probe_result is None:
if not opts.quiet:
@@ -1182,6 +1403,8 @@ class Download_Media(Cmdlet):
try:
debug("Starting download-media")
ytdlp_tool = YtDlpTool(config)
# Parse arguments
parsed = parse_cmdlet_args(args, self)
@@ -1192,7 +1415,6 @@ class Download_Media(Cmdlet):
# If no url provided via args, try to extract from piped result
if not raw_url and result:
from ._shared import get_field
# Handle single result or list of results
results_to_check = result if isinstance(result, list) else [result]
for item in results_to_check:
@@ -1226,6 +1448,10 @@ class Download_Media(Cmdlet):
# Get other options
clip_spec = parsed.get("clip")
# Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them.
embed_chapters = True
write_sub = True
mode = "audio" if parsed.get("audio") else "video"
# Parse clip range(s) if specified
@@ -1379,7 +1605,14 @@ class Download_Media(Cmdlet):
if playlist_items:
return str(requested_url)
try:
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15)
cf = None
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key)
@@ -1458,7 +1691,14 @@ class Download_Media(Cmdlet):
- selected_urls: Optional[List[str]] (expanded per-entry urls when available)
"""
try:
pr = probe_url(url, no_playlist=False, timeout_seconds=15)
cf = None
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
except Exception:
pr = None
if not isinstance(pr, dict):
@@ -1686,6 +1926,15 @@ class Download_Media(Cmdlet):
filesize = fmt.get("filesize")
format_id = fmt.get("format_id", "")
# If the chosen format is video-only (no audio stream), automatically
# request best audio too so the resulting file has sound.
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id
# Format size
size_str = ""
if filesize:
@@ -1729,9 +1978,9 @@ class Download_Media(Cmdlet):
"full_metadata": {
"format_id": format_id,
"url": url,
"item_selector": format_id,
"item_selector": selection_format_id,
},
"_selection_args": ["-format", format_id]
"_selection_args": ["-format", selection_format_id]
}
# Add to results list and table (don't emit - formats should wait for @N selection)
@@ -1778,23 +2027,57 @@ class Download_Media(Cmdlet):
actual_format = playlist_items
actual_playlist_items = None
# Auto-pick best audio format when -audio is used and no explicit format is given.
# For -audio, default to yt-dlp's built-in bestaudio selector.
# This should *not* require interactive format picking.
if mode == "audio" and not actual_format:
chosen = None
actual_format = "bestaudio"
# If no explicit format is provided for video mode, allow a config override.
if mode == "video" and not actual_format:
configured = (ytdlp_tool.default_format("video") or "").strip()
if configured and configured != "bestvideo+bestaudio/best":
actual_format = configured
# If a single format id was chosen and it is video-only, auto-merge best audio.
if (
actual_format
and isinstance(actual_format, str)
and mode != "audio"
and "+" not in actual_format
and "/" not in actual_format
and "[" not in actual_format
and actual_format not in {"best", "bv", "ba", "b"}
):
try:
formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items)
if formats:
chosen = _pick_best_audio_format_id(formats)
actual_format = chosen or "bestaudio/best"
fmt_match = next(
(f for f in formats if str(f.get("format_id", "")) == actual_format),
None,
)
if fmt_match:
vcodec = str(fmt_match.get("vcodec", "none"))
acodec = str(fmt_match.get("acodec", "none"))
if vcodec != "none" and acodec == "none":
debug(
f"Selected video-only format {actual_format}; using {actual_format}+ba for audio"
)
actual_format = f"{actual_format}+ba"
except Exception:
pass
opts = DownloadOptions(
url=url,
mode=mode,
output_dir=final_output_dir,
ytdl_format=actual_format,
cookies_path=ytdlp_tool.resolve_cookiefile(),
clip_sections=clip_sections_spec,
playlist_items=actual_playlist_items,
quiet=quiet_mode,
no_playlist=False,
embed_chapters=embed_chapters,
write_sub=write_sub,
)
# Use timeout wrapper to prevent hanging
@@ -1838,7 +2121,40 @@ class Download_Media(Cmdlet):
# Build PipeObjects first so we can attach cross-clip relationships.
pipe_objects: List[Dict[str, Any]] = []
for downloaded in results_to_emit:
pipe_objects.append(self._build_pipe_object(downloaded, url, opts))
po = self._build_pipe_object(downloaded, url, opts)
# Attach chapter timestamps for downstream consumers (e.g., mpv scripts)
# even if container embedding fails.
try:
info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {}
except Exception:
info = {}
chapters_text = _format_chapters_note(info) if embed_chapters else None
if chapters_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes.setdefault("chapters", chapters_text)
po["notes"] = notes
if write_sub:
try:
media_path = Path(str(po.get("path") or ""))
except Exception:
media_path = None
if media_path is not None and media_path.exists() and media_path.is_file():
sub_path = _best_subtitle_sidecar(media_path)
if sub_path is not None:
sub_text = _read_text_file(sub_path)
if sub_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes["sub"] = sub_text
po["notes"] = notes
pipe_objects.append(po)
# If this is a clip download, decorate titles/tags so the title: tag is clip-based.
# Relationship tags are only added when multiple clips exist.
@@ -1868,6 +2184,95 @@ class Download_Media(Cmdlet):
debug("✓ Downloaded and emitted")
except DownloadError as e:
# Special-case yt-dlp format errors: show a selectable format list table so
# the user can pick a working format_id and continue the pipeline via @N.
cause = getattr(e, "__cause__", None)
detail = ""
try:
detail = str(cause or "")
except Exception:
detail = ""
if "requested format is not available" in (detail or "").lower() and mode != "audio":
formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items)
if formats:
formats_to_show = formats
table = ResultTable()
table.title = f"Available formats for {url}"
table.set_source_command("download-media", [str(a) for a in (args or [])])
results_list: List[Dict[str, Any]] = []
for idx, fmt in enumerate(formats_to_show, 1):
resolution = fmt.get("resolution", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "none")
acodec = fmt.get("acodec", "none")
filesize = fmt.get("filesize")
format_id = fmt.get("format_id", "")
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id
size_str = ""
if filesize:
try:
size_mb = float(filesize) / (1024 * 1024)
size_str = f"{size_mb:.1f}MB"
except Exception:
size_str = ""
desc_parts: List[str] = []
if resolution and resolution != "audio only":
desc_parts.append(str(resolution))
if ext:
desc_parts.append(str(ext).upper())
if vcodec != "none":
desc_parts.append(f"v:{vcodec}")
if acodec != "none":
desc_parts.append(f"a:{acodec}")
if size_str:
desc_parts.append(size_str)
format_desc = " | ".join(desc_parts)
format_dict: Dict[str, Any] = {
"table": "download-media",
"title": f"Format {format_id}",
"url": url,
"target": url,
"detail": format_desc,
"media_kind": "format",
"columns": [
("#", str(idx)),
("ID", format_id),
("Resolution", resolution or "N/A"),
("Ext", ext),
("Video", vcodec),
("Audio", acodec),
("Size", size_str or "N/A"),
],
"full_metadata": {
"format_id": format_id,
"url": url,
"item_selector": selection_format_id,
},
"_selection_args": ["-format", selection_format_id],
}
results_list.append(format_dict)
table.add_result(format_dict)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
# Returning 0 with no emits lets the CLI pause the pipeline for @N selection.
log("Requested format is not available; select a working format with @N", file=sys.stderr)
return 0
log(f"Download failed for {url}: {e}", file=sys.stderr)
except Exception as e:
log(f"Error processing {url}: {e}", file=sys.stderr)

View File

@@ -15,9 +15,9 @@ from pathlib import Path
from typing import Any, Dict, Optional, Sequence
from SYS.logger import log
from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args
from . import _shared as sh
class Download_Torrent(Cmdlet):
class Download_Torrent(sh.Cmdlet):
"""Class-based download-torrent cmdlet with self-registration."""
def __init__(self) -> None:
@@ -27,10 +27,10 @@ class Download_Torrent(Cmdlet):
usage="download-torrent <magnet|.torrent> [options]",
alias=["torrent", "magnet"],
arg=[
CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True),
CmdletArg(name="output", type="string", description="Output directory for downloaded files"),
CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"),
CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"),
sh.CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True),
sh.CmdletArg(name="output", type="string", description="Output directory for downloaded files"),
sh.CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"),
sh.CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"),
],
detail=["Download torrents/magnets via AllDebrid API."],
exec=self.run,
@@ -38,7 +38,7 @@ class Download_Torrent(Cmdlet):
self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
parsed = parse_cmdlet_args(args, self)
parsed = sh.parse_cmdlet_args(args, self)
magnet_args = parsed.get("magnet", [])
output_dir = Path(parsed.get("output") or Path.home() / "Downloads")
wait_timeout = int(float(parsed.get("wait", 600)))

View File

@@ -9,13 +9,13 @@ import subprocess
import webbrowser
import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from . import _shared as sh
from SYS.logger import log, debug
from Store import Store
from config import resolve_output_dir
class Get_File(Cmdlet):
class Get_File(sh.Cmdlet):
"""Export files to local path via hash+store."""
def __init__(self) -> None:
@@ -25,10 +25,10 @@ class Get_File(Cmdlet):
summary="Export file to local path",
usage="@1 | get-file -path C:\\Downloads",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
SharedArgs.PATH,
CmdletArg("name", description="Output filename (default: from metadata title)"),
sh.SharedArgs.HASH,
sh.SharedArgs.STORE,
sh.SharedArgs.PATH,
sh.CmdletArg("name", description="Output filename (default: from metadata title)"),
],
detail=[
"- Exports file from storage backend to local path",
@@ -42,12 +42,12 @@ class Get_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Export file via hash+store backend."""
debug(f"[get-file] run() called with result type: {type(result)}")
parsed = parse_cmdlet_args(args, self)
parsed = sh.parse_cmdlet_args(args, self)
debug(f"[get-file] parsed args: {parsed}")
# Extract hash and store from result or args
file_hash = parsed.get("hash") or get_field(result, "hash")
store_name = parsed.get("store") or get_field(result, "store")
file_hash = parsed.get("hash") or sh.get_field(result, "hash")
store_name = parsed.get("store") or sh.get_field(result, "store")
output_path = parsed.get("path")
output_name = parsed.get("name")
@@ -62,7 +62,7 @@ class Get_File(Cmdlet):
return 1
# Normalize hash
file_hash = normalize_hash(file_hash)
file_hash = sh.normalize_hash(file_hash)
if not file_hash:
log("Error: Invalid hash format")
return 1
@@ -84,9 +84,9 @@ class Get_File(Cmdlet):
def resolve_display_title() -> str:
candidates = [
get_field(result, "title"),
get_field(result, "name"),
get_field(result, "filename"),
sh.get_field(result, "title"),
sh.get_field(result, "name"),
sh.get_field(result, "filename"),
(metadata.get("title") if isinstance(metadata, dict) else None),
(metadata.get("name") if isinstance(metadata, dict) else None),
(metadata.get("filename") if isinstance(metadata, dict) else None),

View File

@@ -7,7 +7,13 @@ import sys
from SYS.logger import log
from pathlib import Path
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
import pipeline as ctx
from result_table import ResultTable
@@ -74,9 +80,15 @@ class Get_Metadata(Cmdlet):
hash_value: Optional[str], pages: Optional[int] = None) -> Dict[str, Any]:
"""Build a table row dict with metadata fields."""
size_mb = None
if isinstance(size_bytes, int):
size_int: Optional[int] = None
if size_bytes is not None:
try:
size_mb = int(size_bytes / (1024 * 1024))
size_int = int(size_bytes)
except Exception:
size_int = None
if isinstance(size_int, int):
try:
size_mb = int(size_int / (1024 * 1024))
except Exception:
size_mb = None
@@ -105,7 +117,7 @@ class Get_Metadata(Cmdlet):
"path": path,
"store": store,
"mime": mime,
"size_bytes": size_bytes,
"size_bytes": size_int,
"duration_seconds": dur_int,
"pages": pages_int,
"imported_ts": imported_ts,
@@ -237,8 +249,8 @@ class Get_Metadata(Cmdlet):
pages=pages,
)
table_title = title
table = ResultTable(table_title).init_command("get-metadata", list(args))
table_title = f"get-metadata: {title}" if title else "get-metadata"
table = ResultTable(table_title).init_command(table_title, "get-metadata", list(args))
self._add_table_body_row(table, row)
ctx.set_last_result_table_overlay(table, [row], row)
ctx.emit(row)

View File

@@ -7,15 +7,15 @@ import sys
from SYS.logger import log
import pipeline as ctx
from ._shared import (
Cmdlet,
CmdletArg,
SharedArgs,
normalize_hash,
parse_cmdlet_args,
normalize_result_input,
should_show_help,
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
from Store import Store
from SYS.utils import sha256_file

View File

@@ -10,7 +10,17 @@ from SYS.logger import log
import models
import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, fmt_bytes, get_hash_for_operation, fetch_hydrus_metadata, should_show_help, get_field
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
fmt_bytes = sh.fmt_bytes
get_hash_for_operation = sh.get_hash_for_operation
fetch_hydrus_metadata = sh.fetch_hydrus_metadata
should_show_help = sh.should_show_help
get_field = sh.get_field
from API.folder import API_folder_store
from config import get_local_storage_path
from result_table import ResultTable
@@ -224,13 +234,14 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
try:
client = None
store_label = "hydrus"
backend_obj = None
if store_name:
# Store specified: do not fall back to a global/default Hydrus client.
store_label = str(store_name)
try:
store = Store(config)
backend = store[str(store_name)]
candidate = getattr(backend, "_client", None)
backend_obj = store[str(store_name)]
candidate = getattr(backend_obj, "_client", None)
if candidate is not None and hasattr(candidate, "get_file_relationships"):
client = candidate
except Exception:
@@ -241,6 +252,74 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
else:
client = hydrus_wrapper.get_client(config)
def _resolve_related_title(rel_hash: str) -> str:
"""Best-effort resolve a Hydrus hash to a human title.
Preference order:
- title: tag from the backend (fast path)
- Hydrus metadata tags via fetch_hydrus_metadata
- fallback to short hash
"""
h = normalize_hash(rel_hash)
if not h:
return str(rel_hash)
# Prefer backend tag extraction when available.
if backend_obj is not None and hasattr(backend_obj, "get_tag"):
try:
tag_result = backend_obj.get_tag(h)
tags = tag_result[0] if isinstance(tag_result, tuple) and tag_result else tag_result
if isinstance(tags, list):
for t in tags:
if isinstance(t, str) and t.lower().startswith("title:"):
val = t.split(":", 1)[1].strip()
if val:
return val
except Exception:
pass
# Fallback: fetch minimal metadata and scan for a title tag.
try:
meta, _ = fetch_hydrus_metadata(
config,
h,
store_name=store_label if store_name else None,
hydrus_client=client,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=False,
include_size=False,
include_mime=False,
)
if isinstance(meta, dict):
tags_payload = meta.get("tags")
tag_candidates: list[str] = []
if isinstance(tags_payload, dict):
for svc_data in tags_payload.values():
if not isinstance(svc_data, dict):
continue
storage = svc_data.get("storage_tags")
if isinstance(storage, dict):
for group in storage.values():
if isinstance(group, list):
tag_candidates.extend([str(x) for x in group if isinstance(x, str)])
display = svc_data.get("display_tags")
if isinstance(display, list):
tag_candidates.extend([str(x) for x in display if isinstance(x, str)])
flat = meta.get("tags_flat")
if isinstance(flat, list):
tag_candidates.extend([str(x) for x in flat if isinstance(x, str)])
for t in tag_candidates:
if isinstance(t, str) and t.lower().startswith("title:"):
val = t.split(":", 1)[1].strip()
if val:
return val
except Exception:
pass
return h[:16] + "..."
if client:
rel = client.get_file_relationships(hash_hex)
if rel:
@@ -274,7 +353,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({
"hash": king_hash,
"type": "king",
"title": king_hash,
"title": _resolve_related_title(king_hash),
"path": None,
"store": store_label,
})
@@ -292,7 +371,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({
"hash": rel_hash_norm,
"type": rel_name,
"title": rel_hash_norm, # Can't resolve title easily without another API call
"title": _resolve_related_title(rel_hash_norm),
"path": None,
"store": store_label,
})
@@ -304,7 +383,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({
"hash": rel_hash_norm,
"type": rel_name,
"title": rel_hash_norm,
"title": _resolve_related_title(rel_hash_norm),
"path": None,
"store": store_label,
})

View File

@@ -27,7 +27,15 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
import pipeline as ctx
from API import HydrusNetwork
from API.folder import read_sidecar, write_sidecar, find_sidecar, API_folder_store
from ._shared import normalize_hash, looks_like_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field
from . import _shared as sh
normalize_hash = sh.normalize_hash
looks_like_hash = sh.looks_like_hash
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
from config import get_local_storage_path

View File

@@ -5,7 +5,15 @@ from typing import Any, Dict, List, Sequence
import sys
import pipeline as ctx
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash
from . import _shared as sh
Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
sh.Cmdlet,
sh.SharedArgs,
sh.parse_cmdlet_args,
sh.get_field,
sh.normalize_hash,
)
from SYS.logger import log
from Store import Store

View File

@@ -12,17 +12,17 @@ import re as _re
from config import resolve_output_dir
from ._shared import (
Cmdlet,
CmdletArg,
create_pipe_object_result,
get_field,
get_pipe_object_hash,
get_pipe_object_path,
normalize_result_input,
parse_cmdlet_args,
should_show_help,
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
create_pipe_object_result = sh.create_pipe_object_result
get_field = sh.get_field
get_pipe_object_hash = sh.get_pipe_object_hash
get_pipe_object_path = sh.get_pipe_object_path
normalize_result_input = sh.normalize_result_input
parse_cmdlet_args = sh.parse_cmdlet_args
should_show_help = sh.should_show_help
import pipeline as ctx

View File

@@ -20,7 +20,16 @@ from urllib.parse import urlsplit, quote, urljoin
from SYS.logger import log, debug
from API.HTTP import HTTPClient
from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, should_show_help, get_field
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
create_pipe_object_result = sh.create_pipe_object_result
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context
# ============================================================================
@@ -33,20 +42,7 @@ import pipeline as pipeline_context
# Playwright & Screenshot Dependencies
# ============================================================================
try:
from playwright.sync_api import (
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
HAS_PLAYWRIGHT = True
except Exception:
HAS_PLAYWRIGHT = False
PlaywrightTimeoutError = TimeoutError # type: ignore
def sync_playwright(*_args: Any, **_kwargs: Any) -> Any: # type: ignore
raise RuntimeError(
"playwright is required for screenshot capture; install with: pip install playwright; then: playwright install"
)
from tool.playwright import HAS_PLAYWRIGHT, PlaywrightTimeoutError, PlaywrightTool
try:
from config import resolve_output_dir
@@ -128,6 +124,7 @@ class ScreenshotOptions:
prefer_platform_target: bool = False
target_selectors: Optional[Sequence[str]] = None
selector_timeout_ms: int = 10_000
playwright_tool: Optional[PlaywrightTool] = None
@dataclass(slots=True)
@@ -324,33 +321,22 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
"""Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}")
playwright = None
browser = None
context = None
try:
debug("Starting Playwright...", flush=True)
playwright = sync_playwright().start()
log("Launching Chromium browser...", flush=True)
tool = options.playwright_tool or PlaywrightTool({})
tool.debug_dump()
log("Launching browser...", flush=True)
format_name = _normalise_format(options.output_format)
headless = options.headless or format_name == "pdf"
debug(f"[_capture] Format: {format_name}, Headless: {headless}")
if format_name == "pdf" and not options.headless:
warnings.append("pdf output requires headless Chromium; overriding headless mode")
browser = playwright.chromium.launch(
headless=headless,
args=["--disable-blink-features=AutomationControlled"],
)
log("Creating browser context...", flush=True)
context = browser.new_context(
user_agent=USER_AGENT,
viewport=DEFAULT_VIEWPORT,
ignore_https_errors=True,
)
page = context.new_page()
with tool.open_page(headless=headless) as page:
log(f"Navigating to {options.url}...", flush=True)
try:
page.goto(options.url, timeout=90_000, wait_until="domcontentloaded")
tool.goto(page, options.url)
log("Page loaded successfully", flush=True)
except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state")
@@ -448,18 +434,6 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
except Exception as exc:
debug(f"[_capture] Exception: {exc}")
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
finally:
log("Cleaning up browser resources...", flush=True)
with contextlib.suppress(Exception):
if context is not None:
context.close()
with contextlib.suppress(Exception):
if browser is not None:
browser.close()
with contextlib.suppress(Exception):
if playwright is not None:
playwright.stop()
log("Cleanup complete", flush=True)
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
@@ -511,8 +485,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
Screenshots are created using Playwright and marked as temporary
so they can be cleaned up later with the cleanup cmdlet.
"""
from ._shared import parse_cmdlet_args
debug(f"[_run] screen-shot invoked with args: {args}")
# Help check
@@ -534,6 +506,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
parsed = parse_cmdlet_args(args, CMDLET)
format_value = parsed.get("format")
if not format_value:
# Default format can be set via config.conf tool block:
# [tool=playwright]
# format="pdf"
try:
tool_cfg = config.get("tool", {}) if isinstance(config, dict) else {}
pw_cfg = tool_cfg.get("playwright") if isinstance(tool_cfg, dict) else None
if isinstance(pw_cfg, dict):
format_value = pw_cfg.get("format")
except Exception:
pass
if not format_value:
format_value = "png"
storage_value = parsed.get("storage")
selector_arg = parsed.get("selector")
selectors = [selector_arg] if selector_arg else []
@@ -669,6 +654,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
prefer_platform_target=False,
wait_for_article=False,
full_page=True,
playwright_tool=PlaywrightTool(config),
)
screenshot_result = _capture_screenshot(options)

View File

@@ -10,7 +10,13 @@ import importlib
from SYS.logger import log, debug
from ProviderCore.registry import get_search_provider, list_search_providers
from ._shared import Cmdlet, CmdletArg, should_show_help
from . import _shared as sh
Cmdlet, CmdletArg, should_show_help = (
sh.Cmdlet,
sh.CmdletArg,
sh.should_show_help,
)
import pipeline as ctx
# Optional dependencies

View File

@@ -10,7 +10,17 @@ import sys
from SYS.logger import log, debug
from ._shared import Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag
from . import _shared as sh
Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag = (
sh.Cmdlet,
sh.CmdletArg,
sh.SharedArgs,
sh.get_field,
sh.should_show_help,
sh.normalize_hash,
sh.first_title_tag,
)
import pipeline as ctx
@@ -209,6 +219,10 @@ class Search_Store(Cmdlet):
table_title += f" [{storage_backend}]"
table = ResultTable(table_title)
try:
table.set_source_command("search-store", list(args_list))
except Exception:
pass
if hash_query:
try:
table.set_preserve_order(True)
@@ -309,6 +323,11 @@ class Search_Store(Cmdlet):
ext_val = Path(path_str).suffix
except Exception:
ext_val = None
if not ext_val and title:
try:
ext_val = Path(str(title)).suffix
except Exception:
ext_val = None
size_bytes = meta_obj.get("size")
if size_bytes is None:
@@ -333,6 +352,20 @@ class Search_Store(Cmdlet):
ctx.emit(payload)
if found_any:
# Title should reflect the command, query, and only stores present in the table.
store_counts: "OrderedDict[str, int]" = OrderedDict()
for row_item in results_list:
store_val = str(row_item.get("store") or "").strip()
if not store_val:
continue
if store_val not in store_counts:
store_counts[store_val] = 0
store_counts[store_val] += 1
counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0)
base_title = f"search-store: {query}".strip()
table.title = f"{base_title} | {counts_part}" if counts_part else base_title
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
db.update_worker_status(worker_id, 'completed')
@@ -377,28 +410,6 @@ class Search_Store(Cmdlet):
log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr)
results = all_results[:limit]
def _format_storage_label(name: str) -> str:
clean = str(name or "").strip()
if not clean:
return "Unknown"
return clean.replace("_", " ").title()
storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends)
for item in results or []:
store = get_field(item, "store")
if not store:
continue
key = str(store).lower()
if key not in storage_counts:
storage_counts[key] = 0
storage_counts[key] += 1
if storage_counts or query:
display_counts = OrderedDict((_format_storage_label(name), count) for name, count in storage_counts.items())
summary_line = table.set_storage_summary(display_counts, query, inline=True)
if summary_line:
table.title = summary_line
if results:
for item in results:
def _as_dict(obj: Any) -> Dict[str, Any]:
@@ -428,6 +439,20 @@ class Search_Store(Cmdlet):
results_list.append(normalized)
ctx.emit(normalized)
# Title should reflect the command, query, and only stores present in the table.
store_counts: "OrderedDict[str, int]" = OrderedDict()
for row_item in results_list:
store_val = str(row_item.get("store") or "").strip()
if not store_val:
continue
if store_val not in store_counts:
store_counts[store_val] = 0
store_counts[store_val] += 1
counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0)
base_title = f"search-store: {query}".strip()
table.title = f"{base_title} | {counts_part}" if counts_part else base_title
ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
else:

View File

@@ -11,14 +11,14 @@ import re
from SYS.logger import log, debug
from SYS.utils import sha256_file
from ._shared import (
Cmdlet,
CmdletArg,
parse_cmdlet_args,
normalize_result_input,
extract_tag_from_result,
extract_title_from_result
)
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
extract_tag_from_result = sh.extract_tag_from_result
extract_title_from_result = sh.extract_title_from_result
import pipeline as ctx
CMDLET = Cmdlet(

View File

@@ -26,9 +26,9 @@ def ensure_registry_loaded() -> None:
def _normalize_mod_name(mod_name: str) -> str:
"""Normalize a command/module name for import resolution."""
normalized = (mod_name or "").strip()
if normalized.startswith('.'):
normalized = normalized.lstrip('.')
normalized = normalized.replace('-', '_')
if normalized.startswith("."):
normalized = normalized.lstrip(".")
normalized = normalized.replace("-", "_")
return normalized
@@ -83,7 +83,7 @@ def get_cmdlet_metadata(cmd_name: str) -> Optional[Dict[str, Any]]:
if data is None:
try:
reg_fn = (REGISTRY or {}).get(cmd_name.replace('_', '-').lower())
reg_fn = (REGISTRY or {}).get(cmd_name.replace("_", "-").lower())
if reg_fn:
owner_mod = getattr(reg_fn, "__module__", "")
if owner_mod:
@@ -186,8 +186,6 @@ def get_cmdlet_arg_flags(cmd_name: str) -> List[str]:
if not meta:
return []
# Preserve the order that arguments are defined on the cmdlet (arg=[...]) so
# completions feel stable and predictable.
flags: List[str] = []
seen: set[str] = set()

View File

@@ -135,7 +135,7 @@ def _render_detail(meta: Dict[str, Any], args: Sequence[str]) -> None:
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
try:
from cmdlet import catalog as _catalog
import cmdlet_catalog as _catalog
CMDLET.arg[0].choices = _normalize_choice_list(_catalog.list_cmdlet_names())
metadata = _catalog.list_cmdlet_metadata()

View File

@@ -16,7 +16,7 @@ from models import PipeObject
from API.folder import LocalLibrarySearchOptimizer
from config import get_local_storage_path, get_hydrus_access_key, get_hydrus_url
from hydrus_health_check import get_cookies_file_path
_ALLDEBRID_UNLOCK_CACHE: Dict[str, str] = {}
@@ -372,12 +372,18 @@ def _build_hydrus_header(config: Dict[str, Any]) -> Optional[str]:
def _build_ytdl_options(config: Optional[Dict[str, Any]], hydrus_header: Optional[str]) -> Optional[str]:
"""Compose ytdl-raw-options string including cookies and optional Hydrus header."""
opts: List[str] = []
cookies_path = None
try:
cookies_path = get_cookies_file_path()
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception:
cookies_path = None
if cookies_path:
opts.append(f"cookies={cookies_path.replace('\\', '/')}")
opts.append(f"cookies={cookies_path.replace('\\', '/')}" )
else:
opts.append("cookies-from-browser=chrome")
if hydrus_header:
@@ -407,10 +413,18 @@ def _is_hydrus_path(path: str, hydrus_url: Optional[str]) -> bool:
return True
return False
def _ensure_ytdl_cookies() -> None:
def _ensure_ytdl_cookies(config: Optional[Dict[str, Any]] = None) -> None:
"""Ensure yt-dlp options are set correctly for this session."""
from pathlib import Path
cookies_path = get_cookies_file_path()
cookies_path = None
try:
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception:
cookies_path = None
if cookies_path:
# Check if file exists and has content (use forward slashes for path checking)
check_path = cookies_path.replace('\\', '/')
@@ -635,7 +649,7 @@ def _queue_items(
pass
# Just verify cookies are configured, don't try to set via IPC
_ensure_ytdl_cookies()
_ensure_ytdl_cookies(config)
hydrus_header = _build_hydrus_header(config or {})
ytdl_opts = _build_ytdl_options(config, hydrus_header)
@@ -1426,7 +1440,15 @@ def _start_mpv(items: List[Any], config: Optional[Dict[str, Any]] = None, start_
hydrus_header = _build_hydrus_header(config or {})
ytdl_opts = _build_ytdl_options(config, hydrus_header)
cookies_path = get_cookies_file_path()
cookies_path = None
try:
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception:
cookies_path = None
if cookies_path:
debug(f"Starting MPV with cookies file: {cookies_path.replace('\\', '/')}")
else:

View File

@@ -1,11 +1,5 @@
"""Unified configuration helpers.
Configuration is defined exclusively via the modular `.conf` format.
- Required: `temp`
- Optional: stores, providers, and other settings
- Modular: optional fragments in `config.d/*.conf` are merged in lexicographic order
"""
"""
from __future__ import annotations
@@ -130,6 +124,21 @@ def _apply_conf_block(config: Dict[str, Any], kind: str, subtype: str, block: Di
provider[provider_name] = dict(block)
return
if kind_l == "tool":
tool_name = str(subtype).strip().lower()
if not tool_name:
return
tool = config.setdefault("tool", {})
if not isinstance(tool, dict):
config["tool"] = {}
tool = config["tool"]
existing = tool.get(tool_name)
if isinstance(existing, dict):
_merge_dict_inplace(existing, block)
else:
tool[tool_name] = dict(block)
return
def parse_conf_text(text: str, *, base: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""Parse a lightweight .conf format into the app's config dict.
@@ -227,7 +236,7 @@ def _serialize_conf(config: Dict[str, Any]) -> str:
# Top-level scalars first
for key in sorted(config.keys()):
if key in {"store", "provider"}:
if key in {"store", "provider", "tool"}:
continue
value = config.get(key)
if isinstance(value, dict):
@@ -263,6 +272,18 @@ def _serialize_conf(config: Dict[str, Any]) -> str:
for k in sorted(block.keys()):
lines.append(f"{k}={_format_conf_value(block.get(k))}")
# Tool blocks
tool = config.get("tool")
if isinstance(tool, dict):
for name in sorted(tool.keys()):
block = tool.get(name)
if not isinstance(block, dict):
continue
lines.append("")
lines.append(f"[tool={name}]")
for k in sorted(block.keys()):
lines.append(f"{k}={_format_conf_value(block.get(k))}")
return "\n".join(lines).rstrip() + "\n"
@@ -510,12 +531,43 @@ def get_provider_credentials(config: Dict[str, Any], provider: str) -> Optional[
def resolve_cookies_path(config: Dict[str, Any], script_dir: Optional[Path] = None) -> Optional[Path]:
value = config.get("cookies")
if value:
# Support both legacy top-level `cookies=...` and the modular conf style:
# [tool=ytdlp]
# cookies="C:\\path\\cookies.txt"
values: list[Any] = []
try:
values.append(config.get("cookies"))
except Exception:
pass
try:
tool = config.get("tool")
if isinstance(tool, dict):
ytdlp = tool.get("ytdlp")
if isinstance(ytdlp, dict):
values.append(ytdlp.get("cookies"))
values.append(ytdlp.get("cookiefile"))
except Exception:
pass
try:
ytdlp_block = config.get("ytdlp")
if isinstance(ytdlp_block, dict):
values.append(ytdlp_block.get("cookies"))
values.append(ytdlp_block.get("cookiefile"))
except Exception:
pass
base_dir = script_dir or SCRIPT_DIR
for value in values:
if not value:
continue
candidate = Path(str(value)).expanduser()
if not candidate.is_absolute():
candidate = (base_dir / candidate).expanduser()
if candidate.is_file():
return candidate
base_dir = script_dir or SCRIPT_DIR
default_path = base_dir / "cookies.txt"
if default_path.is_file():
return default_path

View File

@@ -1,43 +0,0 @@
"""Cookies availability helpers.
This module is intentionally limited to cookie-file resolution used by yt-dlp.
Other service availability checks live in their owning store/provider objects.
"""
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from SYS.logger import debug
# Global state for Cookies availability
_COOKIES_FILE_PATH: Optional[str] = None
def initialize_cookies_check(config: Optional[Dict[str, Any]] = None, emit_debug: bool = True) -> Tuple[bool, str]:
    """Resolve the cookies file path from config, falling back to cookies.txt in the app root.

    Caches the resolved path in the module-level ``_COOKIES_FILE_PATH`` so
    ``get_cookies_file_path`` can serve it later.

    Returns:
        (found, detail_message) — detail is the path when found, "Not found" otherwise.
    """
    global _COOKIES_FILE_PATH
    resolved = None
    try:
        from config import resolve_cookies_path
        resolved = resolve_cookies_path(config or {}, script_dir=Path(__file__).parent)
    except Exception:
        resolved = None
    # Guard clause: anything short of an existing file counts as "not found".
    if not (resolved and resolved.exists()):
        _COOKIES_FILE_PATH = None
        return False, "Not found"
    _COOKIES_FILE_PATH = str(resolved)
    if emit_debug:
        debug(f"Cookies: ENABLED - Found cookies file", file=sys.stderr)
    return True, str(resolved)
def get_cookies_file_path() -> Optional[str]:
    """Return the cached cookies.txt path set by initialize_cookies_check, or None.

    Read-only accessor; call initialize_cookies_check first to populate the cache.
    """
    return _COOKIES_FILE_PATH

View File

@@ -348,6 +348,8 @@ class DownloadOptions:
playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8")
no_playlist: bool = False # If True, pass --no-playlist to yt-dlp
quiet: bool = False # If True, suppress all console output (progress, debug logs)
embed_chapters: bool = False # If True, pass yt-dlp --embed-chapters / embedchapters
write_sub: bool = False # If True, download subtitles (writesubtitles/writeautomaticsub)
class SendFunc(Protocol):

View File

@@ -35,7 +35,7 @@ dependencies = [
"textual>=0.30.0",
# Media processing and downloading
"yt-dlp>=2023.11.0",
"yt-dlp[default]>=2023.11.0",
"yt-dlp-ejs", # EJS challenge solver scripts for YouTube JavaScript challenges
"requests>=2.31.0",
"httpx>=0.25.0",
@@ -43,7 +43,6 @@ dependencies = [
# Document and data handling
"pypdf>=3.0.0",
"img2pdf>=0.6.0",
"mutagen>=1.46.0",
"cbor2>=4.0",
@@ -53,7 +52,6 @@ dependencies = [
# Metadata extraction and processing
"musicbrainzngs>=0.7.0",
"beautifulsoup4>=4.12.0",
"lxml>=4.9.0",
# Advanced searching and libraries

View File

@@ -4,14 +4,13 @@ prompt-toolkit>=3.0.0
textual>=0.30.0
# Media processing and downloading
yt-dlp>=2023.11.0
yt-dlp[default]>=2023.11.0
requests>=2.31.0
httpx>=0.25.0
ffmpeg-python>=0.2.0
# Document and data handling
pypdf>=3.0.0
img2pdf>=0.6.0
mutagen>=1.46.0
cbor2>=4.0
@@ -21,7 +20,6 @@ python-bidi>=0.4.2
# Metadata extraction and processing
musicbrainzngs>=0.7.0
beautifulsoup4>=4.12.0
lxml>=4.9.0
# Advanced searching and libraries

View File

@@ -1,336 +0,0 @@
import requests
import random, string
from concurrent import futures
from tqdm import tqdm
import time
from datetime import datetime
import argparse
import os
import sys
import shutil
import json
import re
import base64
import hashlib
from Crypto.Cipher import AES
from Crypto.Util import Counter
def display_error(response, message):
    """Print a diagnostic message plus the raw HTTP response, then terminate the program."""
    for part in (message, response, response.text):
        print(part)
    exit()
def get_book_infos(session, url):
    """Scrape a book's title, per-page image links and metadata from its details page.

    Args:
        session: authenticated requests.Session.
        url: an https://archive.org/details/<id> page URL.

    Returns:
        (title, links, metadata) on success; exits the process when fewer than
        two page links are found.
    """
    r = session.get(url).text
    # The details page embeds a JSON blob; extract the BookReader info URL from it.
    # NOTE(review): splitting on '"url":"' is brittle and breaks if the page
    # markup changes — confirm against the current archive.org layout.
    infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
    response = session.get(infos_url)
    data = response.json()['data']
    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux)
    title = title[:150] # Trim the title to avoid long file names
    metadata = data['metadata']
    links = []
    # brOptions['data'] is a list of page groups; flatten to one URI per page.
    for item in data['brOptions']['data']:
        for page in item:
            links.append(page['uri'])
    if len(links) > 1:
        print(f"[+] Found {len(links)} pages")
        return title, links, metadata
    else:
        print(f"[-] Error while getting image links")
        exit()
def login(email, password):
    """Log into archive.org and return an authenticated requests.Session.

    Exits the process (via display_error or exit) on token-fetch failure,
    bad credentials, or any other unsuccessful login response.
    """
    session = requests.Session()
    # First request fetches the login token required by the POST below.
    response = session.get("https://archive.org/services/account/login/")
    login_data = response.json()
    if not login_data['success']:
        display_error(response, "[-] Error while getting login token:")
    login_token = login_data["value"]["token"]
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {"username": email, "password": password, "t": login_token}
    response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
    try:
        response_json = response.json()
    except ValueError:
        # Was a bare `except:` — narrowed to the JSON-decode failure it guards
        # (json.JSONDecodeError subclasses ValueError). display_error exits,
        # so response_json is always bound past this point.
        display_error(response, "[-] Error while login:")
    if response_json["success"] == False:
        if response_json["value"] == "bad_login":
            print("[-] Invalid credentials!")
            exit()
        display_error(response, "[-] Error while login:")
    else:
        print("[+] Successful login")
        return session
def loan(session, book_id, verbose=True):
    """Borrow the book so its page images become downloadable.

    Returns the session on success (or when the book needs no borrow);
    exits the process via display_error when borrowing fails.
    """
    data = {
        "action": "grant_access",
        "identifier": book_id
    }
    # First call registers access; its response is intentionally unused.
    response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
    data['action'] = "browse_book"
    response = session.post("https://archive.org/services/loans/loan/", data=data)
    if response.status_code == 400 :
        # 400 either means "no borrow needed" (specific JSON error) or a real failure.
        try:
            if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
                print("This book doesn't need to be borrowed")
                return session
            else :
                display_error(response, "Something went wrong when trying to borrow the book.")
        except: # The response is not in JSON format
            display_error(response, "The book cannot be borrowed")
    data['action'] = "create_token"
    response = session.post("https://archive.org/services/loans/loan/", data=data)
    # A loan token in the body is the success signal.
    if "token" in response.text:
        if verbose:
            print("[+] Successful loan")
        return session
    else:
        display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
def return_loan(session, book_id):
    """Return a previously borrowed book so the loan slot is freed."""
    payload = {
        "action": "return_loan",
        "identifier": book_id
    }
    response = session.post("https://archive.org/services/loans/loan/", data=payload)
    success = response.status_code == 200 and response.json()["success"]
    if success:
        print("[+] Book returned")
    else:
        display_error(response, "Something went wrong when trying to return the book")
def image_name(pages, page, directory):
    """Zero-padded JPEG path for *page* so filenames sort lexicographically (e.g. 007.jpg)."""
    width = len(str(pages))
    return f"{directory}/{str(page).zfill(width)}.jpg"
def deobfuscate_image(image_data, link, obf_header):
    """Undo archive.org's partial image obfuscation (AES-CTR over the first 1024 bytes).

    @Author: https://github.com/justimm

    The X-Obfuscate header has the form "1|<base64 counter>", where the decoded
    counter is 16 bytes: an 8-byte fixed prefix followed by an 8-byte big-endian
    initial counter value. The AES key is the first 16 bytes of the SHA-1 digest
    of the image URL with its protocol/host portion replaced by '/'.
    """
    try:
        version, counter_b64 = obf_header.split('|')
    except Exception as e:
        raise ValueError("Invalid X-Obfuscate header format") from e
    if version != '1':
        raise ValueError("Unsupported obfuscation version: " + version)
    # Key material is the URL path: strip "https://host/" down to a leading '/'.
    key_material = re.sub(r"^https?:\/\/.*?\/", "/", link)
    key = hashlib.sha1(key_material.encode('utf-8')).digest()[:16]
    counter_raw = base64.b64decode(counter_b64)
    if len(counter_raw) != 16:
        raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_raw)}")
    # 64-bit counter: first 8 decoded bytes are the fixed prefix, the last 8
    # (big-endian) give the initial counter value.
    ctr = Counter.new(
        64,
        prefix=counter_raw[:8],
        initial_value=int.from_bytes(counter_raw[8:], byteorder='big'),
        little_endian=False,
    )
    cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
    # Only the first 1024 bytes are encrypted; the remainder is plain image data.
    return cipher.decrypt(image_data[:1024]) + image_data[1024:]
def download_one_image(session, link, i, directory, book_id, pages):
    """Fetch page *i*, de-obfuscating it when the server marks it, and write it as a JPEG."""
    # Browser-like headers; archive.org serves images only with a proper Referer.
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    retry = True
    response = None
    while retry:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 403:
                # Loan expired mid-download: re-borrow, then force a retry.
                session = loan(session, book_id, verbose=False)
                raise Exception("Borrow again")
            elif response.status_code == 200:
                retry = False
        except:
            # NOTE(review): bare except + unconditional retry loops forever on
            # permanent failures (e.g. 404); other non-200/403 statuses also
            # retry, but without the 1s pause — confirm intended behavior.
            time.sleep(1) # Wait 1 second before retrying
    image = image_name(pages, i, directory)
    # X-Obfuscate marks responses whose first 1 KiB is AES-encrypted.
    obf_header = response.headers.get("X-Obfuscate")
    image_content = None
    if obf_header:
        try:
            image_content = deobfuscate_image(response.content, link, obf_header)
        except Exception as e:
            # Skip this page rather than writing corrupt bytes.
            print(f"[ERROR] Deobfuscation failed: {e}")
            return
    else:
        image_content = response.content
    with open(image, "wb") as f:
        f.write(image_content)
def download(session, n_threads, directory, links, scale, book_id):
    """Download all page images concurrently.

    Args:
        session: authenticated (and borrowed) requests.Session.
        n_threads: maximum worker threads.
        directory: destination directory for the JPEGs.
        links: per-page image URIs.
        scale: resolution scale (0 = highest).
        book_id: archive.org identifier, used to re-borrow on 403.

    Returns:
        Ordered list of the image file paths that were written.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)
    tasks = []
    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # BUG FIX: was `i = links.index(link)` inside the loop — O(n) per page
        # and returns the FIRST match, so duplicate links all mapped to the
        # same page number. enumerate() gives each page its true index in O(1).
        for i, link in enumerate(links):
            tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
        # Drain as tasks finish purely to drive the progress bar.
        for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
            pass
    images = [image_name(pages, i, directory) for i in range(len(links))]
    return images
def make_pdf(pdf, title, directory):
    """Write PDF bytes to <directory>/<title>.pdf, appending (1), (2), ... on name clashes."""
    filename = title + ".pdf"
    # Handle the case where multiple books with the same name are downloaded
    suffix = 1
    while os.path.isfile(os.path.join(directory, filename)):
        filename = f"{title}({suffix}).pdf"
        suffix += 1
    with open(os.path.join(directory, filename), "wb") as f:
        f.write(pdf)
    print(f"[+] PDF saved as \"{filename}\"")
if __name__ == "__main__":
    # CLI entry point: parse arguments, log in, then borrow/download/return each book.
    my_parser = argparse.ArgumentParser()
    my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
    my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
    my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
    my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
    my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
    my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
    my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
    my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
    my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
    # With no arguments at all, show usage instead of argparse's "required" error.
    if len(sys.argv) == 1:
        my_parser.print_help(sys.stderr)
        sys.exit(1)
    args = my_parser.parse_args()

    if args.url is None and args.file is None:
        my_parser.error("At least one of --url and --file required")

    email = args.email
    password = args.password
    scale = args.resolution  # image scale: 0 = highest resolution
    n_threads = args.threads
    d = args.dir
    if d == None:
        d = os.getcwd()
    elif not os.path.isdir(d):
        print(f"Output directory does not exist!")
        exit()

    # Collect book URLs either from repeated -u flags or one-per-line from the -f file.
    if args.url is not None:
        urls = args.url
    else:
        if os.path.exists(args.file):
            with open(args.file) as f:
                urls = f.read().strip().split("\n")
        else:
            print(f"{args.file} does not exist!")
            exit()

    # Check the urls format
    for url in urls:
        if not url.startswith("https://archive.org/details/"):
            print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
            exit()

    print(f"{len(urls)} Book(s) to download")
    session = login(email, password)

    for url in urls:
        # The book identifier is the 4th non-empty path segment of the details URL.
        book_id = list(filter(None, url.split("/")))[3]
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")
        session = loan(session, book_id)
        title, links, metadata = get_book_infos(session, url)

        directory = os.path.join(d, title)
        # Handle the case where multiple books with the same name are downloaded
        i = 1
        _directory = directory
        while os.path.isdir(directory):
            directory = f"{_directory}({i})"
            i += 1
        os.makedirs(directory)

        if args.meta:
            print("Writing metadata.json...")
            with open(f"{directory}/metadata.json",'w') as f:
                json.dump(metadata,f)

        images = download(session, n_threads, directory, links, scale, book_id)

        if not args.jpg: # Create pdf with images and remove the images folder
            import img2pdf

            # prepare PDF metadata
            # sometimes archive metadata is missing
            pdfmeta = { }
            # ensure metadata are str
            for key in ["title", "creator", "associated-names"]:
                if key in metadata:
                    if isinstance(metadata[key], str):
                        pass
                    elif isinstance(metadata[key], list):
                        metadata[key] = "; ".join(metadata[key])
                    else:
                        raise Exception("unsupported metadata type")
            # title
            if 'title' in metadata:
                pdfmeta['title'] = metadata['title']
            # author
            if 'creator' in metadata and 'associated-names' in metadata:
                pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
            elif 'creator' in metadata:
                pdfmeta['author'] = metadata['creator']
            elif 'associated-names' in metadata:
                pdfmeta['author'] = metadata['associated-names']
            # date
            if 'date' in metadata:
                try:
                    pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
                except:
                    pass
            # keywords
            pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]

            pdf = img2pdf.convert(images, **pdfmeta)
            make_pdf(pdf, title, args.dir if args.dir != None else "")
            try:
                shutil.rmtree(directory)
            except OSError as e:
                print ("Error: %s - %s." % (e.filename, e.strerror))

        # Free the loan slot for this book.
        # NOTE(review): indentation is ambiguous in the diff — confirm whether this
        # runs once per book (loop level) or only when a PDF was built.
        return_loan(session, book_id)

11
tool/__init__.py Normal file
View File

@@ -0,0 +1,11 @@
"""Tool helpers.
This package contains wrappers around external tools (e.g. yt-dlp) so cmdlets can share
common defaults (cookies, timeouts, format selectors) and users can override them via
`config.conf`.
"""
from .ytdlp import YtDlpTool, YtDlpDefaults
from .playwright import PlaywrightTool, PlaywrightDefaults
__all__ = ["YtDlpTool", "YtDlpDefaults", "PlaywrightTool", "PlaywrightDefaults"]

203
tool/playwright.py Normal file
View File

@@ -0,0 +1,203 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, Iterator, Optional
from SYS.logger import debug
try:
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
HAS_PLAYWRIGHT = True
_PLAYWRIGHT_IMPORT_ERROR: Optional[Exception] = None
except Exception as exc: # pragma: no cover
HAS_PLAYWRIGHT = False
_PLAYWRIGHT_IMPORT_ERROR = exc
PlaywrightTimeoutError = TimeoutError # type: ignore
sync_playwright = None # type: ignore
# Re-export for consumers (e.g. cmdlets catching navigation timeouts)
__all__ = ["HAS_PLAYWRIGHT", "PlaywrightTimeoutError", "PlaywrightTool", "PlaywrightDefaults"]
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
for key in path:
if not isinstance(cur, dict):
return None
cur = cur.get(key)
return cur
@dataclass(slots=True)
class PlaywrightDefaults:
    """Default browser launch/context settings for Playwright sessions.

    NOTE(review): with slots=True, class-attribute access (e.g.
    ``PlaywrightDefaults.browser``) yields a member descriptor, not the field
    default — read defaults from an instance.
    """
    browser: str = "chromium"  # chromium|firefox|webkit
    headless: bool = True
    # Desktop Chrome UA so sites don't serve a bot/mobile variant.
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    viewport_width: int = 1280
    viewport_height: int = 1200
    navigation_timeout_ms: int = 90_000  # applied by PlaywrightTool.goto
    ignore_https_errors: bool = True
class PlaywrightTool:
"""Small wrapper to standardize Playwright defaults and lifecycle.
This is meant to keep cmdlets/providers from duplicating:
- sync_playwright start/stop
- browser launch/context creation
- user-agent/viewport defaults
Config overrides (top-level keys):
- playwright.browser="chromium"
- playwright.headless=true
- playwright.user_agent="..."
- playwright.viewport_width=1280
- playwright.viewport_height=1200
- playwright.navigation_timeout_ms=90000
- playwright.ignore_https_errors=true
"""
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
self._config: Dict[str, Any] = dict(config or {})
self.defaults = self._load_defaults()
def _load_defaults(self) -> PlaywrightDefaults:
cfg = self._config
tool_block = _get_nested(cfg, "tool", "playwright")
if not isinstance(tool_block, dict):
tool_block = {}
pw_block = cfg.get("playwright") if isinstance(cfg.get("playwright"), dict) else {}
if not isinstance(pw_block, dict):
pw_block = {}
def _get(name: str, fallback: Any) -> Any:
val = tool_block.get(name)
if val is None:
val = pw_block.get(name)
if val is None:
val = cfg.get(f"playwright_{name}")
if val is None:
val = _get_nested(cfg, "playwright", name)
return fallback if val is None else val
browser = str(_get("browser", PlaywrightDefaults.browser)).strip().lower() or "chromium"
if browser not in {"chromium", "firefox", "webkit"}:
browser = "chromium"
headless_raw = _get("headless", PlaywrightDefaults.headless)
headless = bool(headless_raw)
ua = str(_get("user_agent", PlaywrightDefaults.user_agent))
def _int(name: str, fallback: int) -> int:
raw = _get(name, fallback)
try:
return int(raw)
except Exception:
return fallback
vw = _int("viewport_width", PlaywrightDefaults.viewport_width)
vh = _int("viewport_height", PlaywrightDefaults.viewport_height)
nav_timeout = _int("navigation_timeout_ms", PlaywrightDefaults.navigation_timeout_ms)
ignore_https = bool(_get("ignore_https_errors", PlaywrightDefaults.ignore_https_errors))
return PlaywrightDefaults(
browser=browser,
headless=headless,
user_agent=ua,
viewport_width=vw,
viewport_height=vh,
navigation_timeout_ms=nav_timeout,
ignore_https_errors=ignore_https,
)
def require(self) -> None:
if HAS_PLAYWRIGHT and sync_playwright is not None:
return
detail = str(_PLAYWRIGHT_IMPORT_ERROR or "playwright is not installed")
raise RuntimeError(
"playwright is required; install with: pip install playwright; then: playwright install\n"
f"detail: {detail}"
)
def open_page(
self,
*,
headless: Optional[bool] = None,
user_agent: Optional[str] = None,
viewport_width: Optional[int] = None,
viewport_height: Optional[int] = None,
ignore_https_errors: Optional[bool] = None,
) -> Iterator[Any]:
"""Context manager yielding a Playwright page with sane defaults."""
self.require()
h = self.defaults.headless if headless is None else bool(headless)
ua = self.defaults.user_agent if user_agent is None else str(user_agent)
vw = self.defaults.viewport_width if viewport_width is None else int(viewport_width)
vh = self.defaults.viewport_height if viewport_height is None else int(viewport_height)
ihe = self.defaults.ignore_https_errors if ignore_https_errors is None else bool(ignore_https_errors)
pw = None
browser = None
context = None
try:
assert sync_playwright is not None
pw = sync_playwright().start()
browser_type = getattr(pw, self.defaults.browser, None)
if browser_type is None:
browser_type = pw.chromium
browser = browser_type.launch(
headless=h,
args=["--disable-blink-features=AutomationControlled"],
)
context = browser.new_context(
user_agent=ua,
viewport={"width": vw, "height": vh},
ignore_https_errors=ihe,
)
page = context.new_page()
yield page
finally:
try:
if context is not None:
context.close()
except Exception:
pass
try:
if browser is not None:
browser.close()
except Exception:
pass
try:
if pw is not None:
pw.stop()
except Exception:
pass
def goto(self, page: Any, url: str) -> None:
"""Navigate with configured timeout."""
try:
page.goto(url, timeout=int(self.defaults.navigation_timeout_ms), wait_until="domcontentloaded")
except Exception:
raise
def debug_dump(self) -> None:
try:
debug(
f"[playwright] browser={self.defaults.browser} headless={self.defaults.headless} "
f"viewport={self.defaults.viewport_width}x{self.defaults.viewport_height} "
f"nav_timeout_ms={self.defaults.navigation_timeout_ms}"
)
except Exception:
pass

195
tool/ytdlp.py Normal file
View File

@@ -0,0 +1,195 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
from SYS.logger import debug
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
for key in path:
if not isinstance(cur, dict):
return None
cur = cur.get(key)
return cur
def _parse_csv_list(value: Any) -> Optional[List[str]]:
if value is None:
return None
if isinstance(value, list):
out: List[str] = []
for item in value:
s = str(item).strip()
if s:
out.append(s)
return out or None
s = str(value).strip()
if not s:
return None
# allow either JSON-ish list strings or simple comma-separated values
if s.startswith("[") and s.endswith("]"):
s = s[1:-1]
parts = [p.strip() for p in s.split(",")]
parts = [p for p in parts if p]
return parts or None
@dataclass(slots=True)
class YtDlpDefaults:
    """User-tunable defaults for yt-dlp behavior.

    Recommended config.conf keys (top-level dotted keys):
    - ytdlp.video_format="bestvideo+bestaudio/best"
    - ytdlp.audio_format="251/140/bestaudio"
    - ytdlp.format_sort="res:2160,res:1440,res:1080,res:720,res"

    Cookies:
    - cookies="C:\\path\\cookies.txt" (already supported by config.resolve_cookies_path)

    NOTE(review): with slots=True, class-attribute access (e.g.
    ``YtDlpDefaults.video_format``) yields a member descriptor, not the field
    default — read defaults from an instance.
    """
    video_format: str = "bestvideo+bestaudio/best"  # yt-dlp --format for video mode
    audio_format: str = "251/140/bestaudio"  # yt-dlp --format for audio mode
    format_sort: Optional[List[str]] = None  # entries passed one-per -S flag
class YtDlpTool:
"""Centralizes yt-dlp defaults and translation helpers.
This is intentionally small and dependency-light so cmdlets can use it without
forcing a full refactor.
"""
def __init__(self, config: Optional[Dict[str, Any]] = None, *, script_dir: Optional[Path] = None) -> None:
self._config: Dict[str, Any] = dict(config or {})
# `resolve_cookies_path` expects the app root so it can fall back to ./cookies.txt.
# This file lives under ./tool/, so default to the parent directory.
self._script_dir = script_dir or Path(__file__).resolve().parent.parent
self.defaults = self._load_defaults()
self._cookiefile: Optional[Path] = self._init_cookiefile()
def _init_cookiefile(self) -> Optional[Path]:
"""Resolve cookies once at tool init (yt-dlp is the primary consumer)."""
try:
from config import resolve_cookies_path
resolved = resolve_cookies_path(self._config, script_dir=self._script_dir)
if resolved is not None and resolved.is_file():
return resolved
except Exception:
pass
return None
def _load_defaults(self) -> YtDlpDefaults:
cfg = self._config
tool_block = _get_nested(cfg, "tool", "ytdlp")
if not isinstance(tool_block, dict):
tool_block = {}
ytdlp_block = cfg.get("ytdlp") if isinstance(cfg.get("ytdlp"), dict) else {}
if not isinstance(ytdlp_block, dict):
ytdlp_block = {}
# Accept both nested and flat styles.
video_format = (
tool_block.get("video_format")
or tool_block.get("format")
or ytdlp_block.get("video_format")
or ytdlp_block.get("video")
or ytdlp_block.get("format_video")
or cfg.get("ytdlp_video_format")
)
audio_format = (
tool_block.get("audio_format")
or ytdlp_block.get("audio_format")
or ytdlp_block.get("audio")
or ytdlp_block.get("format_audio")
or cfg.get("ytdlp_audio_format")
)
# Also accept dotted keys written as nested dicts: ytdlp.format.video, ytdlp.format.audio
nested_video = _get_nested(cfg, "ytdlp", "format", "video")
nested_audio = _get_nested(cfg, "ytdlp", "format", "audio")
fmt_sort_val = (
tool_block.get("format_sort")
or ytdlp_block.get("format_sort")
or ytdlp_block.get("formatSort")
or cfg.get("ytdlp_format_sort")
or _get_nested(cfg, "ytdlp", "format", "sort")
)
fmt_sort = _parse_csv_list(fmt_sort_val)
defaults = YtDlpDefaults(
video_format=str(nested_video or video_format or YtDlpDefaults.video_format),
audio_format=str(nested_audio or audio_format or YtDlpDefaults.audio_format),
format_sort=fmt_sort,
)
return defaults
def resolve_cookiefile(self) -> Optional[Path]:
return self._cookiefile
def default_format(self, mode: str) -> str:
m = str(mode or "").lower().strip()
if m == "audio":
return self.defaults.audio_format
return self.defaults.video_format
def build_yt_dlp_cli_args(
self,
*,
url: str,
output_dir: Optional[Path] = None,
ytdl_format: Optional[str] = None,
playlist_items: Optional[str] = None,
no_playlist: bool = False,
quiet: bool = True,
extra_args: Optional[Sequence[str]] = None,
) -> List[str]:
"""Build a yt-dlp command line (argv list).
This is primarily for debug output or subprocess execution.
"""
argv: List[str] = ["yt-dlp"]
if quiet:
argv.extend(["--quiet", "--no-warnings"])
argv.append("--no-progress")
cookiefile = self.resolve_cookiefile()
if cookiefile is not None:
argv.extend(["--cookies", str(cookiefile)])
if no_playlist:
argv.append("--no-playlist")
if playlist_items:
argv.extend(["--playlist-items", str(playlist_items)])
fmt = (ytdl_format or "").strip()
if fmt:
# Use long form to avoid confusion with app-level flags.
argv.extend(["--format", fmt])
if self.defaults.format_sort:
for sort_key in self.defaults.format_sort:
argv.extend(["-S", sort_key])
if output_dir is not None:
outtmpl = str((output_dir / "%(title)s.%(ext)s").resolve())
argv.extend(["-o", outtmpl])
if extra_args:
argv.extend([str(a) for a in extra_args if str(a).strip()])
argv.append(str(url))
return argv
def debug_print_cli(self, argv: Sequence[str]) -> None:
try:
debug("yt-dlp argv: " + " ".join(str(a) for a in argv))
except Exception:
pass