commit 86918f2ae2 (parent 9873280f0e)
Author: nose
Date:   2025-12-16 23:23:43 -08:00

46 changed files with 2277 additions and 1347 deletions

View File

@@ -244,6 +244,8 @@ class HTTPClient:
         self,
         method: str,
         url: str,
+        raise_for_status: bool = True,
+        log_http_errors: bool = True,
         **kwargs
     ) -> httpx.Response:
         """
@@ -273,6 +275,7 @@ class HTTPClient:
         for attempt in range(self.retries):
             try:
                 response = self._client.request(method, url, **kwargs)
-                response.raise_for_status()
+                if raise_for_status:
+                    response.raise_for_status()
                 return response
             except httpx.TimeoutException as e:
@@ -287,6 +290,7 @@ class HTTPClient:
                     response_text = e.response.text[:500]
                 except:
                     response_text = "<unable to read response>"
-                logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
+                if log_http_errors:
+                    logger.error(f"HTTP {e.response.status_code} from {url}: {response_text}")
                 raise
                 last_exception = e
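The two new flags let callers opt out of httpx's status-raising and the client-level error log, so they can inspect 4xx/5xx responses themselves. A minimal sketch of that behavior, assuming a wrapper around `httpx.Client`; `request_with_flags` is a hypothetical stand-in for `HTTPClient.request`, not the project's code:

```python
import httpx

def request_with_flags(
    client: httpx.Client,
    method: str,
    url: str,
    raise_for_status: bool = True,
    log_http_errors: bool = True,
    **kwargs,
) -> httpx.Response:
    response = client.request(method, url, **kwargs)
    if raise_for_status:
        try:
            response.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
        except httpx.HTTPStatusError:
            if log_http_errors:
                print(f"HTTP {response.status_code} from {url}: {response.text[:500]}")
            raise
    return response

# Callers that want to handle status codes themselves (as the Hydrus client
# below now does) pass raise_for_status=False and get the raw response back.
```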

View File

@@ -71,6 +71,7 @@ class HydrusNetwork:
     url: str
     access_key: str = ""
     timeout: float = 60.0
+    instance_name: str = ""  # Optional store name (e.g., 'home') for namespaced logs
     scheme: str = field(init=False)
     hostname: str = field(init=False)
@@ -90,6 +91,12 @@ class HydrusNetwork:
         self.port = parsed.port or (443 if self.scheme == "https" else 80)
         self.base_path = parsed.path.rstrip("/")
         self.access_key = self.access_key or ""
+        self.instance_name = str(self.instance_name or "").strip()
+
+    def _log_prefix(self) -> str:
+        if self.instance_name:
+            return f"[hydrusnetwork:{self.instance_name}]"
+        return f"[hydrusnetwork:{self.hostname}:{self.port}]"

     # ------------------------------------------------------------------
     # low-level helpers
@@ -120,7 +127,7 @@ class HydrusNetwork:
         url = f"{self.scheme}://{self.hostname}:{self.port}{path}"

         # Log request details
-        logger.debug(f"[Hydrus] {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")
+        logger.debug(f"{self._log_prefix()} {spec.method} {spec.endpoint} (auth: {'session_key' if self._session_key else 'access_key' if self.access_key else 'none'})")

         status = 0
         reason = ""
@@ -135,14 +142,14 @@ class HydrusNetwork:
                 file_path = Path(spec.file_path)
                 if not file_path.is_file():
                     error_msg = f"Upload file not found: {file_path}"
-                    logger.error(f"[Hydrus] {error_msg}")
+                    logger.error(f"{self._log_prefix()} {error_msg}")
                     raise FileNotFoundError(error_msg)

                 file_size = file_path.stat().st_size
                 headers["Content-Type"] = spec.content_type or "application/octet-stream"
                 headers["Content-Length"] = str(file_size)
-                logger.debug(f"[Hydrus] Uploading file {file_path.name} ({file_size} bytes)")
+                logger.debug(f"{self._log_prefix()} Uploading file {file_path.name} ({file_size} bytes)")

                 def file_gen():
                     with file_path.open("rb") as handle:
@@ -153,7 +160,9 @@ class HydrusNetwork:
                     spec.method,
                     url,
                     content=file_gen(),
-                    headers=headers
+                    headers=headers,
+                    raise_for_status=False,
+                    log_http_errors=False,
                 )
             else:
                 content = None
@@ -163,14 +172,16 @@ class HydrusNetwork:
                         content = spec.data
                     else:
                         json_data = spec.data
-                logger.debug(f"[Hydrus] Request body size: {len(content) if content else 'json'}")
+                logger.debug(f"{self._log_prefix()} Request body size: {len(content) if content else 'json'}")

                 response = client.request(
                     spec.method,
                     url,
                     content=content,
                     json=json_data,
-                    headers=headers
+                    headers=headers,
+                    raise_for_status=False,
+                    log_http_errors=False,
                 )

             status = response.status_code
@@ -178,20 +189,14 @@ class HydrusNetwork:
             body = response.content
             content_type = response.headers.get("Content-Type", "") or ""

-            logger.debug(f"[Hydrus] Response {status} {reason} ({len(body)} bytes)")
+            logger.debug(f"{self._log_prefix()} Response {status} {reason} ({len(body)} bytes)")
         except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError) as exc:
             msg = f"Hydrus unavailable: {exc}"
-            logger.warning(f"[Hydrus] {msg}")
+            logger.warning(f"{self._log_prefix()} {msg}")
             raise HydrusConnectionError(msg) from exc
-        except httpx.HTTPStatusError as exc:
-            response = exc.response
-            status = response.status_code
-            reason = response.reason_phrase
-            body = response.content
-            content_type = response.headers.get("Content-Type", "") or ""
         except Exception as exc:
-            logger.error(f"[Hydrus] Connection error: {exc}", exc_info=True)
+            logger.error(f"{self._log_prefix()} Connection error: {exc}", exc_info=True)
             raise

         payload: Any
@@ -220,18 +225,22 @@ class HydrusNetwork:
             else:
                 message = reason or "HTTP error"

-            logger.error(f"[Hydrus] HTTP {status}: {message}")
+            # Some endpoints are naturally "missing" sometimes and should not spam logs.
+            if status == 404 and spec.endpoint.rstrip("/") == "/get_files/file_path":
+                return {}
+
+            logger.error(f"{self._log_prefix()} HTTP {status}: {message}")

             # Handle expired session key (419) by clearing cache and retrying once
             if status == 419 and self._session_key and "session" in message.lower():
-                logger.warning(f"[Hydrus] Session key expired, acquiring new one and retrying...")
+                logger.warning(f"{self._log_prefix()} Session key expired, acquiring new one and retrying...")
                 self._session_key = ""  # Clear expired session key
                 try:
                     self._acquire_session_key()
                     # Retry the request with new session key
                     return self._perform_request(spec)
                 except Exception as retry_error:
-                    logger.error(f"[Hydrus] Retry failed: {retry_error}", exc_info=True)
+                    logger.error(f"{self._log_prefix()} Retry failed: {retry_error}", exc_info=True)
                     # If retry fails, raise the original error
                     raise HydrusRequestError(status, message, payload) from retry_error
@@ -316,6 +325,16 @@ class HydrusNetwork:
     def add_file(self, file_path: Path) -> dict[str, Any]:
         return self._post("/add_files/add_file", file_path=file_path)

+    def undelete_files(self, hashes: Union[str, Iterable[str]]) -> dict[str, Any]:
+        """Restore files from Hydrus trash back into 'my files'.
+
+        Hydrus Client API: POST /add_files/undelete_files
+        Required JSON args: {"hashes": [<sha256 hex>, ...]}
+        """
+        hash_list = self._ensure_hashes(hashes)
+        body = {"hashes": hash_list}
+        return self._post("/add_files/undelete_files", data=body)
+
     def add_tag(self, hash: Union[str, Iterable[str]], tags: Iterable[str], service_name: str) -> dict[str, Any]:
         hash = self._ensure_hashes(hash)
         body = {"hashes": hash, "service_names_to_tags": {service_name: list(tags)}}
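The `undelete_files` docstring pins the endpoint, so the new method reduces to a single POST. A standalone sketch of the equivalent raw call with httpx; the base URL and key are placeholders, not values from this commit:

```python
import httpx

def undelete_via_api(base_url: str, access_key: str, hashes: list[str]) -> dict:
    resp = httpx.post(
        f"{base_url.rstrip('/')}/add_files/undelete_files",
        json={"hashes": hashes},
        headers={"Hydrus-Client-API-Access-Key": access_key},
        timeout=60.0,
    )
    resp.raise_for_status()
    return resp.json() if resp.content else {}

# e.g. undelete_via_api("http://127.0.0.1:45869", "<access key>", ["<sha256 hex>"])
```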

CLI.py
View File

@@ -68,7 +68,7 @@ from typing import Callable
 from config import get_local_storage_path, load_config
-from cmdlet.catalog import (
+from cmdlet_catalog import (
     import_cmd_module as _catalog_import_cmd_module,
     list_cmdlet_metadata as _catalog_list_cmdlet_metadata,
     list_cmdlet_names as _catalog_list_cmdlet_names,
@@ -305,8 +305,6 @@ def _get_table_title_for_command(
     'add_file': 'Results',
     'delete-file': 'Results',
     'delete_file': 'Results',
-    'check-file-status': 'Status',
-    'check_file_status': 'Status',
     'get-metadata': None,
     'get_metadata': None,
 }
@@ -843,10 +841,6 @@ def _create_cmdlet_cli():
     # Load config
     config = _load_cli_config()

-    # Initialize cookies check for yt-dlp
-    from hydrus_health_check import initialize_cookies_check
-    initialize_cookies_check(config, emit_debug=False)
-
     # Initialize debug logging if enabled
     if config:
         from SYS.logger import set_debug
@@ -991,8 +985,6 @@ def _create_cmdlet_cli():
     # Run startup checks and render table
     try:
-        from hydrus_health_check import initialize_cookies_check
-
         # MPV availability is validated by MPV.MPV.__init__.
         try:
             from MPV.mpv_ipc import MPV
@@ -1294,8 +1286,13 @@ def _create_cmdlet_cli():
     # Cookies are used by yt-dlp; keep this centralized utility.
     try:
-        ok, detail = initialize_cookies_check(config, emit_debug=False)
-        _add_startup_check("FOUND" if ok else "MISSING", "Cookies", "N/A", detail or "Not found")
+        from tool.ytdlp import YtDlpTool
+
+        cookiefile = YtDlpTool(config).resolve_cookiefile()
+        if cookiefile is not None:
+            _add_startup_check("FOUND", "Cookies", "N/A", str(cookiefile))
+        else:
+            _add_startup_check("MISSING", "Cookies", "N/A", "Not found")
     except Exception as exc:
         _add_startup_check("ERROR", "Cookies", "N/A", str(exc))
@@ -1580,10 +1577,11 @@ def _execute_pipeline(tokens: list):
     hash_val = getattr(item, 'hash', getattr(item, 'hash_hex', 'N/A'))
     title_val = getattr(item, 'title', 'N/A')
     if hash_val != 'N/A':
-        hash_display = hash_val[:8] + '...' if len(str(hash_val)) > 8 else hash_val
-        print(f" -> hash={hash_display}, title={title_val}")
+        hash_display = str(hash_val)
+        title_display = str(title_val)
+        print(f" -> hash:{hash_display}, title:{title_display}")
     else:
-        print(f" -> title={title_val}")
+        print(f" -> title:{title_val}")
 else:
     print(" -> [source_index out of range]")
 if resolved_list is not None:
@@ -2143,14 +2141,14 @@ def _execute_pipeline(tokens: list):
     display_only_commands = {
         'get-note', 'get_note',
         'get-relationship', 'get_relationship', 'get-file', 'get_file',
-        'check-file-status', 'check_file_status'
     }

     # Commands that manage their own table/history state (e.g. get-tag)
     self_managing_commands = {
         'get-tag', 'get_tag', 'tags',
         'get-url', 'get_url',
         'search-file', 'search_file',
-        'search-provider', 'search_provider'
+        'search-provider', 'search_provider',
+        'search-store', 'search_store'
     }

     overlay_table = ctx.get_display_table() if hasattr(ctx, 'get_display_table') else None
@@ -2382,7 +2380,7 @@ def _execute_cmdlet(cmd_name: str, args: list):
     # Ensure native commands (cmdnat) are loaded
     try:
-        from cmdlet.catalog import ensure_registry_loaded as _ensure_registry_loaded
+        from cmdlet_catalog import ensure_registry_loaded as _ensure_registry_loaded
         _ensure_registry_loaded()
     except Exception:
         pass
@@ -2391,7 +2389,7 @@ def _execute_cmdlet(cmd_name: str, args: list):
     cmd_fn = REGISTRY.get(cmd_name)
     if not cmd_fn:
         # Attempt lazy import of the module and retry
-        from cmdlet.catalog import import_cmd_module as _catalog_import
+        from cmdlet_catalog import import_cmd_module as _catalog_import
         try:
             mod = _catalog_import(cmd_name)
             data = getattr(mod, "CMDLET", None) if mod else None
@@ -2537,13 +2535,13 @@ def _execute_cmdlet(cmd_name: str, args: list):
     display_only_commands = {
         'get-url', 'get_url', 'get-note', 'get_note',
         'get-relationship', 'get_relationship', 'get-file', 'get_file',
-        'check-file-status', 'check_file_status'
     }

     # Commands that manage their own table/history state (e.g. get-tag)
     self_managing_commands = {
         'get-tag', 'get_tag', 'tags',
         'search-file', 'search_file',
-        'search-provider', 'search_provider'
+        'search-provider', 'search_provider',
+        'search-store', 'search_store'
     }

     if cmd_name in self_managing_commands:
@@ -2596,7 +2594,6 @@ def _execute_cmdlet(cmd_name: str, args: list):
     display_only_commands = {
         'get-url', 'get_url', 'get-note', 'get_note',
         'get-relationship', 'get_relationship', 'get-file', 'get_file',
-        'check-file-status', 'check_file_status'
     }
     self_managing_commands = {
         'get-tag', 'get_tag', 'tags',
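The startup check above reduces to "resolve an optional cookies path, report one status row". A sketch of that shape with a hypothetical resolver; `YtDlpTool.resolve_cookiefile` in the diff presumably does something similar against config, and the candidate paths here are illustrative only:

```python
from pathlib import Path
from typing import Optional

def resolve_cookiefile(candidates: list[str]) -> Optional[Path]:
    # Hypothetical: first existing candidate wins.
    for cand in candidates:
        p = Path(cand).expanduser()
        if p.is_file():
            return p
    return None

cookiefile = resolve_cookiefile(["~/.config/medios/cookies.txt", "./cookies.txt"])
status = "FOUND" if cookiefile is not None else "MISSING"
detail = str(cookiefile) if cookiefile is not None else "Not found"
print(f"[startup] Cookies: {status} ({detail})")
```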

View File

@@ -15,11 +15,11 @@ from SYS.logger import log
 from models import ProgressBar

-# Optional dependencies
+# Optional dependency for HTML scraping fallbacks
 try:
-    from bs4 import BeautifulSoup
+    from lxml import html as lxml_html
 except ImportError:
-    BeautifulSoup = None
+    lxml_html = None

 class Libgen(SearchProvider):
@@ -116,7 +116,7 @@ class Libgen(SearchProvider):
         return []

     def validate(self) -> bool:
-        # JSON-based searching can work without BeautifulSoup; HTML parsing is a fallback.
+        # JSON-based searching can work without lxml; HTML parsing is a fallback.
         return True

     def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
@@ -342,8 +342,8 @@ class LibgenSearch:
         Uses a total time budget across mirrors to avoid long hangs.
         """
-        # Prefer JSON API (no BeautifulSoup needed); HTML scraping is a fallback.
-        has_bs4 = BeautifulSoup is not None
+        # Prefer JSON API (no lxml needed); HTML scraping is a fallback.
+        has_lxml = lxml_html is not None

         started = time.monotonic()
@@ -372,7 +372,7 @@ class LibgenSearch:
                 results = []

             if not results:
-                if not has_bs4:
+                if not has_lxml:
                     continue

                 if "libgen.li" in mirror or "libgen.gl" in mirror:
@@ -417,57 +417,73 @@ class LibgenSearch:
         resp = self.session.get(url, params=params, timeout=timeout)
         resp.raise_for_status()

-        if BeautifulSoup is None:
+        if lxml_html is None:
             return []

-        soup = BeautifulSoup(resp.text, "html.parser")
-        table = soup.find("table", {"class": "c"})
-        if not table:
-            tables = soup.find_all("table")
-            for t in tables:
-                if len(t.find_all("tr")) > 5:
-                    table = t
-                    break
-        if not table:
+        def _text(el: Any) -> str:
+            return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip()
+
+        try:
+            doc = lxml_html.fromstring(resp.content)
+        except Exception:
+            return []
+
+        table_nodes = doc.xpath(
+            "//table[contains(concat(' ', normalize-space(@class), ' '), ' c ')]"
+        )
+        table = table_nodes[0] if table_nodes else None
+        if table is None:
+            for t in doc.xpath("//table"):
+                if len(t.xpath(".//tr")) > 5:
+                    table = t
+                    break
+        if table is None:
             return []

         results: List[Dict[str, Any]] = []
-        rows = table.find_all("tr")[1:]
+        rows = table.xpath(".//tr")[1:]

         for row in rows:
-            cols = row.find_all("td")
+            cols = row.xpath("./td")
             if len(cols) < 9:
                 continue

             try:
-                libgen_id = cols[0].get_text(strip=True)
-                authors = [a.get_text(strip=True) for a in cols[1].find_all("a")]
-                if not authors:
-                    authors = [cols[1].get_text(strip=True)]
-                title_tag = cols[2].find("a")
-                title = title_tag.get_text(strip=True) if title_tag else cols[2].get_text(strip=True)
+                libgen_id = _text(cols[0])
+
+                author_links = cols[1].xpath(".//a")
+                authors = [_text(a) for a in author_links if _text(a)]
+                if not authors:
+                    authors = [_text(cols[1])]
+
+                title_tag = None
+                title_links = cols[2].xpath(".//a")
+                if title_links:
+                    title_tag = title_links[0]
+                title = _text(title_tag) if title_tag is not None else _text(cols[2])

                 md5 = ""
-                if title_tag and title_tag.has_attr("href"):
+                if title_tag is not None:
                     href = str(title_tag.get("href") or "")
                     match = re.search(r"md5=([a-fA-F0-9]{32})", href)
                     if match:
                         md5 = match.group(1)

-                publisher = cols[3].get_text(strip=True)
-                year = cols[4].get_text(strip=True)
-                pages = cols[5].get_text(strip=True)
-                language = cols[6].get_text(strip=True)
-                size = cols[7].get_text(strip=True)
-                extension = cols[8].get_text(strip=True)
+                publisher = _text(cols[3])
+                year = _text(cols[4])
+                pages = _text(cols[5])
+                language = _text(cols[6])
+                size = _text(cols[7])
+                extension = _text(cols[8])

-                mirror_links = []
+                mirror_links: List[str] = []
                 for i in range(9, len(cols)):
-                    a = cols[i].find("a")
-                    if a and a.has_attr("href"):
-                        mirror_links.append(a["href"])
+                    a_nodes = cols[i].xpath(".//a[@href]")
+                    if a_nodes:
+                        href = str(a_nodes[0].get("href") or "").strip()
+                        if href:
+                            mirror_links.append(href)

                 if md5:
                     download_link = f"http://library.lol/main/{md5}"
@@ -476,10 +492,11 @@ class LibgenSearch:
                 else:
                     download_link = ""

-                results.append({
+                results.append(
+                    {
                         "id": libgen_id,
                         "title": title,
-                        "author": ", ".join(authors),
+                        "author": ", ".join([a for a in authors if a]) or "Unknown",
                         "publisher": publisher,
                         "year": year,
                         "pages": pages,
@@ -489,11 +506,11 @@ class LibgenSearch:
                         "md5": md5,
                         "mirror_url": download_link,
                         "cover": "",
-                })
+                    }
+                )

                 if len(results) >= limit:
                     break
             except Exception as e:
                 logging.debug(f"Error parsing row: {e}")
                 continue
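Two idioms recur in the lxml rewrite above. The verbose class XPath exists because `@class="c"` would miss `class="c wide"` and `contains(@class, 'c')` would also match `class="compact"`; padding the attribute with spaces and searching for `' c '` matches the class token exactly. And `_text()` is `itertext()` plus whitespace normalization, the lxml analogue of BeautifulSoup's `get_text(" ", strip=True)`. A self-contained demo (no network, HTML inline):

```python
from lxml import html as lxml_html

page = lxml_html.fromstring(
    "<table class='c wide'><tr><td>ok</td></tr></table>"
    "<table class='compact'><tr><td>no</td></tr></table>"
)
hits = page.xpath(
    "//table[contains(concat(' ', normalize-space(@class), ' '), ' c ')]"
)
assert len(hits) == 1  # only the 'c wide' table matches

def _text(el) -> str:
    return " ".join(t.strip() for t in el.itertext() if t and t.strip()).strip()

assert _text(hits[0]) == "ok"
```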
@@ -521,21 +538,35 @@ class LibgenSearch:
         resp = self.session.get(url, params=params, timeout=timeout)
         resp.raise_for_status()

-        if BeautifulSoup is None:
+        if lxml_html is None:
             return []

-        soup = BeautifulSoup(resp.text, "html.parser")
-        table = soup.find("table", {"id": "tablelibgen"})
-        if not table:
-            table = soup.find("table", {"class": "table table-striped"})
-        if not table:
+        def _text(el: Any) -> str:
+            return " ".join([t.strip() for t in el.itertext() if t and str(t).strip()]).strip()
+
+        try:
+            doc = lxml_html.fromstring(resp.content)
+        except Exception:
+            return []
+
+        table_nodes = doc.xpath("//table[@id='tablelibgen']")
+        table = table_nodes[0] if table_nodes else None
+        if table is None:
+            # Common libgen.li/gl fallback
+            table_nodes = doc.xpath(
+                "//table[contains(concat(' ', normalize-space(@class), ' '), ' table ') and "
+                "contains(concat(' ', normalize-space(@class), ' '), ' table-striped ')]"
+            )
+            table = table_nodes[0] if table_nodes else None
+        if table is None:
             return []

         results: List[Dict[str, Any]] = []
-        rows = table.find_all("tr")[1:]
+        rows = table.xpath(".//tr")[1:]

         for row in rows:
-            cols = row.find_all("td")
+            cols = row.xpath("./td")
             if len(cols) < 9:
                 continue
@@ -543,26 +574,30 @@ class LibgenSearch:
             # Extract md5 (libgen.gl exposes /ads.php?md5=... in mirror column)
             md5 = ""
             mirror_url = ""
-            for a in row.find_all("a"):
-                href = a.get("href")
+            for a in row.xpath(".//a[@href]"):
+                href = str(a.get("href") or "")
                 if not href:
                     continue
-                m = re.search(r"md5=([a-fA-F0-9]{32})", str(href))
+                m = re.search(r"md5=([a-fA-F0-9]{32})", href)
                 if m:
                     md5 = m.group(1)
-                if "ads.php" in str(href):
-                    mirror_url = urljoin(mirror, str(href))
+                if "ads.php" in href:
+                    mirror_url = urljoin(mirror, href)
                     break

             if not mirror_url and md5:
                 mirror_url = urljoin(mirror, f"/ads.php?md5={md5}")

             # Extract numeric file id from /file.php?id=...
             libgen_id = ""
-            file_link = row.find("a", href=re.compile(r"/file\.php\?id=\d+"))
-            if file_link and file_link.get("href"):
-                m = re.search(r"id=(\d+)", str(file_link.get("href")))
-                if m:
-                    libgen_id = m.group(1)
+            for a in row.xpath(".//a[@href]"):
+                href = str(a.get("href") or "")
+                if not href:
+                    continue
+                if re.search(r"/file\.php\?id=\d+", href):
+                    m = re.search(r"id=(\d+)", href)
+                    if m:
+                        libgen_id = m.group(1)
+                    break

             title = ""
             authors = ""
@@ -585,7 +620,7 @@ class LibgenSearch:
             if offset is not None:
                 meta_cell = cols[offset]
-                meta_text = " ".join([str(s).strip() for s in meta_cell.stripped_strings if str(s).strip()])
+                meta_text = _text(meta_cell)

                 # Extract ISBNs from meta cell (avoid using them as title)
                 # Matches 10 or 13-digit ISBN with optional leading 978/979.
@@ -601,11 +636,11 @@ class LibgenSearch:
                 # Choose a "real" title from meta cell.
                 # libgen.gl meta can include series/edition/isbn blobs; prefer text with letters.
                 raw_candidates: List[str] = []
-                for a in meta_cell.find_all("a"):
-                    t = a.get_text(" ", strip=True)
+                for a in meta_cell.xpath(".//a"):
+                    t = _text(a)
                     if t:
                         raw_candidates.append(t)
-                for s in meta_cell.stripped_strings:
+                for s in meta_cell.itertext():
                     t = str(s).strip()
                     if t:
                         raw_candidates.append(t)
@@ -645,27 +680,27 @@ class LibgenSearch:
                         best_score = score
                         best_title = cand

-                title = best_title or meta_cell.get_text(" ", strip=True)
-                authors = cols[offset + 1].get_text(" ", strip=True)
-                publisher = cols[offset + 2].get_text(" ", strip=True)
-                year = cols[offset + 3].get_text(" ", strip=True)
-                language = cols[offset + 4].get_text(" ", strip=True)
-                pages = cols[offset + 5].get_text(" ", strip=True)
-                size = cols[offset + 6].get_text(" ", strip=True)
-                extension = cols[offset + 7].get_text(" ", strip=True)
+                title = best_title or _text(meta_cell)
+                authors = _text(cols[offset + 1])
+                publisher = _text(cols[offset + 2])
+                year = _text(cols[offset + 3])
+                language = _text(cols[offset + 4])
+                pages = _text(cols[offset + 5])
+                size = _text(cols[offset + 6])
+                extension = _text(cols[offset + 7])
             else:
                 # Older fallback structure
                 title_col = cols[1]
-                title_link = title_col.find("a")
-                title = title_link.get_text(" ", strip=True) if title_link else title_col.get_text(" ", strip=True)
-                authors = cols[2].get_text(" ", strip=True)
-                publisher = cols[3].get_text(" ", strip=True)
-                year = cols[4].get_text(" ", strip=True)
-                language = cols[5].get_text(" ", strip=True)
-                pages = cols[6].get_text(" ", strip=True)
-                size = cols[7].get_text(" ", strip=True)
-                extension = cols[8].get_text(" ", strip=True)
+                title_links = title_col.xpath(".//a")
+                title = _text(title_links[0]) if title_links else _text(title_col)
+                authors = _text(cols[2])
+                publisher = _text(cols[3])
+                year = _text(cols[4])
+                language = _text(cols[5])
+                pages = _text(cols[6])
+                size = _text(cols[7])
+                extension = _text(cols[8])

             title = (title or "").strip() or "Unknown"
             authors = (authors or "").strip() or "Unknown"
@@ -729,15 +764,49 @@ def _resolve_download_url(
     current_url = url
     visited = set()

-    if BeautifulSoup is None:
-        _call(log_info, "[resolve] BeautifulSoup not available; cannot resolve HTML download chain")
-        return None
+    def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]:
+        """Best-effort HTML link resolver without lxml.
+
+        This is intentionally minimal: it primarily targets LibGen landing pages like
+        `/ads.php?md5=...` which contain a `get.php?md5=...` link.
+        """
+        if not html:
+            return None
+        # Prefer explicit get.php md5 links (most common successful chain).
+        m = re.search(r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', html, flags=re.IGNORECASE)
+        if m:
+            href = str(m.group(1) or "").strip()
+            if href and not href.lower().startswith("javascript:"):
+                return urljoin(base_url, href)
+        # Next: library.lol main links.
+        m = re.search(r'href=["\']([^"\']*library\.lol[^"\']*)["\']', html, flags=re.IGNORECASE)
+        if m:
+            href = str(m.group(1) or "").strip()
+            if href and not href.lower().startswith("javascript:"):
+                return urljoin(base_url, href)
+        # Finally: any direct file extension link.
+        m = re.search(
+            r'href=["\']([^"\']+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\']*)?)["\']',
+            html,
+            flags=re.IGNORECASE,
+        )
+        if m:
+            href = str(m.group(1) or "").strip()
+            if href and not href.lower().startswith("javascript:"):
+                return urljoin(base_url, href)
+        return None

-    def _find_a_by_text(pattern: str) -> Optional[Any]:
-        for a in soup.find_all("a"):
-            t = a.get_text(" ", strip=True)
+    def _find_href_by_text(doc: Any, pattern: str) -> Optional[str]:
+        for a in doc.xpath("//a[@href]"):
+            t = " ".join([s.strip() for s in a.itertext() if s and str(s).strip()]).strip()
             if t and re.search(pattern, t, re.IGNORECASE):
-                return a
+                href = str(a.get("href") or "").strip()
+                if href and not href.lower().startswith("javascript:"):
+                    return href
         return None

     for _ in range(6):
@@ -763,42 +832,58 @@ def _resolve_download_url(
             _call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
             return None

-        soup = BeautifulSoup(content, "html.parser")
+        doc = None
+        if lxml_html is not None:
+            try:
+                doc = lxml_html.fromstring(content)
+            except Exception:
+                doc = None

-        get_link = _find_a_by_text(r"^GET$")
-        if get_link and get_link.has_attr("href"):
-            return urljoin(current_url, str(get_link.get("href") or ""))
+        if doc is None:
+            next_url = _resolve_html_links_regex(current_url, content)
+            if next_url:
+                current_url = next_url
+                continue
+            _call(log_info, "[resolve] lxml not available and regex resolver found no links")
+            return None
+
+        get_href = _find_href_by_text(doc, r"^GET$")
+        if get_href:
+            return urljoin(current_url, get_href)

         if "series.php" in current_url:
-            edition_link = soup.find("a", href=re.compile(r"edition\.php"))
-            if edition_link:
-                current_url = urljoin(current_url, str(edition_link.get("href") or ""))
+            hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
+            if hrefs:
+                current_url = urljoin(current_url, str(hrefs[0] or ""))
                 continue

         if "edition.php" in current_url:
-            file_link = soup.find("a", href=re.compile(r"file\.php"))
-            if file_link:
-                current_url = urljoin(current_url, str(file_link.get("href") or ""))
+            hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
+            if hrefs:
+                current_url = urljoin(current_url, str(hrefs[0] or ""))
                 continue

         if "file.php" in current_url:
-            libgen_link = soup.find("a", title="libgen")
-            if not libgen_link:
-                libgen_link = _find_a_by_text(r"Libgen")
-            if libgen_link and libgen_link.has_attr("href"):
-                current_url = urljoin(current_url, str(libgen_link.get("href") or ""))
+            libgen_href = None
+            for a in doc.xpath("//a[@href]"):
+                if str(a.get("title") or "").strip().lower() == "libgen":
+                    libgen_href = str(a.get("href") or "").strip()
+                    break
+            if not libgen_href:
+                libgen_href = _find_href_by_text(doc, r"Libgen")
+            if libgen_href:
+                current_url = urljoin(current_url, libgen_href)
                 continue

         if "ads.php" in current_url:
-            get_php_link = soup.find("a", href=re.compile(r"get\.php"))
-            if get_php_link:
-                return urljoin(current_url, str(get_php_link.get("href") or ""))
+            hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
+            if hrefs:
+                return urljoin(current_url, str(hrefs[0] or ""))

             for text in ["Cloudflare", "IPFS.io", "Infura"]:
-                link = _find_a_by_text(re.escape(text))
-                if link and link.has_attr("href"):
-                    return urljoin(current_url, str(link.get("href") or ""))
+                href = _find_href_by_text(doc, re.escape(text))
+                if href:
+                    return urljoin(current_url, href)

         break
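The regex fallback above never parses the DOM; it just lifts `href` attribute values out of the raw markup in priority order and joins them against the page URL. A compact demo of the first (and most important) pattern; the hostname is a placeholder:

```python
import re
from urllib.parse import urljoin

html = '<a href="/get.php?md5=0123456789abcdef0123456789abcdef&key=X">GET</a>'
m = re.search(
    r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']',
    html,
    flags=re.IGNORECASE,
)
if m:
    # urljoin resolves the site-relative href against the landing page URL.
    print(urljoin("http://libgen.example/ads.php", m.group(1)))
    # -> http://libgen.example/get.php?md5=0123456789abcdef0123456789abcdef&key=X
```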

View File

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import base64
+import io
 from concurrent import futures
 import hashlib
 import json as json_module
@@ -34,6 +35,53 @@ except ImportError:
     tqdm = None  # type: ignore

+def _image_paths_to_pdf_bytes(images: List[str]) -> Optional[bytes]:
+    if not images:
+        return None
+    try:
+        from PIL import Image  # type: ignore
+    except Exception:
+        return None
+
+    pil_images: List[Any] = []
+    try:
+        for p in images:
+            img_path = Path(p)
+            if not img_path.is_file():
+                continue
+            with Image.open(img_path) as im:  # type: ignore[attr-defined]
+                # Ensure PDF-compatible mode.
+                if im.mode in {"RGBA", "LA", "P"}:
+                    im = im.convert("RGB")
+                else:
+                    im = im.convert("RGB")
+                pil_images.append(im.copy())
+    except Exception:
+        for im in pil_images:
+            try:
+                im.close()
+            except Exception:
+                pass
+        return None
+
+    if not pil_images:
+        return None
+
+    buf = io.BytesIO()
+    first, rest = pil_images[0], pil_images[1:]
+    try:
+        first.save(buf, format="PDF", save_all=True, append_images=rest)
+        return buf.getvalue()
+    except Exception:
+        return None
+    finally:
+        for im in pil_images:
+            try:
+                im.close()
+            except Exception:
+                pass
+
 def _looks_like_isbn(text: str) -> bool:
     t = (text or "").replace("-", "").strip()
     return t.isdigit() and len(t) in (10, 13)
@@ -941,17 +989,11 @@ class OpenLibrary(SearchProvider):
     try:
         images = self._archive_download(session=session, n_threads=10, directory=temp_dir, links=links, scale=3, book_id=archive_id)

-        try:
-            import img2pdf  # type: ignore
-            pdf_bytes = img2pdf.convert(images) if images else None
+        pdf_bytes = _image_paths_to_pdf_bytes(images)

-            if not pdf_bytes:
-                log("[openlibrary] PDF conversion failed", file=sys.stderr)
-                try:
-                    shutil.rmtree(temp_dir)
-                except Exception:
-                    pass
-                return None
+        if not pdf_bytes:
+            # Keep images folder for manual conversion.
+            log("[openlibrary] PDF conversion failed; keeping images folder", file=sys.stderr)
+            return Path(temp_dir)

         pdf_path = unique_path(output_dir / f"{title}.pdf")
         with open(pdf_path, "wb") as f:
@@ -963,10 +1005,6 @@ class OpenLibrary(SearchProvider):
             pass

         return pdf_path
-    except ImportError:
-        # Keep images folder.
-        return Path(temp_dir)
     except Exception:
         try:
             shutil.rmtree(temp_dir)
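The new helper drops the `img2pdf` dependency in favor of Pillow's built-in multi-page PDF writer: save the first page with `save_all=True` and pass the rest via `append_images`. A minimal standalone version of that conversion (file names are placeholders):

```python
import io
from PIL import Image

def pages_to_pdf(paths: list[str]) -> bytes:
    # PDF output needs RGB; convert() also flattens palette/alpha modes.
    pages = [Image.open(p).convert("RGB") for p in paths]
    buf = io.BytesIO()
    pages[0].save(buf, format="PDF", save_all=True, append_images=pages[1:])
    for page in pages:
        page.close()
    return buf.getvalue()

# open("book.pdf", "wb").write(pages_to_pdf(["p1.jpg", "p2.jpg"]))
```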

View File

@@ -281,13 +281,6 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
     if opts.cookies_path and opts.cookies_path.is_file():
         base_options["cookiefile"] = str(opts.cookies_path)
-    else:
-        # Check global cookies file lazily to avoid import cycles
-        from hydrus_health_check import get_cookies_file_path  # local import
-        global_cookies = get_cookies_file_path()
-        if global_cookies:
-            base_options["cookiefile"] = global_cookies
-        else:
+    else:
         # Fallback to browser cookies
         base_options["cookiesfrombrowser"] = ("chrome",)
@@ -453,21 +446,40 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
     # Try to find actual download link in the page
     try:
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(response.content, 'html.parser')
-
-        # Look for download links - LibGen typically has forms with download buttons
-        # Look for all links and forms that might lead to download
-        for link in soup.find_all('a'):
-            href = link.get('href')
-            if href and isinstance(href, str):
-                # Look for direct file links or get.php redirects
-                if 'get.php' in href.lower() or href.endswith(('.pdf', '.epub', '.djvu', '.mobi')):
-                    download_url = href if href.startswith('http') else urljoin(final_url, href)
-                    debug(f"Found download link: {download_url}")
-                    return download_url
-    except ImportError:
-        pass  # BeautifulSoup not available
+        try:
+            from lxml import html as lxml_html
+        except ImportError:
+            lxml_html = None
+
+        if lxml_html is not None:
+            doc = lxml_html.fromstring(response.content)
+            for a in doc.xpath("//a[@href]"):
+                href = str(a.get("href") or "").strip()
+                if not href:
+                    continue
+
+                href_lower = href.lower()
+                if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
+                    download_url = href if href.startswith("http") else urljoin(final_url, href)
+                    debug(f"Found download link: {download_url}")
+                    return download_url
+        else:
+            # Regex fallback
+            for m in re.finditer(
+                r"href=[\"\']([^\"\']+)[\"\']",
+                response.text or "",
+                flags=re.IGNORECASE,
+            ):
+                href = str(m.group(1) or "").strip()
+                if not href or href.lower().startswith("javascript:"):
+                    continue
+                href_lower = href.lower()
+                if "get.php" in href_lower or href_lower.endswith((".pdf", ".epub", ".djvu", ".mobi")):
+                    download_url = href if href.startswith("http") else urljoin(final_url, href)
+                    debug(f"Found download link: {download_url}")
+                    return download_url
+    except Exception:
+        pass

     # If we followed redirects successfully, return the final URL
     # This handles cases where libgen redirects to a direct download mirror
@@ -708,12 +720,7 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
         "noprogress": True,  # No progress bars
     }

-    # Add cookies if available (lazy import to avoid circular dependency)
-    from hydrus_health_check import get_cookies_file_path  # local import
-    global_cookies = get_cookies_file_path()
-    if global_cookies:
-        ydl_opts["cookiefile"] = global_cookies
+    # Cookies are optional for probing; callers should pass cookiefile via DownloadOptions when needed.

     # Add no_playlist option if specified
     if no_playlist:
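After this change, cookies come only from the explicit `DownloadOptions` path, with a browser fallback. The yt-dlp options involved are the standard ones: `cookiefile` (path to a Netscape-format cookies.txt) and `cookiesfrombrowser`. A sketch of the resulting selection logic, factored as a hypothetical helper:

```python
from pathlib import Path
from typing import Optional

def cookie_opts(cookies_path: Optional[Path]) -> dict:
    if cookies_path is not None and cookies_path.is_file():
        return {"cookiefile": str(cookies_path)}
    return {"cookiesfrombrowser": ("chrome",)}  # fallback, as in the diff

# ydl_opts = {"quiet": True, **cookie_opts(Path("cookies.txt"))}
# with yt_dlp.YoutubeDL(ydl_opts) as ydl: ...
```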

View File

@@ -23,6 +23,10 @@ class HydrusNetwork(Store):
     Maintains its own HydrusClient.
     """

+    def _log_prefix(self) -> str:
+        store_name = getattr(self, "NAME", None) or "unknown"
+        return f"[hydrusnetwork:{store_name}]"
+
     def __new__(cls, *args: Any, **kwargs: Any) -> "HydrusNetwork":
         instance = super().__new__(cls)
         name = kwargs.get("NAME")
@@ -109,7 +113,7 @@ class HydrusNetwork(Store):
             raise RuntimeError(f"Hydrus '{self.NAME}' unavailable: {err}") from exc

         # Create a persistent client for this instance (auth via access key by default).
-        self._client = HydrusClient(url=self.URL, access_key=self.API)
+        self._client = HydrusClient(url=self.URL, access_key=self.API, instance_name=self.NAME)

         # Best-effort total count (fast on Hydrus side; does not fetch IDs/hashes).
         try:
@@ -129,7 +133,7 @@ class HydrusNetwork(Store):
             if isinstance(count_val, int):
                 self.total_count = count_val
         except Exception as exc:
-            debug(f"Hydrus total count unavailable for '{self.NAME}': {exc}", file=sys.stderr)
+            debug(f"{self._log_prefix()} total count unavailable: {exc}", file=sys.stderr)

     def name(self) -> str:
         return self.NAME
@@ -167,7 +171,7 @@ class HydrusNetwork(Store):
         try:
             # Compute file hash
             file_hash = sha256_file(file_path)
-            debug(f"File hash: {file_hash}")
+            debug(f"{self._log_prefix()} file hash: {file_hash}")

             # Use persistent client with session key
             client = self._client
@@ -177,11 +181,24 @@ class HydrusNetwork(Store):
             # Check if file already exists in Hydrus
             file_exists = False
             try:
-                metadata = client.fetch_file_metadata(hashes=[file_hash])
+                metadata = client.fetch_file_metadata(
+                    hashes=[file_hash],
+                    include_service_keys_to_tags=False,
+                    include_file_url=False,
+                    include_duration=False,
+                    include_size=False,
+                    include_mime=False,
+                )
                 if metadata and isinstance(metadata, dict):
-                    files = metadata.get("metadata", [])
-                    if files:
-                        file_exists = True
+                    metas = metadata.get("metadata", [])
+                    if isinstance(metas, list) and metas:
+                        # Hydrus returns placeholder rows for unknown hashes.
+                        # Only treat as a real duplicate if it has a concrete file_id.
+                        for meta in metas:
+                            if isinstance(meta, dict) and meta.get("file_id") is not None:
+                                file_exists = True
+                                break
+                if file_exists:
                     log(
                         f"  Duplicate detected - file already in Hydrus with hash: {file_hash}",
                         file=sys.stderr,
@@ -189,9 +206,17 @@ class HydrusNetwork(Store):
             except Exception:
                 pass

+            # If Hydrus reports an existing file, it may be in trash. Best-effort restore it to 'my files'.
+            # This keeps behavior aligned with user expectation: "use API only" and ensure it lands in my files.
+            if file_exists:
+                try:
+                    client.undelete_files([file_hash])
+                except Exception:
+                    pass
+
             # Upload file if not already present
             if not file_exists:
-                log(f"Uploading to Hydrus: {file_path.name}", file=sys.stderr)
+                log(f"{self._log_prefix()} Uploading: {file_path.name}", file=sys.stderr)
                 response = client.add_file(file_path)

                 # Extract hash from response
@@ -207,7 +232,7 @@ class HydrusNetwork(Store):
                     raise Exception(f"Hydrus response missing file hash: {response}")
                 file_hash = hydrus_hash
-            log(f"Hydrus: {file_hash}", file=sys.stderr)
+            log(f"{self._log_prefix()} hash: {file_hash}", file=sys.stderr)

             # Add tags if provided (both for new and existing files)
             if tag_list:
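The duplicate check above hinges on one detail: asking Hydrus for metadata on an unknown hash can still return a row, just without a concrete `file_id`. A sketch of the predicate, using the payload shape as this diff assumes it (`is_known_file` is illustrative, not part of the commit):

```python
def is_known_file(payload: dict) -> bool:
    metas = payload.get("metadata", []) if isinstance(payload, dict) else []
    return any(
        isinstance(meta, dict) and meta.get("file_id") is not None
        for meta in metas
    )

assert is_known_file({"metadata": [{"file_id": 123, "hash": "ab..."}]})
assert not is_known_file({"metadata": [{"hash": "ab..."}]})  # placeholder row
```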
@@ -218,27 +243,27 @@ class HydrusNetwork(Store):
                 service_name = "my tags"
                 try:
-                    debug(f"Adding {len(tag_list)} tag(s) to Hydrus: {tag_list}")
+                    debug(f"{self._log_prefix()} Adding {len(tag_list)} tag(s): {tag_list}")
                     client.add_tag(file_hash, tag_list, service_name)
-                    log(f"Tags added via '{service_name}'", file=sys.stderr)
+                    log(f"{self._log_prefix()} Tags added via '{service_name}'", file=sys.stderr)
                 except Exception as exc:
-                    log(f"⚠️ Failed to add tags: {exc}", file=sys.stderr)
+                    log(f"{self._log_prefix()} ⚠️ Failed to add tags: {exc}", file=sys.stderr)

             # Associate url if provided (both for new and existing files)
             if url:
-                log(f"Associating {len(url)} URL(s) with file", file=sys.stderr)
+                log(f"{self._log_prefix()} Associating {len(url)} URL(s) with file", file=sys.stderr)
                 for url in url:
                     if url:
                         try:
                             client.associate_url(file_hash, str(url))
-                            debug(f"Associated URL: {url}")
+                            debug(f"{self._log_prefix()} Associated URL: {url}")
                         except Exception as exc:
-                            log(f"⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr)
+                            log(f"{self._log_prefix()} ⚠️ Failed to associate URL {url}: {exc}", file=sys.stderr)

             return file_hash
         except Exception as exc:
-            log(f"❌ Hydrus upload failed: {exc}", file=sys.stderr)
+            log(f"{self._log_prefix()} upload failed: {exc}", file=sys.stderr)
             raise

     def search(self, query: str, **kwargs: Any) -> list[Dict[str, Any]]:
@@ -262,7 +287,8 @@ class HydrusNetwork(Store):
         if client is None:
             raise Exception("Hydrus client unavailable")

-        debug(f"Searching Hydrus for: {query}")
+        prefix = self._log_prefix()
+        debug(f"{prefix} Searching for: {query}")

         def _extract_urls(meta_obj: Any) -> list[str]:
             if not isinstance(meta_obj, dict):
@@ -446,7 +472,7 @@ class HydrusNetwork(Store):
                 tags = [query_lower]

         if not tags:
-            debug(f"Found 0 result(s)")
+            debug(f"{prefix} 0 result(s)")
             return []

         # Search files with the tags (unless url: search already produced metadata)
@@ -465,7 +491,7 @@ class HydrusNetwork(Store):
             hashes = search_result.get("hashes", []) if isinstance(search_result, dict) else []

             if not file_ids and not hashes:
-                debug(f"Found 0 result(s)")
+                debug(f"{prefix} 0 result(s)")
                 return []

             if file_ids:
@@ -595,7 +621,7 @@ class HydrusNetwork(Store):
                     "ext": ext,
                 })

-            debug(f"Found {len(results)} result(s)")
+            debug(f"{prefix} {len(results)} result(s)")
             return results[:limit]

         except Exception as exc:
@@ -611,13 +637,13 @@ class HydrusNetwork(Store):
         Only explicit user actions (e.g. the get-file cmdlet) should open files.
         """
-        debug(f"[HydrusNetwork.get_file] Starting for hash: {file_hash[:12]}...")
+        debug(f"{self._log_prefix()} get_file: start hash={file_hash[:12]}...")

         # Build browser URL with access key
         base_url = str(self.URL).rstrip('/')
         access_key = str(self.API)
         browser_url = f"{base_url}/get_files/file?hash={file_hash}&Hydrus-Client-API-Access-Key={access_key}"
-        debug(f"[HydrusNetwork.get_file] Returning URL: {browser_url}")
+        debug(f"{self._log_prefix()} get_file: url={browser_url}")
         return browser_url

     def get_metadata(self, file_hash: str, **kwargs: Any) -> Optional[Dict[str, Any]]:
@@ -632,17 +658,28 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if not client:
-                debug("get_metadata: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} get_metadata: client unavailable")
                 return None

-            # Fetch file metadata
-            payload = client.fetch_file_metadata(hashes=[file_hash], include_service_keys_to_tags=True)
+            # Fetch file metadata with the fields we need for CLI display.
+            payload = client.fetch_file_metadata(
+                hashes=[file_hash],
+                include_service_keys_to_tags=True,
+                include_file_url=True,
+                include_duration=True,
+                include_size=True,
+                include_mime=True,
+            )
             if not payload or not payload.get("metadata"):
                 return None

             meta = payload["metadata"][0]
+            # Hydrus can return placeholder metadata rows for unknown hashes.
+            if not isinstance(meta, dict) or meta.get("file_id") is None:
+                return None

             # Extract title from tags
             title = f"Hydrus_{file_hash[:12]}"
             tags_payload = meta.get("tags", {})
@@ -660,33 +697,109 @@ class HydrusNetwork(Store):
                 if title != f"Hydrus_{file_hash[:12]}":
                     break

-            # Prefer Hydrus-provided extension (e.g. ".webm"); fall back to MIME map if needed.
-            mime_type = meta.get("mime", "")
-            ext_raw = meta.get("ext")
-            ext = str(ext_raw or "").strip().lstrip(".")
-            if not ext and mime_type:
-                try:
-                    from SYS.utils_constant import mime_maps
-                    for category in mime_maps.values():
-                        for _ext_key, info in category.items():
-                            if mime_type in info.get("mimes", []):
-                                ext = str(info.get("ext", "")).strip().lstrip(".")
-                                break
-                        if ext:
-                            break
-                except Exception:
-                    ext = ""
+            # Hydrus may return mime as an int enum, or sometimes a human label.
+            mime_val = meta.get("mime")
+            filetype_human = meta.get("filetype_human") or meta.get("mime_human") or meta.get("mime_string")
+
+            # Determine ext: prefer Hydrus metadata ext, then filetype_human (when it looks like an ext),
+            # then title suffix, then file path suffix.
+            ext = str(meta.get("ext") or "").strip().lstrip(".")
+            if not ext:
+                ft = str(filetype_human or "").strip().lstrip(".").lower()
+                if ft and ft != "unknown filetype" and ft.isalnum() and len(ft) <= 8:
+                    # Treat simple labels like "mp4", "m4a", "webm" as extensions.
+                    ext = ft
+            if not ext and isinstance(title, str) and "." in title:
+                try:
+                    ext = Path(title).suffix.lstrip(".")
+                except Exception:
+                    ext = ""
+            if not ext:
+                try:
+                    path_payload = client.get_file_path(file_hash)
+                    if isinstance(path_payload, dict):
+                        p = path_payload.get("path")
+                        if isinstance(p, str) and p.strip():
+                            ext = Path(p.strip()).suffix.lstrip(".")
+                except Exception:
+                    ext = ""
+
+            # If extension is still unknown, attempt a best-effort lookup from MIME.
+            def _mime_from_ext(ext_value: str) -> str:
+                ext_clean = str(ext_value or "").strip().lstrip(".").lower()
+                if not ext_clean:
+                    return ""
+                try:
+                    for category in mime_maps.values():
+                        info = category.get(ext_clean)
+                        if isinstance(info, dict):
+                            mimes = info.get("mimes")
+                            if isinstance(mimes, list) and mimes:
+                                first = mimes[0]
+                                return str(first)
+                except Exception:
+                    return ""
+                return ""
+
+            # Normalize to a MIME string for CLI output.
+            # Avoid passing through human labels like "unknown filetype".
+            mime_type = ""
+            if isinstance(mime_val, str):
+                candidate = mime_val.strip()
+                if "/" in candidate and candidate.lower() != "unknown filetype":
+                    mime_type = candidate
+            if not mime_type and isinstance(filetype_human, str):
+                candidate = filetype_human.strip()
+                if "/" in candidate and candidate.lower() != "unknown filetype":
+                    mime_type = candidate
+            if not mime_type:
+                mime_type = _mime_from_ext(ext)
+
+            # Normalize size/duration to stable scalar types.
+            size_val = meta.get("size")
+            if size_val is None:
+                size_val = meta.get("size_bytes")
+            try:
+                size_int: int | None = int(size_val) if size_val is not None else None
+            except Exception:
+                size_int = None
+
+            dur_val = meta.get("duration")
+            if dur_val is None:
+                dur_val = meta.get("duration_ms")
+            try:
+                dur_int: int | None = int(dur_val) if dur_val is not None else None
+            except Exception:
+                dur_int = None
+
+            raw_urls = (
+                meta.get("known_urls")
+                or meta.get("urls")
+                or meta.get("url")
+                or []
+            )
+            url_list: list[str] = []
+            if isinstance(raw_urls, str):
+                s = raw_urls.strip()
+                url_list = [s] if s else []
+            elif isinstance(raw_urls, list):
+                url_list = [str(u).strip() for u in raw_urls if isinstance(u, str) and str(u).strip()]

             return {
                 "hash": file_hash,
                 "title": title,
                 "ext": ext,
-                "size": meta.get("size"),
+                "size": size_int,
                 "mime": mime_type,
+                # Keep raw fields available for troubleshooting/other callers.
+                "hydrus_mime": mime_val,
+                "filetype_human": filetype_human,
+                "duration_ms": dur_int,
+                "url": url_list,
             }
         except Exception as exc:
-            debug(f"Failed to get metadata from Hydrus: {exc}")
+            debug(f"{self._log_prefix()} get_metadata failed: {exc}")
             return None

     def get_tag(self, file_identifier: str, **kwargs: Any) -> Tuple[List[str], str]:
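The normalization in `get_metadata` repeats one defensive pattern: accept several field spellings, then coerce to a stable scalar or None. Factored out as a sketch (`_first_int` is illustrative, not part of this commit):

```python
from typing import Optional

def _first_int(meta: dict, *keys: str) -> Optional[int]:
    for key in keys:
        val = meta.get(key)
        if val is not None:
            try:
                return int(val)
            except (TypeError, ValueError):
                return None
    return None

meta = {"size_bytes": "1048576", "duration_ms": 90210}
assert _first_int(meta, "size", "size_bytes") == 1048576
assert _first_int(meta, "duration", "duration_ms") == 90210
```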
@@ -705,13 +818,13 @@ class HydrusNetwork(Store):
         file_hash = str(file_identifier or "").strip().lower()
         if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
-            debug(f"get_tags: invalid file hash '{file_identifier}'")
+            debug(f"{self._log_prefix()} get_tags: invalid file hash '{file_identifier}'")
             return [], "unknown"

         # Get Hydrus client and service info
         client = self._client
         if not client:
-            debug("get_tags: Hydrus client unavailable")
+            debug(f"{self._log_prefix()} get_tags: client unavailable")
             return [], "unknown"

         # Fetch file metadata
@@ -723,12 +836,12 @@ class HydrusNetwork(Store):
         items = payload.get("metadata") if isinstance(payload, dict) else None
         if not isinstance(items, list) or not items:
-            debug(f"get_tags: No metadata returned for hash {file_hash}")
+            debug(f"{self._log_prefix()} get_tags: no metadata for hash {file_hash}")
             return [], "unknown"

         meta = items[0] if isinstance(items[0], dict) else None
         if not isinstance(meta, dict) or meta.get("file_id") is None:
-            debug(f"get_tags: Invalid metadata for hash {file_hash}")
+            debug(f"{self._log_prefix()} get_tags: invalid metadata for hash {file_hash}")
             return [], "unknown"

         # Extract tags using service name
@@ -741,7 +854,7 @@ class HydrusNetwork(Store):
             return tags, "hydrus"
         except Exception as exc:
-            debug(f"get_tags failed for Hydrus file: {exc}")
+            debug(f"{self._log_prefix()} get_tags failed: {exc}")
             return [], "unknown"

     def add_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool:
@@ -750,12 +863,12 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("add_tag: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} add_tag: client unavailable")
                 return False

             file_hash = str(file_identifier or "").strip().lower()
             if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
-                debug(f"add_tag: invalid file hash '{file_identifier}'")
+                debug(f"{self._log_prefix()} add_tag: invalid file hash '{file_identifier}'")
                 return False

             service_name = kwargs.get("service_name") or "my tags"
             # Ensure tags is a list
@@ -765,7 +878,7 @@ class HydrusNetwork(Store):
             client.add_tag(file_hash, tag_list, service_name)
             return True
         except Exception as exc:
-            debug(f"Hydrus add_tag failed: {exc}")
+            debug(f"{self._log_prefix()} add_tag failed: {exc}")
             return False

     def delete_tag(self, file_identifier: str, tags: List[str], **kwargs: Any) -> bool:
@@ -774,12 +887,12 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("delete_tag: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} delete_tag: client unavailable")
                 return False

             file_hash = str(file_identifier or "").strip().lower()
             if len(file_hash) != 64 or not all(ch in "0123456789abcdef" for ch in file_hash):
-                debug(f"delete_tag: invalid file hash '{file_identifier}'")
+                debug(f"{self._log_prefix()} delete_tag: invalid file hash '{file_identifier}'")
                 return False

             service_name = kwargs.get("service_name") or "my tags"
             tag_list = list(tags) if isinstance(tags, (list, tuple)) else [str(tags)]
@@ -788,7 +901,7 @@ class HydrusNetwork(Store):
             client.delete_tag(file_hash, tag_list, service_name)
             return True
         except Exception as exc:
-            debug(f"Hydrus delete_tag failed: {exc}")
+            debug(f"{self._log_prefix()} delete_tag failed: {exc}")
             return False

     def get_url(self, file_identifier: str, **kwargs: Any) -> List[str]:
@@ -797,7 +910,7 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("get_url: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} get_url: client unavailable")
                 return []

             file_hash = str(file_identifier or "").strip().lower()
@@ -830,7 +943,7 @@ class HydrusNetwork(Store):
                 return out
             return []
         except Exception as exc:
-            debug(f"Hydrus get_url failed: {exc}")
+            debug(f"{self._log_prefix()} get_url failed: {exc}")
             return []

     def add_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool:
@@ -839,13 +952,13 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("add_url: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} add_url: client unavailable")
                 return False

             for u in url:
                 client.associate_url(file_identifier, u)
             return True
         except Exception as exc:
-            debug(f"Hydrus add_url failed: {exc}")
+            debug(f"{self._log_prefix()} add_url failed: {exc}")
             return False

     def delete_url(self, file_identifier: str, url: List[str], **kwargs: Any) -> bool:
@@ -854,13 +967,13 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("delete_url: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} delete_url: client unavailable")
                 return False

             for u in url:
                 client.delete_url(file_identifier, u)
             return True
         except Exception as exc:
-            debug(f"Hydrus delete_url failed: {exc}")
+            debug(f"{self._log_prefix()} delete_url failed: {exc}")
             return False

     def get_note(self, file_identifier: str, **kwargs: Any) -> Dict[str, str]:
@@ -868,7 +981,7 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("get_note: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} get_note: client unavailable")
                 return {}

             file_hash = str(file_identifier or "").strip().lower()
@@ -889,7 +1002,7 @@ class HydrusNetwork(Store):
             return {}
         except Exception as exc:
-            debug(f"Hydrus get_note failed: {exc}")
+            debug(f"{self._log_prefix()} get_note failed: {exc}")
             return {}

     def set_note(self, file_identifier: str, name: str, text: str, **kwargs: Any) -> bool:
@@ -897,7 +1010,7 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("set_note: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} set_note: client unavailable")
                 return False

             file_hash = str(file_identifier or "").strip().lower()
@@ -912,7 +1025,7 @@ class HydrusNetwork(Store):
             client.set_notes(file_hash, {note_name: note_text})
             return True
         except Exception as exc:
-            debug(f"Hydrus set_note failed: {exc}")
+            debug(f"{self._log_prefix()} set_note failed: {exc}")
             return False

     def delete_note(self, file_identifier: str, name: str, **kwargs: Any) -> bool:
@@ -920,7 +1033,7 @@ class HydrusNetwork(Store):
         try:
             client = self._client
             if client is None:
-                debug("delete_note: Hydrus client unavailable")
+                debug(f"{self._log_prefix()} delete_note: client unavailable")
                 return False

             file_hash = str(file_identifier or "").strip().lower()
@@ -934,7 +1047,7 @@ class HydrusNetwork(Store):
             client.delete_notes(file_hash, [note_name])
             return True
         except Exception as exc:
-            debug(f"Hydrus delete_note failed: {exc}")
+            debug(f"{self._log_prefix()} delete_note failed: {exc}")
return False return False
@staticmethod @staticmethod

View File

@@ -6,6 +6,7 @@ import sys
import shutil import shutil
import tempfile import tempfile
import re import re
from urllib.parse import urlsplit, parse_qs
import models import models
import pipeline as ctx import pipeline as ctx
@@ -13,12 +14,20 @@ from API import HydrusNetwork as hydrus_wrapper
from SYS.logger import log, debug from SYS.logger import log, debug
from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS from SYS.utils_constant import ALL_SUPPORTED_EXTENSIONS
from Store import Store from Store import Store
from ._shared import ( from . import _shared as sh
Cmdlet, CmdletArg, parse_cmdlet_args, SharedArgs,
extract_tag_from_result, extract_title_from_result, extract_url_from_result, Cmdlet = sh.Cmdlet
merge_sequences, extract_relationships, extract_duration, coerce_to_pipe_object CmdletArg = sh.CmdletArg
) parse_cmdlet_args = sh.parse_cmdlet_args
from ._shared import collapse_namespace_tag SharedArgs = sh.SharedArgs
extract_tag_from_result = sh.extract_tag_from_result
extract_title_from_result = sh.extract_title_from_result
extract_url_from_result = sh.extract_url_from_result
merge_sequences = sh.merge_sequences
extract_relationships = sh.extract_relationships
extract_duration = sh.extract_duration
coerce_to_pipe_object = sh.coerce_to_pipe_object
collapse_namespace_tag = sh.collapse_namespace_tag
from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_store from API.folder import read_sidecar, find_sidecar, write_sidecar, API_folder_store
from SYS.utils import sha256_file, unique_path from SYS.utils import sha256_file, unique_path
from metadata import write_metadata from metadata import write_metadata
@@ -181,7 +190,7 @@ class Add_File(Cmdlet):
downloaded_path = Path(downloaded) downloaded_path = Path(downloaded)
if downloaded_path.exists() and downloaded_path.is_dir(): if downloaded_path.exists() and downloaded_path.is_dir():
log( log(
"[add-file] OpenLibrary download produced a directory (missing img2pdf?). Cannot ingest.", "[add-file] OpenLibrary download produced a directory (PDF conversion failed). Cannot ingest.",
file=sys.stderr, file=sys.stderr,
) )
failures += 1 failures += 1
@@ -192,6 +201,26 @@ class Add_File(Cmdlet):
delete_after_item = True delete_after_item = True
# For non-provider URLs, or if still a URL after provider attempt, delegate to download-media. # For non-provider URLs, or if still a URL after provider attempt, delegate to download-media.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:")
):
# Hydrus file URLs are direct file downloads and may require Hydrus auth headers.
# If the user provided a destination (-provider or -store), download now and continue.
if (provider_name or location) and isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(("http://", "https://")):
downloaded = self._try_download_hydrus_file_url(
file_url=str(media_path_or_url),
pipe_obj=pipe_obj,
config=config,
)
if downloaded is not None:
downloaded_path, downloaded_temp_dir = downloaded
temp_dir_to_cleanup = downloaded_temp_dir
media_path_or_url = str(downloaded_path)
pipe_obj.path = str(downloaded_path)
pipe_obj.is_temp = True
delete_after_item = True
# If it's still a URL target, fall back to the legacy delegate.
if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith( if isinstance(media_path_or_url, str) and media_path_or_url.lower().startswith(
("http://", "https://", "magnet:", "torrent:") ("http://", "https://", "magnet:", "torrent:")
): ):
@@ -767,6 +796,134 @@ class Add_File(Cmdlet):
return True return True
return False return False
@staticmethod
def _sanitize_filename(value: str) -> str:
# Minimal Windows-safe filename sanitization.
text = str(value or "").strip()
if not text:
return "file"
invalid = '<>:"/\\|?*'
text = "".join("_" if (ch in invalid or ord(ch) < 32) else ch for ch in text)
text = re.sub(r"\s+", " ", text).strip(" .")
return text or "file"
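Illustrative behavior of the sanitizer above, with hypothetical inputs:

assert Add_File._sanitize_filename('clip: "final"?') == 'clip_ _final__'
assert Add_File._sanitize_filename('   ') == 'file'  # blank input falls back to 'file'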
@staticmethod
def _parse_hydrus_file_url(file_url: str) -> Optional[str]:
"""Return the sha256 hash from a Hydrus /get_files/file URL, or None."""
try:
split = urlsplit(str(file_url))
if split.scheme.lower() not in {"http", "https"}:
return None
path_lower = (split.path or "").lower()
if "/get_files/file" not in path_lower:
return None
params = parse_qs(split.query or "")
raw = None
if "hash" in params and params["hash"]:
raw = params["hash"][0]
if not raw:
return None
hash_val = str(raw).strip().lower()
if not re.fullmatch(r"[0-9a-f]{64}", hash_val):
return None
return hash_val
except Exception:
return None
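A quick round-trip for the parser above; 45869 is the default Hydrus Client API port and the hash is a dummy value:

h = "ab" * 32  # dummy sha256
assert Add_File._parse_hydrus_file_url(f"http://127.0.0.1:45869/get_files/file?hash={h}") == h
assert Add_File._parse_hydrus_file_url("https://example.com/get_files/file") is None  # no hash param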
def _try_download_hydrus_file_url(
self,
*,
file_url: str,
pipe_obj: models.PipeObject,
config: Dict[str, Any],
) -> Optional[tuple[Path, Path]]:
"""If *file_url* is a Hydrus file URL, download it to temp and return (path, temp_dir)."""
file_hash = self._parse_hydrus_file_url(file_url)
if not file_hash:
return None
# Resolve Hydrus backend for auth.
store_name = str(getattr(pipe_obj, "store", "") or "").strip()
if ":" in store_name:
store_name = store_name.split(":", 1)[-1].strip()
backend = None
try:
store_registry = Store(config)
if store_name and store_registry.is_available(store_name):
candidate = store_registry[store_name]
if type(candidate).__name__.lower() == "hydrusnetwork":
backend = candidate
except Exception:
backend = None
if backend is None:
try:
store_registry = Store(config)
target_prefix = str(file_url).split("/get_files/file", 1)[0].rstrip("/")
for backend_name in store_registry.list_backends():
candidate = store_registry[backend_name]
if type(candidate).__name__.lower() != "hydrusnetwork":
continue
base_url = str(getattr(candidate, "URL", "") or "").rstrip("/")
if base_url and (target_prefix.lower() == base_url.lower() or target_prefix.lower().startswith(base_url.lower())):
backend = candidate
break
except Exception:
backend = None
if backend is None:
debug("[add-file] Hydrus file URL detected but no Hydrus backend matched for auth")
return None
api_key = str(getattr(backend, "API", "") or "").strip()
if not api_key:
debug(f"[add-file] Hydrus backend '{getattr(backend, 'NAME', '') or store_name}' missing API key")
return None
# Best-effort filename from title + ext.
ext = ""
try:
if isinstance(pipe_obj.extra, dict):
ext = str(pipe_obj.extra.get("ext") or "").strip().lstrip(".")
except Exception:
ext = ""
if not ext:
ext = "bin"
title_hint = str(getattr(pipe_obj, "title", "") or "").strip()
base_name = self._sanitize_filename(title_hint) if title_hint else f"hydrus_{file_hash[:12]}"
temp_dir = Path(tempfile.mkdtemp(prefix="medios_hydrus_"))
destination = unique_path(temp_dir / f"{base_name}.{ext}")
headers = {"Hydrus-Client-API-Access-Key": api_key}
timeout = 60.0
try:
client = getattr(backend, "_client", None)
timeout_val = getattr(client, "timeout", None)
if timeout_val is not None:
timeout = float(timeout_val)
except Exception:
timeout = 60.0
try:
log(
f"[add-file] Downloading Hydrus file via API ({getattr(backend, 'NAME', '') or store_name})",
file=sys.stderr,
)
downloaded_bytes = hydrus_wrapper.download_hydrus_file(str(file_url), headers, destination, timeout)
if downloaded_bytes <= 0 and not destination.exists():
return None
return destination, temp_dir
except Exception as exc:
log(f"[add-file] Hydrus download failed: {exc}", file=sys.stderr)
try:
shutil.rmtree(temp_dir, ignore_errors=True)
except Exception:
pass
return None
def _delegate_to_download_data( def _delegate_to_download_data(
self, self,
result: Any, result: Any,
@@ -883,6 +1040,61 @@ class Add_File(Cmdlet):
except Exception: except Exception:
return None return None
@staticmethod
def _get_note_text(result: Any, pipe_obj: models.PipeObject, note_name: str) -> Optional[str]:
"""Extract a named note text from a piped item.
Supports:
- pipe_obj.extra["notes"][note_name]
- result["notes"][note_name] for dict results
- pipe_obj.extra[note_name] / result[note_name] as fallback
"""
def _normalize(val: Any) -> Optional[str]:
if val is None:
return None
if isinstance(val, bytes):
try:
val = val.decode("utf-8", errors="ignore")
except Exception:
val = str(val)
if isinstance(val, str):
text = val.strip()
return text if text else None
try:
text = str(val).strip()
return text if text else None
except Exception:
return None
note_key = str(note_name or "").strip()
if not note_key:
return None
# Prefer notes dict on PipeObject.extra (common for cmdlet-emitted dicts)
try:
if isinstance(pipe_obj.extra, dict):
notes_val = pipe_obj.extra.get("notes")
if isinstance(notes_val, dict) and note_key in notes_val:
return _normalize(notes_val.get(note_key))
if note_key in pipe_obj.extra:
return _normalize(pipe_obj.extra.get(note_key))
except Exception:
pass
# Fallback to raw result dict
if isinstance(result, dict):
try:
notes_val = result.get("notes")
if isinstance(notes_val, dict) and note_key in notes_val:
return _normalize(notes_val.get(note_key))
if note_key in result:
return _normalize(result.get(note_key))
except Exception:
pass
return None
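Because only .extra is read from the pipe object, a hypothetical stand-in is enough to exercise the lookup order (illustrative):

from types import SimpleNamespace

stub = SimpleNamespace(extra={"notes": {"sub": "WEBVTT\n..."}})
assert Add_File._get_note_text({}, stub, "sub").startswith("WEBVTT")
assert Add_File._get_note_text({"chapters": "00:00 Intro"}, SimpleNamespace(extra={}), "chapters") == "00:00 Intro"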
@staticmethod @staticmethod
def _update_pipe_object_destination( def _update_pipe_object_destination(
pipe_obj: models.PipeObject, pipe_obj: models.PipeObject,
@@ -1451,6 +1663,26 @@ class Add_File(Cmdlet):
except Exception: except Exception:
pass pass
# If a subtitle note was provided upstream (e.g., download-media writes notes.sub),
# persist it automatically like add-note would.
sub_note = Add_File._get_note_text(result, pipe_obj, "sub")
if sub_note:
try:
setter = getattr(backend, "set_note", None)
if callable(setter):
setter(resolved_hash, "sub", sub_note)
except Exception:
pass
chapters_note = Add_File._get_note_text(result, pipe_obj, "chapters")
if chapters_note:
try:
setter = getattr(backend, "set_note", None)
if callable(setter):
setter(resolved_hash, "chapters", chapters_note)
except Exception:
pass
meta: Dict[str, Any] = {} meta: Dict[str, Any] = {}
try: try:
meta = backend.get_metadata(resolved_hash) or {} meta = backend.get_metadata(resolved_hash) or {}

View File

@@ -7,15 +7,15 @@ import sys
from SYS.logger import log from SYS.logger import log
import pipeline as ctx import pipeline as ctx
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
SharedArgs, CmdletArg = sh.CmdletArg
normalize_hash, SharedArgs = sh.SharedArgs
parse_cmdlet_args, normalize_hash = sh.normalize_hash
normalize_result_input, parse_cmdlet_args = sh.parse_cmdlet_args
should_show_help, normalize_result_input = sh.normalize_result_input
) should_show_help = sh.should_show_help
from Store import Store from Store import Store
from SYS.utils import sha256_file from SYS.utils import sha256_file
@@ -84,9 +84,9 @@ class Add_Note(Cmdlet):
else: else:
note_text = str(text_parts or "").strip() note_text = str(text_parts or "").strip()
if not note_text: # Note text can be omitted when upstream stages provide it (e.g. download-media --write-sub
log("[add_note] Error: Empty note text", file=sys.stderr) # attaches notes.sub). In that case we resolve per-item below.
return 1 user_provided_text = bool(note_text)
results = normalize_result_input(result) results = normalize_result_input(result)
if not results: if not results:
@@ -99,11 +99,56 @@ class Add_Note(Cmdlet):
store_registry = Store(config) store_registry = Store(config)
updated = 0 updated = 0
# Optional global fallback for note text from pipeline values.
# Allows patterns like: ... | add-note sub
pipeline_default_text = None
if not user_provided_text:
try:
pipeline_default_text = ctx.load_value(note_name)
except Exception:
pipeline_default_text = None
if isinstance(pipeline_default_text, list):
pipeline_default_text = " ".join([str(x) for x in pipeline_default_text]).strip()
elif pipeline_default_text is not None:
pipeline_default_text = str(pipeline_default_text).strip()
for res in results: for res in results:
if not isinstance(res, dict): if not isinstance(res, dict):
ctx.emit(res) ctx.emit(res)
continue continue
# Resolve note text for this item when not provided explicitly.
item_note_text = note_text
if not user_provided_text:
# Prefer item-scoped notes dict.
candidate = None
try:
notes = res.get("notes")
if isinstance(notes, dict):
candidate = notes.get(note_name)
except Exception:
candidate = None
# Also allow direct field fallback: res["sub"], etc.
if candidate is None:
try:
candidate = res.get(note_name)
except Exception:
candidate = None
if candidate is None:
candidate = pipeline_default_text
if isinstance(candidate, list):
item_note_text = " ".join([str(x) for x in candidate]).strip()
else:
item_note_text = str(candidate or "").strip()
if not item_note_text:
log(f"[add_note] Warning: No note text found for '{note_name}'; skipping", file=sys.stderr)
ctx.emit(res)
continue
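# Resolution order per item, as implemented above (illustrative summary):
#   1) explicit argument text       ... | add-note sub "some text"
#   2) res["notes"][note_name]      item-scoped notes dict
#   3) res[note_name]               direct field fallback
#   4) ctx.load_value(note_name)    pipeline-wide default
# Items with no resolvable text are re-emitted with a warning instead of failing the pipeline.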
store_name = str(store_override or res.get("store") or "").strip() store_name = str(store_override or res.get("store") or "").strip()
raw_hash = res.get("hash") raw_hash = res.get("hash")
raw_path = res.get("path") raw_path = res.get("path")
@@ -130,7 +175,7 @@ class Add_Note(Cmdlet):
ok = False ok = False
try: try:
ok = bool(backend.set_note(resolved_hash, note_name, note_text, config=config)) ok = bool(backend.set_note(resolved_hash, note_name, item_note_text, config=config))
except Exception as exc: except Exception as exc:
log(f"[add_note] Error: Failed to set note: {exc}", file=sys.stderr) log(f"[add_note] Error: Failed to set note: {exc}", file=sys.stderr)
ok = False ok = False

View File

@@ -11,7 +11,15 @@ from SYS.logger import log
import pipeline as ctx import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_result_input, should_show_help, get_field from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
get_field = sh.get_field
from API.folder import read_sidecar, find_sidecar, API_folder_store from API.folder import read_sidecar, find_sidecar, API_folder_store
from Store import Store from Store import Store

View File

@@ -8,19 +8,20 @@ from SYS.logger import log
import models import models
import pipeline as ctx import pipeline as ctx
from ._shared import normalize_result_input, filter_results_by_temp from . import _shared as sh
from ._shared import (
Cmdlet, normalize_result_input = sh.normalize_result_input
CmdletArg, filter_results_by_temp = sh.filter_results_by_temp
SharedArgs, Cmdlet = sh.Cmdlet
normalize_hash, CmdletArg = sh.CmdletArg
parse_tag_arguments, SharedArgs = sh.SharedArgs
expand_tag_groups, normalize_hash = sh.normalize_hash
parse_cmdlet_args, parse_tag_arguments = sh.parse_tag_arguments
collapse_namespace_tag, expand_tag_groups = sh.expand_tag_groups
should_show_help, parse_cmdlet_args = sh.parse_cmdlet_args
get_field, collapse_namespace_tag = sh.collapse_namespace_tag
) should_show_help = sh.should_show_help
get_field = sh.get_field
from Store import Store from Store import Store
from SYS.utils import sha256_file from SYS.utils import sha256_file

View File

@@ -8,19 +8,20 @@ from SYS.logger import log
import models import models
import pipeline as ctx import pipeline as ctx
from ._shared import normalize_result_input, filter_results_by_temp from . import _shared as sh
from ._shared import (
Cmdlet, normalize_result_input = sh.normalize_result_input
CmdletArg, filter_results_by_temp = sh.filter_results_by_temp
SharedArgs, Cmdlet = sh.Cmdlet
normalize_hash, CmdletArg = sh.CmdletArg
parse_tag_arguments, SharedArgs = sh.SharedArgs
expand_tag_groups, normalize_hash = sh.normalize_hash
parse_cmdlet_args, parse_tag_arguments = sh.parse_tag_arguments
collapse_namespace_tags, expand_tag_groups = sh.expand_tag_groups
should_show_help, parse_cmdlet_args = sh.parse_cmdlet_args
get_field, collapse_namespace_tags = sh.collapse_namespace_tags
) should_show_help = sh.should_show_help
get_field = sh.get_field
from Store import Store from Store import Store
from SYS.utils import sha256_file from SYS.utils import sha256_file

View File

@@ -4,12 +4,12 @@ from typing import Any, Dict, Sequence
import sys import sys
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from . import _shared as sh
from SYS.logger import log from SYS.logger import log
from Store import Store from Store import Store
class Add_Url(Cmdlet): class Add_Url(sh.Cmdlet):
"""Add URL associations to files via hash+store.""" """Add URL associations to files via hash+store."""
def __init__(self) -> None: def __init__(self) -> None:
@@ -18,9 +18,9 @@ class Add_Url(Cmdlet):
summary="Associate a URL with a file", summary="Associate a URL with a file",
usage="@1 | add-url <url>", usage="@1 | add-url <url>",
arg=[ arg=[
SharedArgs.HASH, sh.SharedArgs.HASH,
SharedArgs.STORE, sh.SharedArgs.STORE,
CmdletArg("url", required=True, description="URL to associate"), sh.CmdletArg("url", required=True, description="URL to associate"),
], ],
detail=[ detail=[
"- Associates URL with file identified by hash+store", "- Associates URL with file identified by hash+store",
@@ -32,11 +32,11 @@ class Add_Url(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Add URL to file via hash+store backend.""" """Add URL to file via hash+store backend."""
parsed = parse_cmdlet_args(args, self) parsed = sh.parse_cmdlet_args(args, self)
# Extract hash and store from result or args # Extract hash and store from result or args
file_hash = parsed.get("hash") or get_field(result, "hash") file_hash = parsed.get("hash") or sh.get_field(result, "hash")
store_name = parsed.get("store") or get_field(result, "store") store_name = parsed.get("store") or sh.get_field(result, "store")
url_arg = parsed.get("url") url_arg = parsed.get("url")
if not file_hash: if not file_hash:
@@ -52,7 +52,7 @@ class Add_Url(Cmdlet):
return 1 return 1
# Normalize hash # Normalize hash
file_hash = normalize_hash(file_hash) file_hash = sh.normalize_hash(file_hash)
if not file_hash: if not file_hash:
log("Error: Invalid hash format") log("Error: Invalid hash format")
return 1 return 1

View File

@@ -1,190 +0,0 @@
from __future__ import annotations
from typing import Any, Dict, Sequence
import json
import sys
from SYS.logger import log
from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, should_show_help
from Store import Store
CMDLET = Cmdlet(
name="check-file-status",
summary="Check if a file is active, deleted, or corrupted in Hydrus.",
usage="check-file-status [-hash <sha256>] [-store <name>]",
arg=[
SharedArgs.HASH,
SharedArgs.STORE,
],
detail=[
"- Shows whether file is active in Hydrus or marked as deleted",
"- Detects corrupted data (e.g., comma-separated url)",
"- Displays file metadata and service locations",
"- Note: Hydrus keeps deleted files for recovery. Use cleanup-corrupted for full removal.",
],
)
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
# Help
if should_show_help(args):
log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
return 0
# Parse arguments
override_hash: str | None = None
override_store: str | None = None
i = 0
while i < len(args):
token = args[i]
low = str(token).lower()
if low in {"-hash", "--hash", "hash"} and i + 1 < len(args):
override_hash = str(args[i + 1]).strip()
i += 2
continue
if low in {"-store", "--store", "store"} and i + 1 < len(args):
override_store = str(args[i + 1]).strip()
i += 2
continue
i += 1
store_name: str | None = override_store
if not store_name:
if isinstance(result, dict):
store_name = str(result.get("store") or "").strip() or None
else:
store_name = str(getattr(result, "store", "") or "").strip() or None
if override_hash:
hash_hex = normalize_hash(override_hash)
else:
if isinstance(result, dict):
hash_hex = normalize_hash(result.get("hash") or result.get("hash_hex"))
else:
hash_hex = normalize_hash(getattr(result, "hash", None) or getattr(result, "hash_hex", None))
if not hash_hex:
log("No hash provided and no result selected", file=sys.stderr)
return 1
try:
client = None
if store_name:
# Store specified: do not fall back to a global/default Hydrus client.
try:
store = Store(config)
backend = store[str(store_name)]
candidate = getattr(backend, "_client", None)
if candidate is not None and hasattr(candidate, "fetch_file_metadata"):
client = candidate
except Exception:
client = None
if client is None:
log(f"Hydrus client unavailable for store '{store_name}'", file=sys.stderr)
return 1
else:
client = hydrus_wrapper.get_client(config)
if client is None:
log("Hydrus client unavailable", file=sys.stderr)
return 1
except Exception as exc:
log(f"Hydrus client unavailable: {exc}", file=sys.stderr)
return 1
try:
result_data = client.fetch_file_metadata(hashes=[hash_hex])
if not result_data.get("metadata"):
log(f"File not found: {hash_hex[:16]}...", file=sys.stderr)
return 1
file_info = result_data["metadata"][0]
# Status summary
is_deleted = file_info.get("is_deleted", False)
is_local = file_info.get("is_local", False)
is_trashed = file_info.get("is_trashed", False)
status_str = "DELETED" if is_deleted else ("TRASHED" if is_trashed else "ACTIVE")
log(f"File status: {status_str}", file=sys.stderr)
# File info
log(f"\n📄 File Information:", file=sys.stderr)
log(f" Hash: {file_info['hash'][:16]}...", file=sys.stderr)
log(f" Size: {file_info['size']:,} bytes", file=sys.stderr)
log(f" MIME: {file_info['mime']}", file=sys.stderr)
log(f" Dimensions: {file_info.get('width', '?')}x{file_info.get('height', '?')}", file=sys.stderr)
# Service status
file_services = file_info.get("file_services", {})
current_services = file_services.get("current", {})
deleted_services = file_services.get("deleted", {})
if current_services:
log(f"\n✓ In services ({len(current_services)}):", file=sys.stderr)
for service_key, service_info in current_services.items():
sname = service_info.get("name", "unknown")
stype = service_info.get("type_pretty", "unknown")
log(f" - {sname} ({stype})", file=sys.stderr)
if deleted_services:
log(f"\n✗ Deleted from services ({len(deleted_services)}):", file=sys.stderr)
for service_key, service_info in deleted_services.items():
sname = service_info.get("name", "unknown")
stype = service_info.get("type_pretty", "unknown")
time_deleted = service_info.get("time_deleted", "?")
log(f" - {sname} ({stype}) - deleted at {time_deleted}", file=sys.stderr)
# URL check
urls = file_info.get("url", [])
log(f"\n🔗 URLs ({len(urls)}):", file=sys.stderr)
corrupted_count = 0
for i, url in enumerate(urls, 1):
if "," in url:
corrupted_count += 1
log(f" [{i}] ⚠️ CORRUPTED (comma-separated): {url[:50]}...", file=sys.stderr)
else:
log(f" [{i}] {url[:70]}{'...' if len(url) > 70 else ''}", file=sys.stderr)
if corrupted_count > 0:
log(f"\n⚠️ WARNING: Found {corrupted_count} corrupted URL(s)", file=sys.stderr)
# Tags
tags_dict = file_info.get("tags", {})
total_tags = 0
for service_key, service_data in tags_dict.items():
service_name = service_data.get("name", "unknown")
display_tags = service_data.get("display_tags", {}).get("0", [])
total_tags += len(display_tags)
if total_tags > 0:
log(f"\n🏷️ Tags ({total_tags}):", file=sys.stderr)
for service_key, service_data in tags_dict.items():
display_tags = service_data.get("display_tags", {}).get("0", [])
if display_tags:
service_name = service_data.get("name", "unknown")
log(f" {service_name}:", file=sys.stderr)
for tag in display_tags[:5]: # Show first 5
log(f" - {tag}", file=sys.stderr)
if len(display_tags) > 5:
log(f" ... and {len(display_tags) - 5} more", file=sys.stderr)
log("\n", file=sys.stderr)
return 0
except Exception as exc:
log(f"Error checking file status: {exc}", file=sys.stderr)
import traceback
traceback.print_exc(file=sys.stderr)
return 1
# Register cmdlet (no legacy decorator)
CMDLET.exec = _run
CMDLET.alias = ["check-status", "file-status", "status"]
CMDLET.register()

View File

@@ -1,105 +0,0 @@
"""Cleanup cmdlet for removing temporary artifacts from pipeline.
This cmdlet processes result lists and removes temporary files (marked with is_temp=True),
then emits the remaining non-temporary results for further pipeline stages.
"""
from __future__ import annotations
from typing import Any, Dict, Sequence
from pathlib import Path
import sys
import json
from SYS.logger import log
from ._shared import Cmdlet, CmdletArg, get_pipe_object_path, normalize_result_input, filter_results_by_temp, should_show_help
import models
import pipeline as pipeline_context
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Remove temporary files from pipeline results.
Accepts:
- Single result object with is_temp field
- List of result objects to clean up
Process:
- Filters results by is_temp=True
- Deletes those files from disk
- Emits only non-temporary results
Typical pipeline usage:
download-data url | screen-shot | add-tag -store local "tag" --all | cleanup
"""
# Help
if should_show_help(args):
log(f"Cmdlet: {CMDLET.name}\nSummary: {CMDLET.summary}\nUsage: {CMDLET.usage}")
return 0
# Normalize input to list
results = normalize_result_input(result)
if not results:
log("[cleanup] No results to process", file=sys.stderr)
return 1
# Separate temporary and permanent results
temp_results = pipeline_context.filter_results_by_temp(results, include_temp=True)
perm_results = pipeline_context.filter_results_by_temp(results, include_temp=False)
# Delete temporary files
deleted_count = 0
for temp_result in temp_results:
try:
file_path = get_pipe_object_path(temp_result)
if file_path:
path_obj = Path(file_path)
if path_obj.exists():
# Delete the file
path_obj.unlink()
log(f"[cleanup] Deleted temporary file: {path_obj.name}", file=sys.stderr)
deleted_count += 1
# Clean up any associated sidecar files
for ext in ['.tag', '.metadata']:
sidecar = path_obj.parent / (path_obj.name + ext)
if sidecar.exists():
try:
sidecar.unlink()
log(f"[cleanup] Deleted sidecar: {sidecar.name}", file=sys.stderr)
except Exception as e:
log(f"[cleanup] Warning: Could not delete sidecar {sidecar.name}: {e}", file=sys.stderr)
else:
log(f"[cleanup] File does not exist: {file_path}", file=sys.stderr)
except Exception as e:
log(f"[cleanup] Error deleting file: {e}", file=sys.stderr)
# Log summary
log(f"[cleanup] Deleted {deleted_count} temporary file(s), emitting {len(perm_results)} permanent result(s)", file=sys.stderr)
# Emit permanent results for downstream processing
for perm_result in perm_results:
pipeline_context.emit(perm_result)
return 0
CMDLET = Cmdlet(
name="cleanup",
summary="Remove temporary artifacts from pipeline (marked with is_temp=True).",
usage="cleanup",
arg=[],
detail=[
"- Accepts pipeline results that may contain temporary files (screenshots, intermediate artifacts)",
"- Deletes files marked with is_temp=True from disk",
"- Also cleans up associated sidecar files (.tag, .metadata)",
"- Emits only non-temporary results for further processing",
"- Typical usage at end of pipeline: ... | add-tag -store local \"tag\" --all | cleanup",
"- Exit code 0 if cleanup successful, 1 if no results to process",
],
exec=_run,
).register()

View File

@@ -8,12 +8,12 @@ from pathlib import Path
from SYS.logger import debug, log from SYS.logger import debug, log
from Store.Folder import Folder from Store.Folder import Folder
from Store import Store from Store import Store
from ._shared import Cmdlet, CmdletArg, normalize_hash, looks_like_hash, get_field, should_show_help from . import _shared as sh
from API import HydrusNetwork as hydrus_wrapper from API import HydrusNetwork as hydrus_wrapper
import pipeline as ctx import pipeline as ctx
class Delete_File(Cmdlet): class Delete_File(sh.Cmdlet):
"""Class-based delete-file cmdlet with self-registration.""" """Class-based delete-file cmdlet with self-registration."""
def __init__(self) -> None: def __init__(self) -> None:
@@ -23,10 +23,10 @@ class Delete_File(Cmdlet):
usage="delete-file [-hash <sha256>] [-conserve <local|hydrus>] [-lib-root <path>] [reason]", usage="delete-file [-hash <sha256>] [-conserve <local|hydrus>] [-lib-root <path>] [reason]",
alias=["del-file"], alias=["del-file"],
arg=[ arg=[
CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."), sh.CmdletArg("hash", description="Override the Hydrus file hash (SHA256) to target instead of the selected result."),
CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."), sh.CmdletArg("conserve", description="Choose which copy to keep: 'local' or 'hydrus'."),
CmdletArg("lib-root", description="Path to local library root for database cleanup."), sh.CmdletArg("lib-root", description="Path to local library root for database cleanup."),
CmdletArg("reason", description="Optional reason for deletion (free text)."), sh.CmdletArg("reason", description="Optional reason for deletion (free text)."),
], ],
detail=[ detail=[
"Default removes both the local file and Hydrus file.", "Default removes both the local file and Hydrus file.",
@@ -45,24 +45,28 @@ class Delete_File(Cmdlet):
if isinstance(item, dict): if isinstance(item, dict):
hash_hex_raw = item.get("hash_hex") or item.get("hash") hash_hex_raw = item.get("hash_hex") or item.get("hash")
target = item.get("target") or item.get("file_path") or item.get("path") target = item.get("target") or item.get("file_path") or item.get("path")
title_val = item.get("title") or item.get("name")
else: else:
hash_hex_raw = get_field(item, "hash_hex") or get_field(item, "hash") hash_hex_raw = sh.get_field(item, "hash_hex") or sh.get_field(item, "hash")
target = get_field(item, "target") or get_field(item, "file_path") or get_field(item, "path") target = sh.get_field(item, "target") or sh.get_field(item, "file_path") or sh.get_field(item, "path")
title_val = sh.get_field(item, "title") or sh.get_field(item, "name")
store = None store = None
if isinstance(item, dict): if isinstance(item, dict):
store = item.get("store") store = item.get("store")
else: else:
store = get_field(item, "store") store = sh.get_field(item, "store")
store_lower = str(store).lower() if store else "" store_lower = str(store).lower() if store else ""
is_hydrus_store = bool(store_lower) and ("hydrus" in store_lower or store_lower in {"home", "work"}) is_hydrus_store = bool(store_lower) and ("hydrus" in store_lower or store_lower in {"home", "work"})
store_label = str(store) if store else "default"
hydrus_prefix = f"[hydrusnetwork:{store_label}]"
# For Hydrus files, the target IS the hash # For Hydrus files, the target IS the hash
if is_hydrus_store and not hash_hex_raw: if is_hydrus_store and not hash_hex_raw:
hash_hex_raw = target hash_hex_raw = target
hash_hex = normalize_hash(override_hash) if override_hash else normalize_hash(hash_hex_raw) hash_hex = sh.normalize_hash(override_hash) if override_hash else sh.normalize_hash(hash_hex_raw)
local_deleted = False local_deleted = False
local_target = isinstance(target, str) and target.strip() and not str(target).lower().startswith(("http://", "https://")) local_target = isinstance(target, str) and target.strip() and not str(target).lower().startswith(("http://", "https://"))
@@ -156,19 +160,28 @@ class Delete_File(Cmdlet):
try: try:
client._post("/add_files/delete_files", data=payload) # type: ignore[attr-defined] client._post("/add_files/delete_files", data=payload) # type: ignore[attr-defined]
hydrus_deleted = True hydrus_deleted = True
preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') title_str = str(title_val).strip() if title_val else ""
debug(f"Deleted from Hydrus: {preview}", file=sys.stderr) if title_str:
debug(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}", file=sys.stderr)
else:
debug(f"{hydrus_prefix} Deleted hash:{hash_hex}", file=sys.stderr)
except Exception: except Exception:
# If it's not in Hydrus (e.g. 404 or similar), that's fine # If it's not in Hydrus (e.g. 404 or similar), that's fine
if not local_deleted: if not local_deleted:
return False return False
if hydrus_deleted and hash_hex: if hydrus_deleted and hash_hex:
preview = hash_hex[:12] + ('…' if len(hash_hex) > 12 else '') title_str = str(title_val).strip() if title_val else ""
if reason: if reason:
ctx.emit(f"Deleted {preview} (reason: {reason}).") if title_str:
ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex} (reason: {reason}).")
else: else:
ctx.emit(f"Deleted {preview}.") ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex} (reason: {reason}).")
else:
if title_str:
ctx.emit(f"{hydrus_prefix} Deleted title:{title_str} hash:{hash_hex}.")
else:
ctx.emit(f"{hydrus_prefix} Deleted hash:{hash_hex}.")
if hydrus_deleted or local_deleted: if hydrus_deleted or local_deleted:
return True return True
@@ -178,7 +191,7 @@ class Delete_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Execute delete-file command.""" """Execute delete-file command."""
if should_show_help(args): if sh.should_show_help(args):
log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}") log(f"Cmdlet: {self.name}\nSummary: {self.summary}\nUsage: {self.usage}")
return 0 return 0

View File

@@ -7,16 +7,16 @@ import sys
from SYS.logger import log from SYS.logger import log
import pipeline as ctx import pipeline as ctx
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
SharedArgs, CmdletArg = sh.CmdletArg
normalize_hash, SharedArgs = sh.SharedArgs
parse_cmdlet_args, normalize_hash = sh.normalize_hash
normalize_result_input, parse_cmdlet_args = sh.parse_cmdlet_args
get_field, normalize_result_input = sh.normalize_result_input
should_show_help, get_field = sh.get_field
) should_show_help = sh.should_show_help
from Store import Store from Store import Store
from SYS.utils import sha256_file from SYS.utils import sha256_file

View File

@@ -10,7 +10,16 @@ import sys
from SYS.logger import log from SYS.logger import log
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, normalize_hash, normalize_result_input, get_field, should_show_help from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
normalize_hash = sh.normalize_hash
normalize_result_input = sh.normalize_result_input
get_field = sh.get_field
should_show_help = sh.should_show_help
from API.folder import API_folder_store from API.folder import API_folder_store
from Store import Store from Store import Store
from config import get_local_storage_path from config import get_local_storage_path

View File

@@ -7,7 +7,15 @@ import sys
import models import models
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, parse_tag_arguments, should_show_help, get_field from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
parse_tag_arguments = sh.parse_tag_arguments
should_show_help = sh.should_show_help
get_field = sh.get_field
from SYS.logger import debug, log from SYS.logger import debug, log
from Store import Store from Store import Store

View File

@@ -4,7 +4,16 @@ from typing import Any, Dict, Sequence
import sys import sys
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from . import _shared as sh
Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
sh.Cmdlet,
sh.CmdletArg,
sh.SharedArgs,
sh.parse_cmdlet_args,
sh.get_field,
sh.normalize_hash,
)
from SYS.logger import log from SYS.logger import log
from Store import Store from Store import Store

View File

@@ -17,15 +17,15 @@ from SYS.download import DownloadError, _download_direct_file
from SYS.logger import log, debug from SYS.logger import log, debug
import pipeline as pipeline_context import pipeline as pipeline_context
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
SharedArgs, CmdletArg = sh.CmdletArg
parse_cmdlet_args, SharedArgs = sh.SharedArgs
register_url_with_local_library, parse_cmdlet_args = sh.parse_cmdlet_args
coerce_to_pipe_object, register_url_with_local_library = sh.register_url_with_local_library
get_field, coerce_to_pipe_object = sh.coerce_to_pipe_object
) get_field = sh.get_field
class Download_File(Cmdlet): class Download_File(Cmdlet):
@@ -251,6 +251,13 @@ class Download_File(Cmdlet):
# Fallback: if we have a direct HTTP URL, download it directly # Fallback: if we have a direct HTTP URL, download it directly
if downloaded_path is None and isinstance(target, str) and target.startswith("http"): if downloaded_path is None and isinstance(target, str) and target.startswith("http"):
# Guard: provider landing pages (e.g. LibGen ads.php) are HTML, not files.
# Never download these as "files".
if str(table or "").lower() == "libgen":
low = target.lower()
if ("/ads.php" in low) or ("/file.php" in low) or ("/index.php" in low):
log("[download-file] Refusing to download LibGen landing page (expected provider to resolve file link)", file=sys.stderr)
continue
debug(f"[download-file] Provider item looks like direct URL, downloading: {target}") debug(f"[download-file] Provider item looks like direct URL, downloading: {target}")
result_obj = _download_direct_file(target, final_output_dir, quiet=quiet_mode) result_obj = _download_direct_file(target, final_output_dir, quiet=quiet_mode)
file_path = None file_path = None

View File

@@ -38,7 +38,18 @@ from models import DownloadError, DownloadOptions, DownloadMediaResult, DebugLog
import pipeline as pipeline_context import pipeline as pipeline_context
from result_table import ResultTable from result_table import ResultTable
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, parse_cmdlet_args, register_url_with_local_library, coerce_to_pipe_object from tool.ytdlp import YtDlpTool
from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
create_pipe_object_result = sh.create_pipe_object_result
parse_cmdlet_args = sh.parse_cmdlet_args
register_url_with_local_library = sh.register_url_with_local_library
coerce_to_pipe_object = sh.coerce_to_pipe_object
get_field = sh.get_field
# Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats) # Minimal inlined helpers from helper/download.py (is_url_supported_by_ytdlp, list_formats)
@@ -62,6 +73,136 @@ _EXTRACTOR_CACHE: List[Any] | None = None
_YTDLP_PROGRESS_BAR = ProgressBar() _YTDLP_PROGRESS_BAR = ProgressBar()
_SUBTITLE_EXTS = (".vtt", ".srt", ".ass", ".ssa", ".lrc")
def _format_chapters_note(info: Dict[str, Any]) -> Optional[str]:
"""Format yt-dlp chapter metadata into a stable, note-friendly text.
Output is one chapter per line, e.g.:
00:00 Intro
01:23-02:10 Topic name
"""
try:
chapters = info.get("chapters")
except Exception:
chapters = None
if not isinstance(chapters, list) or not chapters:
return None
rows: List[tuple[int, Optional[int], str]] = []
max_t = 0
for ch in chapters:
if not isinstance(ch, dict):
continue
start_raw = ch.get("start_time")
end_raw = ch.get("end_time")
title_raw = ch.get("title") or ch.get("name") or ch.get("chapter")
try:
start_s = int(float(start_raw))
except Exception:
continue
end_s: Optional[int] = None
try:
if end_raw is not None:
end_s = int(float(end_raw))
except Exception:
end_s = None
title = str(title_raw).strip() if title_raw is not None else ""
rows.append((start_s, end_s, title))
try:
max_t = max(max_t, start_s, end_s or 0)
except Exception:
max_t = max(max_t, start_s)
if not rows:
return None
force_hours = bool(max_t >= 3600)
def _tc(seconds: int) -> str:
total = max(0, int(seconds))
minutes, secs = divmod(total, 60)
hours, minutes = divmod(minutes, 60)
if force_hours:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
return f"{minutes:02d}:{secs:02d}"
lines: List[str] = []
for start_s, end_s, title in sorted(rows, key=lambda r: (r[0], r[1] if r[1] is not None else 10**9, r[2])):
if end_s is not None and end_s > start_s:
prefix = f"{_tc(start_s)}-{_tc(end_s)}"
else:
prefix = _tc(start_s)
line = f"{prefix} {title}".strip()
if line:
lines.append(line)
text = "\n".join(lines).strip()
return text or None
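Illustrative input and output for the formatter above, using dummy chapter data:

info = {"chapters": [
    {"start_time": 0, "end_time": 83, "title": "Intro"},
    {"start_time": 83, "title": "Main topic"},
]}
assert _format_chapters_note(info) == "00:00-01:23 Intro\n01:23 Main topic"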
def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]:
"""Find the most likely subtitle sidecar file for a downloaded media file."""
try:
base_dir = media_path.parent
stem = media_path.stem
if not stem:
return None
candidates: List[Path] = []
for p in base_dir.glob(stem + ".*"):
try:
if not p.is_file():
continue
except Exception:
continue
if p.suffix.lower() in _SUBTITLE_EXTS:
candidates.append(p)
if not candidates:
return None
def _rank(path: Path) -> tuple[int, int, float, str]:
name = path.name.lower()
lang_rank = 0 if ".en." in name or name.endswith(".en" + path.suffix.lower()) else 1
ext = path.suffix.lower()
ext_rank_map = {".vtt": 0, ".srt": 1, ".ass": 2, ".ssa": 3, ".lrc": 4}
ext_rank = ext_rank_map.get(ext, 9)
try:
mtime = float(path.stat().st_mtime)
except Exception:
mtime = 0.0
return (lang_rank, ext_rank, -mtime, name)
candidates.sort(key=_rank)
return candidates[0]
except Exception:
return None
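A sketch of the ranking with throwaway files (hypothetical names):

import tempfile
from pathlib import Path

d = Path(tempfile.mkdtemp())
(d / "clip.mkv").write_bytes(b"")
(d / "clip.en.vtt").write_text("WEBVTT", encoding="utf-8")
(d / "clip.srt").write_text("1", encoding="utf-8")
# .en.* outranks other languages, and .vtt outranks .srt at equal language rank.
assert _best_subtitle_sidecar(d / "clip.mkv").name == "clip.en.vtt"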
def _read_text_file(path: Path, *, max_bytes: int = 1_500_000) -> Optional[str]:
try:
data = path.read_bytes()
except Exception:
return None
if not data:
return None
if len(data) > max_bytes:
data = data[:max_bytes]
try:
return data.decode("utf-8", errors="replace")
except Exception:
try:
return data.decode(errors="replace")
except Exception:
return None
def _ensure_yt_dlp_ready() -> None: def _ensure_yt_dlp_ready() -> None:
if yt_dlp is not None: if yt_dlp is not None:
return return
@@ -100,16 +241,26 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
ydl_opts["noplaylist"] = True ydl_opts["noplaylist"] = True
if playlist_items: if playlist_items:
ydl_opts["playlist_items"] = playlist_items ydl_opts["playlist_items"] = playlist_items
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
debug(f"Fetching format list for: {url}") debug(f"Fetching format list for: {url}")
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[arg-type]
info = ydl.extract_info(url, download=False) info = ydl.extract_info(url, download=False)
formats = info.get("formats", [])
if not formats: if not isinstance(info, dict):
log("No formats available", file=sys.stderr) log("No formats available", file=sys.stderr)
return None return None
result_formats = []
formats = info.get("formats") or []
if not isinstance(formats, list) or not formats:
log("No formats available", file=sys.stderr)
return None
result_formats: List[Dict[str, Any]] = []
for fmt in formats: for fmt in formats:
result_formats.append({ if not isinstance(fmt, dict):
continue
result_formats.append(
{
"format_id": fmt.get("format_id", ""), "format_id": fmt.get("format_id", ""),
"format": fmt.get("format", ""), "format": fmt.get("format", ""),
"ext": fmt.get("ext", ""), "ext": fmt.get("ext", ""),
@@ -122,9 +273,11 @@ def list_formats(url: str, no_playlist: bool = False, playlist_items: Optional[s
"filesize": fmt.get("filesize"), "filesize": fmt.get("filesize"),
"abr": fmt.get("abr"), "abr": fmt.get("abr"),
"tbr": fmt.get("tbr"), "tbr": fmt.get("tbr"),
}) }
)
debug(f"Found {len(result_formats)} available formats") debug(f"Found {len(result_formats)} available formats")
return result_formats return result_formats or None
except Exception as e: except Exception as e:
log(f"✗ Error fetching formats: {e}", file=sys.stderr) log(f"✗ Error fetching formats: {e}", file=sys.stderr)
return None return None
@@ -215,6 +368,31 @@ def _download_with_sections_via_cli(url: str, ytdl_options: Dict[str, Any], sect
cmd = ["yt-dlp"] cmd = ["yt-dlp"]
if ytdl_options.get("format"): if ytdl_options.get("format"):
cmd.extend(["-f", ytdl_options["format"]]) cmd.extend(["-f", ytdl_options["format"]])
if ytdl_options.get("merge_output_format"):
cmd.extend(["--merge-output-format", str(ytdl_options["merge_output_format"])])
# For CLI downloads, infer chapter/metadata embedding from either legacy flags
# or explicit FFmpegMetadata postprocessor entries.
postprocessors = ytdl_options.get("postprocessors")
want_add_metadata = bool(ytdl_options.get("addmetadata"))
want_embed_chapters = bool(ytdl_options.get("embedchapters"))
if isinstance(postprocessors, list):
for pp in postprocessors:
if not isinstance(pp, dict):
continue
if str(pp.get("key") or "") == "FFmpegMetadata":
want_add_metadata = True
if bool(pp.get("add_chapters", True)):
want_embed_chapters = True
if want_add_metadata:
cmd.append("--add-metadata")
if want_embed_chapters:
cmd.append("--embed-chapters")
if ytdl_options.get("writesubtitles"):
cmd.append("--write-sub")
cmd.append("--write-auto-sub")
cmd.extend(["--sub-format", "vtt"])
if ytdl_options.get("force_keyframes_at_cuts"): if ytdl_options.get("force_keyframes_at_cuts"):
cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None cmd.extend(["--force-keyframes-at-cuts"]) if ytdl_options.get("force_keyframes_at_cuts") else None
cmd.extend(["-o", section_outtmpl]) cmd.extend(["-o", section_outtmpl])
@@ -258,11 +436,6 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
if opts.cookies_path and opts.cookies_path.is_file(): if opts.cookies_path and opts.cookies_path.is_file():
base_options["cookiefile"] = str(opts.cookies_path) base_options["cookiefile"] = str(opts.cookies_path)
else:
from hydrus_health_check import get_cookies_file_path # local import
global_cookies = get_cookies_file_path()
if global_cookies:
base_options["cookiefile"] = global_cookies
if opts.no_playlist: if opts.no_playlist:
base_options["noplaylist"] = True base_options["noplaylist"] = True
@@ -274,6 +447,37 @@ def _build_ytdlp_options(opts: DownloadOptions) -> Dict[str, Any]:
base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best" base_options["format"] = opts.ytdl_format or "bestvideo+bestaudio/best"
base_options["format_sort"] = ["res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"] base_options["format_sort"] = ["res:4320", "res:2880", "res:2160", "res:1440", "res:1080", "res:720", "res"]
# Optional yt-dlp features
if getattr(opts, "embed_chapters", False):
# Prefer explicit FFmpegMetadata PP so chapter embedding runs even when
# we already specified other postprocessors (e.g. FFmpegExtractAudio).
pps = base_options.get("postprocessors")
if not isinstance(pps, list):
pps = []
already_has_metadata = any(
isinstance(pp, dict) and str(pp.get("key") or "") == "FFmpegMetadata" for pp in pps
)
if not already_has_metadata:
pps.append(
{
"key": "FFmpegMetadata",
"add_metadata": True,
"add_chapters": True,
"add_infojson": "if_exists",
}
)
base_options["postprocessors"] = pps
# Chapter embedding is most reliable in mkv/mp4 containers.
# When merging separate video+audio streams, prefer mkv so mpv sees chapters.
if opts.mode != "audio":
base_options.setdefault("merge_output_format", "mkv")
if getattr(opts, "write_sub", False):
base_options["writesubtitles"] = True
base_options["writeautomaticsub"] = True
base_options["subtitlesformat"] = "vtt"
if opts.clip_sections: if opts.clip_sections:
sections: List[str] = [] sections: List[str] = []
@@ -410,13 +614,27 @@ def _get_libgen_download_url(libgen_url: str) -> Optional[str]:
response = session.get(libgen_url, timeout=10, allow_redirects=True) response = session.get(libgen_url, timeout=10, allow_redirects=True)
final_url = response.url final_url = response.url
try: try:
from bs4 import BeautifulSoup try:
soup = BeautifulSoup(response.content, 'html.parser') from lxml import html as lxml_html
for link in soup.find_all('a'):
href = link.get('href')
if href and 'get.php' in href:
return urljoin(libgen_url, href)
except ImportError: except ImportError:
lxml_html = None
if lxml_html is not None:
doc = lxml_html.fromstring(response.content)
for a in doc.xpath("//a[@href]"):
href = str(a.get("href") or "").strip()
if href and "get.php" in href.lower():
return urljoin(final_url, href)
else:
for m in re.finditer(
r"href=[\"\']([^\"\']+)[\"\']",
response.text or "",
flags=re.IGNORECASE,
):
href = str(m.group(1) or "").strip()
if href and "get.php" in href.lower():
return urljoin(final_url, href)
except Exception:
pass pass
if final_url != libgen_url: if final_url != libgen_url:
debug(f"LibGen resolved to mirror: {final_url}") debug(f"LibGen resolved to mirror: {final_url}")
@@ -648,7 +866,7 @@ def _download_direct_file(
raise DownloadError(f"Error downloading file: {exc}") from exc raise DownloadError(f"Error downloading file: {exc}") from exc
def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) -> Optional[Dict[str, Any]]: def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15, *, cookiefile: Optional[str] = None) -> Optional[Dict[str, Any]]:
"""Probe URL to extract metadata WITHOUT downloading. """Probe URL to extract metadata WITHOUT downloading.
Args: Args:
@@ -686,12 +904,8 @@ def probe_url(url: str, no_playlist: bool = False, timeout_seconds: int = 15) ->
"noprogress": True, # No progress bars "noprogress": True, # No progress bars
} }
# Add cookies if available (lazy import to avoid circular dependency) if cookiefile:
from hydrus_health_check import get_cookies_file_path # local import ydl_opts["cookiefile"] = str(cookiefile)
global_cookies = get_cookies_file_path()
if global_cookies:
ydl_opts["cookiefile"] = global_cookies
# Add no_playlist option if specified # Add no_playlist option if specified
if no_playlist: if no_playlist:
@@ -807,7 +1021,14 @@ def download_media(
debug(f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download") debug(f"Skipping probe for playlist (item selection: {opts.playlist_items}), proceeding with download")
probe_result = {"url": opts.url} # Minimal probe result probe_result = {"url": opts.url} # Minimal probe result
else: else:
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15) probe_cookiefile = None
try:
if opts.cookies_path and opts.cookies_path.is_file():
probe_cookiefile = str(opts.cookies_path)
except Exception:
probe_cookiefile = None
probe_result = probe_url(opts.url, no_playlist=opts.no_playlist, timeout_seconds=15, cookiefile=probe_cookiefile)
if probe_result is None: if probe_result is None:
if not opts.quiet: if not opts.quiet:
@@ -1182,6 +1403,8 @@ class Download_Media(Cmdlet):
try: try:
debug("Starting download-media") debug("Starting download-media")
ytdlp_tool = YtDlpTool(config)
# Parse arguments # Parse arguments
parsed = parse_cmdlet_args(args, self) parsed = parse_cmdlet_args(args, self)
@@ -1192,7 +1415,6 @@ class Download_Media(Cmdlet):
# If no url provided via args, try to extract from piped result # If no url provided via args, try to extract from piped result
if not raw_url and result: if not raw_url and result:
from ._shared import get_field
# Handle single result or list of results # Handle single result or list of results
results_to_check = result if isinstance(result, list) else [result] results_to_check = result if isinstance(result, list) else [result]
for item in results_to_check: for item in results_to_check:
@@ -1226,6 +1448,10 @@ class Download_Media(Cmdlet):
# Get other options # Get other options
clip_spec = parsed.get("clip") clip_spec = parsed.get("clip")
# Always enable chapters + subtitles so downstream pipes (e.g. mpv) can consume them.
embed_chapters = True
write_sub = True
mode = "audio" if parsed.get("audio") else "video" mode = "audio" if parsed.get("audio") else "video"
# Parse clip range(s) if specified # Parse clip range(s) if specified
@@ -1379,7 +1605,14 @@ class Download_Media(Cmdlet):
if playlist_items: if playlist_items:
return str(requested_url) return str(requested_url)
try: try:
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15) cf = None
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
pr = probe_url(requested_url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
if isinstance(pr, dict): if isinstance(pr, dict):
for key in ("webpage_url", "original_url", "url", "requested_url"): for key in ("webpage_url", "original_url", "url", "requested_url"):
value = pr.get(key) value = pr.get(key)
@@ -1458,7 +1691,14 @@ class Download_Media(Cmdlet):
- selected_urls: Optional[List[str]] (expanded per-entry urls when available) - selected_urls: Optional[List[str]] (expanded per-entry urls when available)
""" """
try: try:
pr = probe_url(url, no_playlist=False, timeout_seconds=15) cf = None
try:
cookie_path = ytdlp_tool.resolve_cookiefile()
if cookie_path is not None and cookie_path.is_file():
cf = str(cookie_path)
except Exception:
cf = None
pr = probe_url(url, no_playlist=False, timeout_seconds=15, cookiefile=cf)
except Exception: except Exception:
pr = None pr = None
if not isinstance(pr, dict): if not isinstance(pr, dict):
@@ -1686,6 +1926,15 @@ class Download_Media(Cmdlet):
filesize = fmt.get("filesize") filesize = fmt.get("filesize")
format_id = fmt.get("format_id", "") format_id = fmt.get("format_id", "")
# If the chosen format is video-only (no audio stream), automatically
# request best audio too so the resulting file has sound.
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id
# Format size # Format size
size_str = "" size_str = ""
if filesize: if filesize:
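Note: in yt-dlp selector syntax, + requests a stream merge and ba abbreviates bestaudio, so a bare video-only id like 137 becomes 137+ba. The guard in isolation (helper name hypothetical):

def _merge_best_audio(format_id: str, vcodec: str, acodec: str) -> str:
    # Video stream with no audio track: merge in best audio so the output has sound.
    if vcodec != "none" and acodec == "none" and format_id:
        return f"{format_id}+ba"
    return format_id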
@@ -1729,9 +1978,9 @@ class Download_Media(Cmdlet):
"full_metadata": { "full_metadata": {
"format_id": format_id, "format_id": format_id,
"url": url, "url": url,
"item_selector": format_id, "item_selector": selection_format_id,
}, },
"_selection_args": ["-format", format_id] "_selection_args": ["-format", selection_format_id]
} }
# Add to results list and table (don't emit - formats should wait for @N selection) # Add to results list and table (don't emit - formats should wait for @N selection)
@@ -1778,23 +2027,57 @@ class Download_Media(Cmdlet):
actual_format = playlist_items actual_format = playlist_items
actual_playlist_items = None actual_playlist_items = None
# Auto-pick best audio format when -audio is used and no explicit format is given. # For -audio, default to yt-dlp's built-in bestaudio selector.
# This should *not* require interactive format picking.
if mode == "audio" and not actual_format: if mode == "audio" and not actual_format:
chosen = None actual_format = "bestaudio"
# If no explicit format is provided for video mode, allow a config override.
if mode == "video" and not actual_format:
configured = (ytdlp_tool.default_format("video") or "").strip()
if configured and configured != "bestvideo+bestaudio/best":
actual_format = configured
# If a single format id was chosen and it is video-only, auto-merge best audio.
if (
actual_format
and isinstance(actual_format, str)
and mode != "audio"
and "+" not in actual_format
and "/" not in actual_format
and "[" not in actual_format
and actual_format not in {"best", "bv", "ba", "b"}
):
try:
formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items) formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items)
if formats: if formats:
chosen = _pick_best_audio_format_id(formats) fmt_match = next(
actual_format = chosen or "bestaudio/best" (f for f in formats if str(f.get("format_id", "")) == actual_format),
None,
)
if fmt_match:
vcodec = str(fmt_match.get("vcodec", "none"))
acodec = str(fmt_match.get("acodec", "none"))
if vcodec != "none" and acodec == "none":
debug(
f"Selected video-only format {actual_format}; using {actual_format}+ba for audio"
)
actual_format = f"{actual_format}+ba"
except Exception:
pass
opts = DownloadOptions( opts = DownloadOptions(
url=url, url=url,
mode=mode, mode=mode,
output_dir=final_output_dir, output_dir=final_output_dir,
ytdl_format=actual_format, ytdl_format=actual_format,
cookies_path=ytdlp_tool.resolve_cookiefile(),
clip_sections=clip_sections_spec, clip_sections=clip_sections_spec,
playlist_items=actual_playlist_items, playlist_items=actual_playlist_items,
quiet=quiet_mode, quiet=quiet_mode,
no_playlist=False, no_playlist=False,
embed_chapters=embed_chapters,
write_sub=write_sub,
) )
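Note: a plausible mapping of the two new flags onto yt-dlp options (FFmpegMetadata's add_chapters and the writesubtitles/writeautomaticsub keys are standard yt-dlp; the helper is a sketch, not the code this commit ships):

from typing import Any, Dict

def _apply_chapter_sub_opts(ydl_opts: Dict[str, Any],
                            embed_chapters: bool, write_sub: bool) -> Dict[str, Any]:
    if embed_chapters:
        # CLI equivalent: --embed-chapters
        ydl_opts.setdefault("postprocessors", []).append(
            {"key": "FFmpegMetadata", "add_chapters": True, "add_metadata": False}
        )
    if write_sub:
        ydl_opts["writesubtitles"] = True      # uploaded subtitle tracks
        ydl_opts["writeautomaticsub"] = True   # auto-generated captions
    return ydl_opts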
# Use timeout wrapper to prevent hanging # Use timeout wrapper to prevent hanging
@@ -1838,7 +2121,40 @@ class Download_Media(Cmdlet):
# Build PipeObjects first so we can attach cross-clip relationships. # Build PipeObjects first so we can attach cross-clip relationships.
pipe_objects: List[Dict[str, Any]] = [] pipe_objects: List[Dict[str, Any]] = []
for downloaded in results_to_emit: for downloaded in results_to_emit:
pipe_objects.append(self._build_pipe_object(downloaded, url, opts)) po = self._build_pipe_object(downloaded, url, opts)
# Attach chapter timestamps for downstream consumers (e.g., mpv scripts)
# even if container embedding fails.
try:
info = downloaded.info if isinstance(getattr(downloaded, "info", None), dict) else {}
except Exception:
info = {}
chapters_text = _format_chapters_note(info) if embed_chapters else None
if chapters_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes.setdefault("chapters", chapters_text)
po["notes"] = notes
if write_sub:
try:
media_path = Path(str(po.get("path") or ""))
except Exception:
media_path = None
if media_path is not None and media_path.exists() and media_path.is_file():
sub_path = _best_subtitle_sidecar(media_path)
if sub_path is not None:
sub_text = _read_text_file(sub_path)
if sub_text:
notes = po.get("notes")
if not isinstance(notes, dict):
notes = {}
notes["sub"] = sub_text
po["notes"] = notes
pipe_objects.append(po)
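Note: _format_chapters_note, _read_text_file, and _best_subtitle_sidecar are not shown in this diff; a sketch of the sidecar lookup, assuming yt-dlp's usual <stem>.<lang>.vtt naming:

from pathlib import Path
from typing import Optional

def _best_subtitle_sidecar(media_path: Path) -> Optional[Path]:
    # yt-dlp writes sidecars like "clip.en.vtt" next to "clip.mp4".
    candidates: list[Path] = []
    for ext in (".vtt", ".srt"):
        candidates.extend(media_path.parent.glob(media_path.stem + ".*" + ext))
    # Prefer English tracks, then the shortest (most specific) name.
    candidates.sort(key=lambda p: ("en" not in p.name.lower().split("."), len(p.name)))
    return candidates[0] if candidates else None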
# If this is a clip download, decorate titles/tags so the title: tag is clip-based. # If this is a clip download, decorate titles/tags so the title: tag is clip-based.
# Relationship tags are only added when multiple clips exist. # Relationship tags are only added when multiple clips exist.
@@ -1868,6 +2184,95 @@ class Download_Media(Cmdlet):
debug("✓ Downloaded and emitted") debug("✓ Downloaded and emitted")
except DownloadError as e: except DownloadError as e:
# Special-case yt-dlp format errors: show a selectable format list table so
# the user can pick a working format_id and continue the pipeline via @N.
cause = getattr(e, "__cause__", None)
detail = ""
try:
detail = str(cause or "")
except Exception:
detail = ""
if "requested format is not available" in (detail or "").lower() and mode != "audio":
formats = list_formats(url, no_playlist=False, playlist_items=actual_playlist_items)
if formats:
formats_to_show = formats
table = ResultTable()
table.title = f"Available formats for {url}"
table.set_source_command("download-media", [str(a) for a in (args or [])])
results_list: List[Dict[str, Any]] = []
for idx, fmt in enumerate(formats_to_show, 1):
resolution = fmt.get("resolution", "")
ext = fmt.get("ext", "")
vcodec = fmt.get("vcodec", "none")
acodec = fmt.get("acodec", "none")
filesize = fmt.get("filesize")
format_id = fmt.get("format_id", "")
selection_format_id = format_id
try:
if vcodec != "none" and acodec == "none" and format_id:
selection_format_id = f"{format_id}+ba"
except Exception:
selection_format_id = format_id
size_str = ""
if filesize:
try:
size_mb = float(filesize) / (1024 * 1024)
size_str = f"{size_mb:.1f}MB"
except Exception:
size_str = ""
desc_parts: List[str] = []
if resolution and resolution != "audio only":
desc_parts.append(str(resolution))
if ext:
desc_parts.append(str(ext).upper())
if vcodec != "none":
desc_parts.append(f"v:{vcodec}")
if acodec != "none":
desc_parts.append(f"a:{acodec}")
if size_str:
desc_parts.append(size_str)
format_desc = " | ".join(desc_parts)
format_dict: Dict[str, Any] = {
"table": "download-media",
"title": f"Format {format_id}",
"url": url,
"target": url,
"detail": format_desc,
"media_kind": "format",
"columns": [
("#", str(idx)),
("ID", format_id),
("Resolution", resolution or "N/A"),
("Ext", ext),
("Video", vcodec),
("Audio", acodec),
("Size", size_str or "N/A"),
],
"full_metadata": {
"format_id": format_id,
"url": url,
"item_selector": selection_format_id,
},
"_selection_args": ["-format", selection_format_id],
}
results_list.append(format_dict)
table.add_result(format_dict)
pipeline_context.set_current_stage_table(table)
pipeline_context.set_last_result_table(table, results_list)
# Returning 0 with no emits lets the CLI pause the pipeline for @N selection.
log("Requested format is not available; select a working format with @N", file=sys.stderr)
return 0
log(f"Download failed for {url}: {e}", file=sys.stderr) log(f"Download failed for {url}: {e}", file=sys.stderr)
except Exception as e: except Exception as e:
log(f"Error processing {url}: {e}", file=sys.stderr) log(f"Error processing {url}: {e}", file=sys.stderr)
@@ -15,9 +15,9 @@ from pathlib import Path
from typing import Any, Dict, Optional, Sequence from typing import Any, Dict, Optional, Sequence
from SYS.logger import log from SYS.logger import log
from ._shared import Cmdlet, CmdletArg, parse_cmdlet_args from . import _shared as sh
class Download_Torrent(Cmdlet): class Download_Torrent(sh.Cmdlet):
"""Class-based download-torrent cmdlet with self-registration.""" """Class-based download-torrent cmdlet with self-registration."""
def __init__(self) -> None: def __init__(self) -> None:
@@ -27,10 +27,10 @@ class Download_Torrent(Cmdlet):
usage="download-torrent <magnet|.torrent> [options]", usage="download-torrent <magnet|.torrent> [options]",
alias=["torrent", "magnet"], alias=["torrent", "magnet"],
arg=[ arg=[
CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True), sh.CmdletArg(name="magnet", type="string", required=False, description="Magnet link or .torrent file/URL", variadic=True),
CmdletArg(name="output", type="string", description="Output directory for downloaded files"), sh.CmdletArg(name="output", type="string", description="Output directory for downloaded files"),
CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"), sh.CmdletArg(name="wait", type="float", description="Wait time (seconds) for magnet processing timeout"),
CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"), sh.CmdletArg(name="background", type="flag", alias="bg", description="Start download in background"),
], ],
detail=["Download torrents/magnets via AllDebrid API."], detail=["Download torrents/magnets via AllDebrid API."],
exec=self.run, exec=self.run,
@@ -38,7 +38,7 @@ class Download_Torrent(Cmdlet):
self.register() self.register()
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
parsed = parse_cmdlet_args(args, self) parsed = sh.parse_cmdlet_args(args, self)
magnet_args = parsed.get("magnet", []) magnet_args = parsed.get("magnet", [])
output_dir = Path(parsed.get("output") or Path.home() / "Downloads") output_dir = Path(parsed.get("output") or Path.home() / "Downloads")
wait_timeout = int(float(parsed.get("wait", 600))) wait_timeout = int(float(parsed.get("wait", 600)))
@@ -9,13 +9,13 @@ import subprocess
import webbrowser import webbrowser
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from . import _shared as sh
from SYS.logger import log, debug from SYS.logger import log, debug
from Store import Store from Store import Store
from config import resolve_output_dir from config import resolve_output_dir
class Get_File(Cmdlet): class Get_File(sh.Cmdlet):
"""Export files to local path via hash+store.""" """Export files to local path via hash+store."""
def __init__(self) -> None: def __init__(self) -> None:
@@ -25,10 +25,10 @@ class Get_File(Cmdlet):
summary="Export file to local path", summary="Export file to local path",
usage="@1 | get-file -path C:\\Downloads", usage="@1 | get-file -path C:\\Downloads",
arg=[ arg=[
SharedArgs.HASH, sh.SharedArgs.HASH,
SharedArgs.STORE, sh.SharedArgs.STORE,
SharedArgs.PATH, sh.SharedArgs.PATH,
CmdletArg("name", description="Output filename (default: from metadata title)"), sh.CmdletArg("name", description="Output filename (default: from metadata title)"),
], ],
detail=[ detail=[
"- Exports file from storage backend to local path", "- Exports file from storage backend to local path",
@@ -42,12 +42,12 @@ class Get_File(Cmdlet):
def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: def run(self, result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
"""Export file via hash+store backend.""" """Export file via hash+store backend."""
debug(f"[get-file] run() called with result type: {type(result)}") debug(f"[get-file] run() called with result type: {type(result)}")
parsed = parse_cmdlet_args(args, self) parsed = sh.parse_cmdlet_args(args, self)
debug(f"[get-file] parsed args: {parsed}") debug(f"[get-file] parsed args: {parsed}")
# Extract hash and store from result or args # Extract hash and store from result or args
file_hash = parsed.get("hash") or get_field(result, "hash") file_hash = parsed.get("hash") or sh.get_field(result, "hash")
store_name = parsed.get("store") or get_field(result, "store") store_name = parsed.get("store") or sh.get_field(result, "store")
output_path = parsed.get("path") output_path = parsed.get("path")
output_name = parsed.get("name") output_name = parsed.get("name")
@@ -62,7 +62,7 @@ class Get_File(Cmdlet):
return 1 return 1
# Normalize hash # Normalize hash
file_hash = normalize_hash(file_hash) file_hash = sh.normalize_hash(file_hash)
if not file_hash: if not file_hash:
log("Error: Invalid hash format") log("Error: Invalid hash format")
return 1 return 1
@@ -84,9 +84,9 @@ class Get_File(Cmdlet):
def resolve_display_title() -> str: def resolve_display_title() -> str:
candidates = [ candidates = [
get_field(result, "title"), sh.get_field(result, "title"),
get_field(result, "name"), sh.get_field(result, "name"),
get_field(result, "filename"), sh.get_field(result, "filename"),
(metadata.get("title") if isinstance(metadata, dict) else None), (metadata.get("title") if isinstance(metadata, dict) else None),
(metadata.get("name") if isinstance(metadata, dict) else None), (metadata.get("name") if isinstance(metadata, dict) else None),
(metadata.get("filename") if isinstance(metadata, dict) else None), (metadata.get("filename") if isinstance(metadata, dict) else None),
@@ -7,7 +7,13 @@ import sys
from SYS.logger import log from SYS.logger import log
from pathlib import Path from pathlib import Path
from ._shared import Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
import pipeline as ctx import pipeline as ctx
from result_table import ResultTable from result_table import ResultTable
@@ -74,9 +80,15 @@ class Get_Metadata(Cmdlet):
hash_value: Optional[str], pages: Optional[int] = None) -> Dict[str, Any]: hash_value: Optional[str], pages: Optional[int] = None) -> Dict[str, Any]:
"""Build a table row dict with metadata fields.""" """Build a table row dict with metadata fields."""
size_mb = None size_mb = None
if isinstance(size_bytes, int): size_int: Optional[int] = None
if size_bytes is not None:
try: try:
size_mb = int(size_bytes / (1024 * 1024)) size_int = int(size_bytes)
except Exception:
size_int = None
if isinstance(size_int, int):
try:
size_mb = int(size_int / (1024 * 1024))
except Exception: except Exception:
size_mb = None size_mb = None
@@ -105,7 +117,7 @@ class Get_Metadata(Cmdlet):
"path": path, "path": path,
"store": store, "store": store,
"mime": mime, "mime": mime,
"size_bytes": size_bytes, "size_bytes": size_int,
"duration_seconds": dur_int, "duration_seconds": dur_int,
"pages": pages_int, "pages": pages_int,
"imported_ts": imported_ts, "imported_ts": imported_ts,
@@ -237,8 +249,8 @@ class Get_Metadata(Cmdlet):
pages=pages, pages=pages,
) )
table_title = title table_title = f"get-metadata: {title}" if title else "get-metadata"
table = ResultTable(table_title).init_command("get-metadata", list(args)) table = ResultTable(table_title).init_command(table_title, "get-metadata", list(args))
self._add_table_body_row(table, row) self._add_table_body_row(table, row)
ctx.set_last_result_table_overlay(table, [row], row) ctx.set_last_result_table_overlay(table, [row], row)
ctx.emit(row) ctx.emit(row)
@@ -7,15 +7,15 @@ import sys
from SYS.logger import log from SYS.logger import log
import pipeline as ctx import pipeline as ctx
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
SharedArgs, CmdletArg = sh.CmdletArg
normalize_hash, SharedArgs = sh.SharedArgs
parse_cmdlet_args, normalize_hash = sh.normalize_hash
normalize_result_input, parse_cmdlet_args = sh.parse_cmdlet_args
should_show_help, normalize_result_input = sh.normalize_result_input
) should_show_help = sh.should_show_help
from Store import Store from Store import Store
from SYS.utils import sha256_file from SYS.utils import sha256_file
@@ -10,7 +10,17 @@ from SYS.logger import log
import models import models
import pipeline as ctx import pipeline as ctx
from API import HydrusNetwork as hydrus_wrapper from API import HydrusNetwork as hydrus_wrapper
from ._shared import Cmdlet, CmdletArg, SharedArgs, normalize_hash, fmt_bytes, get_hash_for_operation, fetch_hydrus_metadata, should_show_help, get_field from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
normalize_hash = sh.normalize_hash
fmt_bytes = sh.fmt_bytes
get_hash_for_operation = sh.get_hash_for_operation
fetch_hydrus_metadata = sh.fetch_hydrus_metadata
should_show_help = sh.should_show_help
get_field = sh.get_field
from API.folder import API_folder_store from API.folder import API_folder_store
from config import get_local_storage_path from config import get_local_storage_path
from result_table import ResultTable from result_table import ResultTable
@@ -224,13 +234,14 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
try: try:
client = None client = None
store_label = "hydrus" store_label = "hydrus"
backend_obj = None
if store_name: if store_name:
# Store specified: do not fall back to a global/default Hydrus client. # Store specified: do not fall back to a global/default Hydrus client.
store_label = str(store_name) store_label = str(store_name)
try: try:
store = Store(config) store = Store(config)
backend = store[str(store_name)] backend_obj = store[str(store_name)]
candidate = getattr(backend, "_client", None) candidate = getattr(backend_obj, "_client", None)
if candidate is not None and hasattr(candidate, "get_file_relationships"): if candidate is not None and hasattr(candidate, "get_file_relationships"):
client = candidate client = candidate
except Exception: except Exception:
@@ -241,6 +252,74 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
else: else:
client = hydrus_wrapper.get_client(config) client = hydrus_wrapper.get_client(config)
def _resolve_related_title(rel_hash: str) -> str:
"""Best-effort resolve a Hydrus hash to a human title.
Preference order:
- title: tag from the backend (fast path)
- Hydrus metadata tags via fetch_hydrus_metadata
- fallback to short hash
"""
h = normalize_hash(rel_hash)
if not h:
return str(rel_hash)
# Prefer backend tag extraction when available.
if backend_obj is not None and hasattr(backend_obj, "get_tag"):
try:
tag_result = backend_obj.get_tag(h)
tags = tag_result[0] if isinstance(tag_result, tuple) and tag_result else tag_result
if isinstance(tags, list):
for t in tags:
if isinstance(t, str) and t.lower().startswith("title:"):
val = t.split(":", 1)[1].strip()
if val:
return val
except Exception:
pass
# Fallback: fetch minimal metadata and scan for a title tag.
try:
meta, _ = fetch_hydrus_metadata(
config,
h,
store_name=store_label if store_name else None,
hydrus_client=client,
include_service_keys_to_tags=True,
include_file_url=False,
include_duration=False,
include_size=False,
include_mime=False,
)
if isinstance(meta, dict):
tags_payload = meta.get("tags")
tag_candidates: list[str] = []
if isinstance(tags_payload, dict):
for svc_data in tags_payload.values():
if not isinstance(svc_data, dict):
continue
storage = svc_data.get("storage_tags")
if isinstance(storage, dict):
for group in storage.values():
if isinstance(group, list):
tag_candidates.extend([str(x) for x in group if isinstance(x, str)])
display = svc_data.get("display_tags")
if isinstance(display, list):
tag_candidates.extend([str(x) for x in display if isinstance(x, str)])
flat = meta.get("tags_flat")
if isinstance(flat, list):
tag_candidates.extend([str(x) for x in flat if isinstance(x, str)])
for t in tag_candidates:
if isinstance(t, str) and t.lower().startswith("title:"):
val = t.split(":", 1)[1].strip()
if val:
return val
except Exception:
pass
return h[:16] + "..."
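Note: the metadata shape this fallback tolerates is roughly the following (service key shortened; real Hydrus payloads vary):

meta = {
    "tags": {
        "6c6f63616c": {
            "storage_tags": {"0": ["title:Beach Trip 2019", "creator:someone"]},
            "display_tags": ["title:Beach Trip 2019"],
        }
    },
    "tags_flat": ["title:Beach Trip 2019"],
}
# _resolve_related_title(...) would return "Beach Trip 2019" for this payload.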
if client: if client:
rel = client.get_file_relationships(hash_hex) rel = client.get_file_relationships(hash_hex)
if rel: if rel:
@@ -274,7 +353,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({ found_relationships.append({
"hash": king_hash, "hash": king_hash,
"type": "king", "type": "king",
"title": king_hash, "title": _resolve_related_title(king_hash),
"path": None, "path": None,
"store": store_label, "store": store_label,
}) })
@@ -292,7 +371,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({ found_relationships.append({
"hash": rel_hash_norm, "hash": rel_hash_norm,
"type": rel_name, "type": rel_name,
"title": rel_hash_norm, # Can't resolve title easily without another API call "title": _resolve_related_title(rel_hash_norm),
"path": None, "path": None,
"store": store_label, "store": store_label,
}) })
@@ -304,7 +383,7 @@ def _run(result: Any, _args: Sequence[str], config: Dict[str, Any]) -> int:
found_relationships.append({ found_relationships.append({
"hash": rel_hash_norm, "hash": rel_hash_norm,
"type": rel_name, "type": rel_name,
"title": rel_hash_norm, "title": _resolve_related_title(rel_hash_norm),
"path": None, "path": None,
"store": store_label, "store": store_label,
}) })
@@ -27,7 +27,15 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
import pipeline as ctx import pipeline as ctx
from API import HydrusNetwork from API import HydrusNetwork
from API.folder import read_sidecar, write_sidecar, find_sidecar, API_folder_store from API.folder import read_sidecar, write_sidecar, find_sidecar, API_folder_store
from ._shared import normalize_hash, looks_like_hash, Cmdlet, CmdletArg, SharedArgs, parse_cmdlet_args, get_field from . import _shared as sh
normalize_hash = sh.normalize_hash
looks_like_hash = sh.looks_like_hash
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
parse_cmdlet_args = sh.parse_cmdlet_args
get_field = sh.get_field
from config import get_local_storage_path from config import get_local_storage_path
@@ -5,7 +5,15 @@ from typing import Any, Dict, List, Sequence
import sys import sys
import pipeline as ctx import pipeline as ctx
from ._shared import Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash from . import _shared as sh
Cmdlet, SharedArgs, parse_cmdlet_args, get_field, normalize_hash = (
sh.Cmdlet,
sh.SharedArgs,
sh.parse_cmdlet_args,
sh.get_field,
sh.normalize_hash,
)
from SYS.logger import log from SYS.logger import log
from Store import Store from Store import Store
@@ -12,17 +12,17 @@ import re as _re
from config import resolve_output_dir from config import resolve_output_dir
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
create_pipe_object_result, CmdletArg = sh.CmdletArg
get_field, create_pipe_object_result = sh.create_pipe_object_result
get_pipe_object_hash, get_field = sh.get_field
get_pipe_object_path, get_pipe_object_hash = sh.get_pipe_object_hash
normalize_result_input, get_pipe_object_path = sh.get_pipe_object_path
parse_cmdlet_args, normalize_result_input = sh.normalize_result_input
should_show_help, parse_cmdlet_args = sh.parse_cmdlet_args
) should_show_help = sh.should_show_help
import pipeline as ctx import pipeline as ctx
@@ -20,7 +20,16 @@ from urllib.parse import urlsplit, quote, urljoin
from SYS.logger import log, debug from SYS.logger import log, debug
from API.HTTP import HTTPClient from API.HTTP import HTTPClient
from SYS.utils import ensure_directory, unique_path, unique_preserve_order from SYS.utils import ensure_directory, unique_path, unique_preserve_order
from ._shared import Cmdlet, CmdletArg, SharedArgs, create_pipe_object_result, normalize_result_input, should_show_help, get_field from . import _shared as sh
Cmdlet = sh.Cmdlet
CmdletArg = sh.CmdletArg
SharedArgs = sh.SharedArgs
create_pipe_object_result = sh.create_pipe_object_result
normalize_result_input = sh.normalize_result_input
should_show_help = sh.should_show_help
get_field = sh.get_field
parse_cmdlet_args = sh.parse_cmdlet_args
import pipeline as pipeline_context import pipeline as pipeline_context
# ============================================================================ # ============================================================================
@@ -33,20 +42,7 @@ import pipeline as pipeline_context
# Playwright & Screenshot Dependencies # Playwright & Screenshot Dependencies
# ============================================================================ # ============================================================================
try: from tool.playwright import HAS_PLAYWRIGHT, PlaywrightTimeoutError, PlaywrightTool
from playwright.sync_api import (
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
HAS_PLAYWRIGHT = True
except Exception:
HAS_PLAYWRIGHT = False
PlaywrightTimeoutError = TimeoutError # type: ignore
def sync_playwright(*_args: Any, **_kwargs: Any) -> Any: # type: ignore
raise RuntimeError(
"playwright is required for screenshot capture; install with: pip install playwright; then: playwright install"
)
try: try:
from config import resolve_output_dir from config import resolve_output_dir
@@ -128,6 +124,7 @@ class ScreenshotOptions:
prefer_platform_target: bool = False prefer_platform_target: bool = False
target_selectors: Optional[Sequence[str]] = None target_selectors: Optional[Sequence[str]] = None
selector_timeout_ms: int = 10_000 selector_timeout_ms: int = 10_000
playwright_tool: Optional[PlaywrightTool] = None
@dataclass(slots=True) @dataclass(slots=True)
@@ -324,33 +321,22 @@ def _prepare_output_path(options: ScreenshotOptions) -> Path:
def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None: def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str]) -> None:
"""Capture screenshot using Playwright.""" """Capture screenshot using Playwright."""
debug(f"[_capture] Starting capture for {options.url} -> {destination}") debug(f"[_capture] Starting capture for {options.url} -> {destination}")
playwright = None
browser = None
context = None
try: try:
debug("Starting Playwright...", flush=True) tool = options.playwright_tool or PlaywrightTool({})
playwright = sync_playwright().start() tool.debug_dump()
log("Launching Chromium browser...", flush=True)
log("Launching browser...", flush=True)
format_name = _normalise_format(options.output_format) format_name = _normalise_format(options.output_format)
headless = options.headless or format_name == "pdf" headless = options.headless or format_name == "pdf"
debug(f"[_capture] Format: {format_name}, Headless: {headless}") debug(f"[_capture] Format: {format_name}, Headless: {headless}")
if format_name == "pdf" and not options.headless: if format_name == "pdf" and not options.headless:
warnings.append("pdf output requires headless Chromium; overriding headless mode") warnings.append("pdf output requires headless Chromium; overriding headless mode")
browser = playwright.chromium.launch(
headless=headless, with tool.open_page(headless=headless) as page:
args=["--disable-blink-features=AutomationControlled"],
)
log("Creating browser context...", flush=True)
context = browser.new_context(
user_agent=USER_AGENT,
viewport=DEFAULT_VIEWPORT,
ignore_https_errors=True,
)
page = context.new_page()
log(f"Navigating to {options.url}...", flush=True) log(f"Navigating to {options.url}...", flush=True)
try: try:
page.goto(options.url, timeout=90_000, wait_until="domcontentloaded") tool.goto(page, options.url)
log("Page loaded successfully", flush=True) log("Page loaded successfully", flush=True)
except PlaywrightTimeoutError: except PlaywrightTimeoutError:
warnings.append("navigation timeout; capturing current page state") warnings.append("navigation timeout; capturing current page state")
@@ -448,18 +434,6 @@ def _capture(options: ScreenshotOptions, destination: Path, warnings: List[str])
except Exception as exc: except Exception as exc:
debug(f"[_capture] Exception: {exc}") debug(f"[_capture] Exception: {exc}")
raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc raise ScreenshotError(f"Failed to capture screenshot: {exc}") from exc
finally:
log("Cleaning up browser resources...", flush=True)
with contextlib.suppress(Exception):
if context is not None:
context.close()
with contextlib.suppress(Exception):
if browser is not None:
browser.close()
with contextlib.suppress(Exception):
if playwright is not None:
playwright.stop()
log("Cleanup complete", flush=True)
def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult: def _capture_screenshot(options: ScreenshotOptions) -> ScreenshotResult:
@@ -511,8 +485,6 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
Screenshots are created using Playwright and marked as temporary Screenshots are created using Playwright and marked as temporary
so they can be cleaned up later with the cleanup cmdlet. so they can be cleaned up later with the cleanup cmdlet.
""" """
from ._shared import parse_cmdlet_args
debug(f"[_run] screen-shot invoked with args: {args}") debug(f"[_run] screen-shot invoked with args: {args}")
# Help check # Help check
@@ -534,6 +506,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
parsed = parse_cmdlet_args(args, CMDLET) parsed = parse_cmdlet_args(args, CMDLET)
format_value = parsed.get("format") format_value = parsed.get("format")
if not format_value:
# Default format can be set via config.conf tool block:
# [tool=playwright]
# format="pdf"
try:
tool_cfg = config.get("tool", {}) if isinstance(config, dict) else {}
pw_cfg = tool_cfg.get("playwright") if isinstance(tool_cfg, dict) else None
if isinstance(pw_cfg, dict):
format_value = pw_cfg.get("format")
except Exception:
pass
if not format_value:
format_value = "png"
storage_value = parsed.get("storage") storage_value = parsed.get("storage")
selector_arg = parsed.get("selector") selector_arg = parsed.get("selector")
selectors = [selector_arg] if selector_arg else [] selectors = [selector_arg] if selector_arg else []
@@ -669,6 +654,7 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
prefer_platform_target=False, prefer_platform_target=False,
wait_for_article=False, wait_for_article=False,
full_page=True, full_page=True,
playwright_tool=PlaywrightTool(config),
) )
screenshot_result = _capture_screenshot(options) screenshot_result = _capture_screenshot(options)
@@ -10,7 +10,13 @@ import importlib
from SYS.logger import log, debug from SYS.logger import log, debug
from ProviderCore.registry import get_search_provider, list_search_providers from ProviderCore.registry import get_search_provider, list_search_providers
from ._shared import Cmdlet, CmdletArg, should_show_help from . import _shared as sh
Cmdlet, CmdletArg, should_show_help = (
sh.Cmdlet,
sh.CmdletArg,
sh.should_show_help,
)
import pipeline as ctx import pipeline as ctx
# Optional dependencies # Optional dependencies
@@ -10,7 +10,17 @@ import sys
from SYS.logger import log, debug from SYS.logger import log, debug
from ._shared import Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag from . import _shared as sh
Cmdlet, CmdletArg, SharedArgs, get_field, should_show_help, normalize_hash, first_title_tag = (
sh.Cmdlet,
sh.CmdletArg,
sh.SharedArgs,
sh.get_field,
sh.should_show_help,
sh.normalize_hash,
sh.first_title_tag,
)
import pipeline as ctx import pipeline as ctx
@@ -209,6 +219,10 @@ class Search_Store(Cmdlet):
table_title += f" [{storage_backend}]" table_title += f" [{storage_backend}]"
table = ResultTable(table_title) table = ResultTable(table_title)
try:
table.set_source_command("search-store", list(args_list))
except Exception:
pass
if hash_query: if hash_query:
try: try:
table.set_preserve_order(True) table.set_preserve_order(True)
@@ -309,6 +323,11 @@ class Search_Store(Cmdlet):
ext_val = Path(path_str).suffix ext_val = Path(path_str).suffix
except Exception: except Exception:
ext_val = None ext_val = None
if not ext_val and title:
try:
ext_val = Path(str(title)).suffix
except Exception:
ext_val = None
size_bytes = meta_obj.get("size") size_bytes = meta_obj.get("size")
if size_bytes is None: if size_bytes is None:
@@ -333,6 +352,20 @@ class Search_Store(Cmdlet):
ctx.emit(payload) ctx.emit(payload)
if found_any: if found_any:
# Title should reflect the command, query, and only stores present in the table.
store_counts: "OrderedDict[str, int]" = OrderedDict()
for row_item in results_list:
store_val = str(row_item.get("store") or "").strip()
if not store_val:
continue
if store_val not in store_counts:
store_counts[store_val] = 0
store_counts[store_val] += 1
counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0)
base_title = f"search-store: {query}".strip()
table.title = f"{base_title} | {counts_part}" if counts_part else base_title
ctx.set_last_result_table(table, results_list) ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
db.update_worker_status(worker_id, 'completed') db.update_worker_status(worker_id, 'completed')
@@ -377,28 +410,6 @@ class Search_Store(Cmdlet):
log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr) log(f"Backend {backend_name} search failed: {exc}", file=sys.stderr)
results = all_results[:limit] results = all_results[:limit]
def _format_storage_label(name: str) -> str:
clean = str(name or "").strip()
if not clean:
return "Unknown"
return clean.replace("_", " ").title()
storage_counts: OrderedDict[str, int] = OrderedDict((name, 0) for name in searched_backends)
for item in results or []:
store = get_field(item, "store")
if not store:
continue
key = str(store).lower()
if key not in storage_counts:
storage_counts[key] = 0
storage_counts[key] += 1
if storage_counts or query:
display_counts = OrderedDict((_format_storage_label(name), count) for name, count in storage_counts.items())
summary_line = table.set_storage_summary(display_counts, query, inline=True)
if summary_line:
table.title = summary_line
if results: if results:
for item in results: for item in results:
def _as_dict(obj: Any) -> Dict[str, Any]: def _as_dict(obj: Any) -> Dict[str, Any]:
@@ -428,6 +439,20 @@ class Search_Store(Cmdlet):
results_list.append(normalized) results_list.append(normalized)
ctx.emit(normalized) ctx.emit(normalized)
# Title should reflect the command, query, and only stores present in the table.
store_counts: "OrderedDict[str, int]" = OrderedDict()
for row_item in results_list:
store_val = str(row_item.get("store") or "").strip()
if not store_val:
continue
if store_val not in store_counts:
store_counts[store_val] = 0
store_counts[store_val] += 1
counts_part = " ".join(f"{name}:{count}" for name, count in store_counts.items() if count > 0)
base_title = f"search-store: {query}".strip()
table.title = f"{base_title} | {counts_part}" if counts_part else base_title
ctx.set_last_result_table(table, results_list) ctx.set_last_result_table(table, results_list)
db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2)) db.append_worker_stdout(worker_id, json.dumps(results_list, indent=2))
else: else:
@@ -11,14 +11,14 @@ import re
from SYS.logger import log, debug from SYS.logger import log, debug
from SYS.utils import sha256_file from SYS.utils import sha256_file
from ._shared import ( from . import _shared as sh
Cmdlet,
CmdletArg, Cmdlet = sh.Cmdlet
parse_cmdlet_args, CmdletArg = sh.CmdletArg
normalize_result_input, parse_cmdlet_args = sh.parse_cmdlet_args
extract_tag_from_result, normalize_result_input = sh.normalize_result_input
extract_title_from_result extract_tag_from_result = sh.extract_tag_from_result
) extract_title_from_result = sh.extract_title_from_result
import pipeline as ctx import pipeline as ctx
CMDLET = Cmdlet( CMDLET = Cmdlet(
@@ -26,9 +26,9 @@ def ensure_registry_loaded() -> None:
def _normalize_mod_name(mod_name: str) -> str: def _normalize_mod_name(mod_name: str) -> str:
"""Normalize a command/module name for import resolution.""" """Normalize a command/module name for import resolution."""
normalized = (mod_name or "").strip() normalized = (mod_name or "").strip()
if normalized.startswith('.'): if normalized.startswith("."):
normalized = normalized.lstrip('.') normalized = normalized.lstrip(".")
normalized = normalized.replace('-', '_') normalized = normalized.replace("-", "_")
return normalized return normalized
@@ -83,7 +83,7 @@ def get_cmdlet_metadata(cmd_name: str) -> Optional[Dict[str, Any]]:
if data is None: if data is None:
try: try:
reg_fn = (REGISTRY or {}).get(cmd_name.replace('_', '-').lower()) reg_fn = (REGISTRY or {}).get(cmd_name.replace("_", "-").lower())
if reg_fn: if reg_fn:
owner_mod = getattr(reg_fn, "__module__", "") owner_mod = getattr(reg_fn, "__module__", "")
if owner_mod: if owner_mod:
@@ -186,8 +186,6 @@ def get_cmdlet_arg_flags(cmd_name: str) -> List[str]:
if not meta: if not meta:
return [] return []
# Preserve the order that arguments are defined on the cmdlet (arg=[...]) so
# completions feel stable and predictable.
flags: List[str] = [] flags: List[str] = []
seen: set[str] = set() seen: set[str] = set()
@@ -135,7 +135,7 @@ def _render_detail(meta: Dict[str, Any], args: Sequence[str]) -> None:
def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int: def _run(result: Any, args: Sequence[str], config: Dict[str, Any]) -> int:
try: try:
from cmdlet import catalog as _catalog import cmdlet_catalog as _catalog
CMDLET.arg[0].choices = _normalize_choice_list(_catalog.list_cmdlet_names()) CMDLET.arg[0].choices = _normalize_choice_list(_catalog.list_cmdlet_names())
metadata = _catalog.list_cmdlet_metadata() metadata = _catalog.list_cmdlet_metadata()
@@ -16,7 +16,7 @@ from models import PipeObject
from API.folder import LocalLibrarySearchOptimizer from API.folder import LocalLibrarySearchOptimizer
from config import get_local_storage_path, get_hydrus_access_key, get_hydrus_url from config import get_local_storage_path, get_hydrus_access_key, get_hydrus_url
from hydrus_health_check import get_cookies_file_path
_ALLDEBRID_UNLOCK_CACHE: Dict[str, str] = {} _ALLDEBRID_UNLOCK_CACHE: Dict[str, str] = {}
@@ -372,12 +372,18 @@ def _build_hydrus_header(config: Dict[str, Any]) -> Optional[str]:
def _build_ytdl_options(config: Optional[Dict[str, Any]], hydrus_header: Optional[str]) -> Optional[str]: def _build_ytdl_options(config: Optional[Dict[str, Any]], hydrus_header: Optional[str]) -> Optional[str]:
"""Compose ytdl-raw-options string including cookies and optional Hydrus header.""" """Compose ytdl-raw-options string including cookies and optional Hydrus header."""
opts: List[str] = [] opts: List[str] = []
cookies_path = None
try: try:
cookies_path = get_cookies_file_path() from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception: except Exception:
cookies_path = None cookies_path = None
if cookies_path: if cookies_path:
opts.append(f"cookies={cookies_path.replace('\\', '/')}") opts.append(f"cookies={cookies_path.replace('\\', '/')}" )
else: else:
opts.append("cookies-from-browser=chrome") opts.append("cookies-from-browser=chrome")
if hydrus_header: if hydrus_header:
@@ -407,10 +413,18 @@ def _is_hydrus_path(path: str, hydrus_url: Optional[str]) -> bool:
return True return True
return False return False
def _ensure_ytdl_cookies() -> None: def _ensure_ytdl_cookies(config: Optional[Dict[str, Any]] = None) -> None:
"""Ensure yt-dlp options are set correctly for this session.""" """Ensure yt-dlp options are set correctly for this session."""
from pathlib import Path from pathlib import Path
cookies_path = get_cookies_file_path() cookies_path = None
try:
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception:
cookies_path = None
if cookies_path: if cookies_path:
# Check if file exists and has content (use forward slashes for path checking) # Check if file exists and has content (use forward slashes for path checking)
check_path = cookies_path.replace('\\', '/') check_path = cookies_path.replace('\\', '/')
@@ -635,7 +649,7 @@ def _queue_items(
pass pass
# Just verify cookies are configured, don't try to set via IPC # Just verify cookies are configured, don't try to set via IPC
_ensure_ytdl_cookies() _ensure_ytdl_cookies(config)
hydrus_header = _build_hydrus_header(config or {}) hydrus_header = _build_hydrus_header(config or {})
ytdl_opts = _build_ytdl_options(config, hydrus_header) ytdl_opts = _build_ytdl_options(config, hydrus_header)
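Note: mpv's --ytdl-raw-options takes a comma-separated key=value list, so the composed value looks roughly like one of (path illustrative):

# cookies=C:/apps/cookies.txt
# cookies-from-browser=chrome
# ...with the optional Hydrus header appended as a further key=value entry.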
@@ -1426,7 +1440,15 @@ def _start_mpv(items: List[Any], config: Optional[Dict[str, Any]] = None, start_
hydrus_header = _build_hydrus_header(config or {}) hydrus_header = _build_hydrus_header(config or {})
ytdl_opts = _build_ytdl_options(config, hydrus_header) ytdl_opts = _build_ytdl_options(config, hydrus_header)
cookies_path = get_cookies_file_path() cookies_path = None
try:
from tool.ytdlp import YtDlpTool
cookiefile = YtDlpTool(config or {}).resolve_cookiefile()
if cookiefile is not None:
cookies_path = str(cookiefile)
except Exception:
cookies_path = None
if cookies_path: if cookies_path:
debug(f"Starting MPV with cookies file: {cookies_path.replace('\\', '/')}") debug(f"Starting MPV with cookies file: {cookies_path.replace('\\', '/')}")
else: else:
@@ -1,11 +1,5 @@
"""Unified configuration helpers. """
Configuration is defined exclusively via the modular `.conf` format.
- Required: `temp`
- Optional: stores, providers, and other settings
- Modular: optional fragments in `config.d/*.conf` are merged in lexicographic order
""" """
from __future__ import annotations from __future__ import annotations
@@ -130,6 +124,21 @@ def _apply_conf_block(config: Dict[str, Any], kind: str, subtype: str, block: Di
provider[provider_name] = dict(block) provider[provider_name] = dict(block)
return return
if kind_l == "tool":
tool_name = str(subtype).strip().lower()
if not tool_name:
return
tool = config.setdefault("tool", {})
if not isinstance(tool, dict):
config["tool"] = {}
tool = config["tool"]
existing = tool.get(tool_name)
if isinstance(existing, dict):
_merge_dict_inplace(existing, block)
else:
tool[tool_name] = dict(block)
return
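Note: a config.d fragment exercising the new tool-block path (values illustrative; the playwright format key and the ytdlp cookies key are read elsewhere in this commit, while the ytdlp format key is assumed to back YtDlpTool.default_format):

[tool=ytdlp]
cookies="cookies.txt"
format="bestvideo+bestaudio/best"

[tool=playwright]
format="pdf"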
def parse_conf_text(text: str, *, base: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: def parse_conf_text(text: str, *, base: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
"""Parse a lightweight .conf format into the app's config dict. """Parse a lightweight .conf format into the app's config dict.
@@ -227,7 +236,7 @@ def _serialize_conf(config: Dict[str, Any]) -> str:
# Top-level scalars first # Top-level scalars first
for key in sorted(config.keys()): for key in sorted(config.keys()):
if key in {"store", "provider"}: if key in {"store", "provider", "tool"}:
continue continue
value = config.get(key) value = config.get(key)
if isinstance(value, dict): if isinstance(value, dict):
@@ -263,6 +272,18 @@ def _serialize_conf(config: Dict[str, Any]) -> str:
for k in sorted(block.keys()): for k in sorted(block.keys()):
lines.append(f"{k}={_format_conf_value(block.get(k))}") lines.append(f"{k}={_format_conf_value(block.get(k))}")
# Tool blocks
tool = config.get("tool")
if isinstance(tool, dict):
for name in sorted(tool.keys()):
block = tool.get(name)
if not isinstance(block, dict):
continue
lines.append("")
lines.append(f"[tool={name}]")
for k in sorted(block.keys()):
lines.append(f"{k}={_format_conf_value(block.get(k))}")
return "\n".join(lines).rstrip() + "\n" return "\n".join(lines).rstrip() + "\n"
@@ -510,12 +531,43 @@ def get_provider_credentials(config: Dict[str, Any], provider: str) -> Optional[
def resolve_cookies_path(config: Dict[str, Any], script_dir: Optional[Path] = None) -> Optional[Path]: def resolve_cookies_path(config: Dict[str, Any], script_dir: Optional[Path] = None) -> Optional[Path]:
value = config.get("cookies") # Support both legacy top-level `cookies=...` and the modular conf style:
if value: # [tool=ytdlp]
# cookies="C:\\path\\cookies.txt"
values: list[Any] = []
try:
values.append(config.get("cookies"))
except Exception:
pass
try:
tool = config.get("tool")
if isinstance(tool, dict):
ytdlp = tool.get("ytdlp")
if isinstance(ytdlp, dict):
values.append(ytdlp.get("cookies"))
values.append(ytdlp.get("cookiefile"))
except Exception:
pass
try:
ytdlp_block = config.get("ytdlp")
if isinstance(ytdlp_block, dict):
values.append(ytdlp_block.get("cookies"))
values.append(ytdlp_block.get("cookiefile"))
except Exception:
pass
base_dir = script_dir or SCRIPT_DIR
for value in values:
if not value:
continue
candidate = Path(str(value)).expanduser() candidate = Path(str(value)).expanduser()
if not candidate.is_absolute():
candidate = (base_dir / candidate).expanduser()
if candidate.is_file(): if candidate.is_file():
return candidate return candidate
base_dir = script_dir or SCRIPT_DIR
default_path = base_dir / "cookies.txt" default_path = base_dir / "cookies.txt"
if default_path.is_file(): if default_path.is_file():
return default_path return default_path
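Note: usage of the widened lookup, mirroring the order above (path illustrative):

cfg = {"tool": {"ytdlp": {"cookies": "cookies/yt.txt"}}}
path = resolve_cookies_path(cfg)
# Order: top-level cookies=, [tool=ytdlp] cookies/cookiefile, legacy [ytdlp]
# block, then <script_dir>/cookies.txt; relative paths resolve against script_dir.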
@@ -1,43 +0,0 @@
"""Cookies availability helpers.
This module is intentionally limited to cookie-file resolution used by yt-dlp.
Other service availability checks live in their owning store/provider objects.
"""
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from SYS.logger import debug
# Global state for Cookies availability
_COOKIES_FILE_PATH: Optional[str] = None
def initialize_cookies_check(config: Optional[Dict[str, Any]] = None, emit_debug: bool = True) -> Tuple[bool, str]:
"""Resolve cookies file path from config, falling back to cookies.txt in app root.
Returns a tuple of (found, detail_message).
"""
global _COOKIES_FILE_PATH
try:
from config import resolve_cookies_path
cookies_path = resolve_cookies_path(config or {}, script_dir=Path(__file__).parent)
except Exception:
cookies_path = None
if cookies_path and cookies_path.exists():
_COOKIES_FILE_PATH = str(cookies_path)
if emit_debug:
debug(f"Cookies: ENABLED - Found cookies file", file=sys.stderr)
return True, str(cookies_path)
else:
_COOKIES_FILE_PATH = None
return False, "Not found"
def get_cookies_file_path() -> Optional[str]:
"""Get the path to the cookies.txt file if it exists."""
return _COOKIES_FILE_PATH
@@ -348,6 +348,8 @@ class DownloadOptions:
playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8") playlist_items: Optional[str] = None # yt-dlp --playlist-items format (e.g., "1-3,5,8")
no_playlist: bool = False # If True, pass --no-playlist to yt-dlp no_playlist: bool = False # If True, pass --no-playlist to yt-dlp
quiet: bool = False # If True, suppress all console output (progress, debug logs) quiet: bool = False # If True, suppress all console output (progress, debug logs)
embed_chapters: bool = False # If True, pass yt-dlp --embed-chapters / embedchapters
write_sub: bool = False # If True, download subtitles (writesubtitles/writeautomaticsub)
class SendFunc(Protocol): class SendFunc(Protocol):
@@ -35,7 +35,7 @@ dependencies = [
"textual>=0.30.0", "textual>=0.30.0",
# Media processing and downloading # Media processing and downloading
"yt-dlp>=2023.11.0", "yt-dlp[default]>=2023.11.0",
"yt-dlp-ejs", # EJS challenge solver scripts for YouTube JavaScript challenges "yt-dlp-ejs", # EJS challenge solver scripts for YouTube JavaScript challenges
"requests>=2.31.0", "requests>=2.31.0",
"httpx>=0.25.0", "httpx>=0.25.0",
@@ -43,7 +43,6 @@ dependencies = [
# Document and data handling # Document and data handling
"pypdf>=3.0.0", "pypdf>=3.0.0",
"img2pdf>=0.6.0",
"mutagen>=1.46.0", "mutagen>=1.46.0",
"cbor2>=4.0", "cbor2>=4.0",
@@ -53,7 +52,6 @@ dependencies = [
# Metadata extraction and processing # Metadata extraction and processing
"musicbrainzngs>=0.7.0", "musicbrainzngs>=0.7.0",
"beautifulsoup4>=4.12.0",
"lxml>=4.9.0", "lxml>=4.9.0",
# Advanced searching and libraries # Advanced searching and libraries
@@ -4,14 +4,13 @@ prompt-toolkit>=3.0.0
textual>=0.30.0 textual>=0.30.0
# Media processing and downloading # Media processing and downloading
yt-dlp>=2023.11.0 yt-dlp[default]>=2023.11.0
requests>=2.31.0 requests>=2.31.0
httpx>=0.25.0 httpx>=0.25.0
ffmpeg-python>=0.2.0 ffmpeg-python>=0.2.0
# Document and data handling # Document and data handling
pypdf>=3.0.0 pypdf>=3.0.0
img2pdf>=0.6.0
mutagen>=1.46.0 mutagen>=1.46.0
cbor2>=4.0 cbor2>=4.0
@@ -21,7 +20,6 @@ python-bidi>=0.4.2
# Metadata extraction and processing # Metadata extraction and processing
musicbrainzngs>=0.7.0 musicbrainzngs>=0.7.0
beautifulsoup4>=4.12.0
lxml>=4.9.0 lxml>=4.9.0
# Advanced searching and libraries # Advanced searching and libraries
@@ -1,336 +0,0 @@
import requests
import random, string
from concurrent import futures
from tqdm import tqdm
import time
from datetime import datetime
import argparse
import os
import sys
import shutil
import json
import re
import base64
import hashlib
from Crypto.Cipher import AES
from Crypto.Util import Counter
def display_error(response, message):
print(message)
print(response)
print(response.text)
exit()
def get_book_infos(session, url):
r = session.get(url).text
infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
response = session.get(infos_url)
data = response.json()['data']
title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
title = ''.join( c for c in title if c not in '<>:"/\\|?*' ) # Filter forbidden chars in directory names (Windows & Linux)
title = title[:150] # Trim the title to avoid long file names
metadata = data['metadata']
links = []
for item in data['brOptions']['data']:
for page in item:
links.append(page['uri'])
if len(links) > 1:
print(f"[+] Found {len(links)} pages")
return title, links, metadata
else:
print(f"[-] Error while getting image links")
exit()
def login(email, password):
session = requests.Session()
response = session.get("https://archive.org/services/account/login/")
login_data = response.json()
if not login_data['success']:
display_error(response, "[-] Error while getting login token:")
login_token = login_data["value"]["token"]
headers = {"Content-Type": "application/x-www-form-urlencoded"}
data = {"username":email, "password":password, "t": login_token}
response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
try:
response_json = response.json()
except:
display_error(response, "[-] Error while login:")
if response_json["success"] == False:
if response_json["value"] == "bad_login":
print("[-] Invalid credentials!")
exit()
display_error(response, "[-] Error while login:")
else:
print("[+] Successful login")
return session
def loan(session, book_id, verbose=True):
data = {
"action": "grant_access",
"identifier": book_id
}
response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
data['action'] = "browse_book"
response = session.post("https://archive.org/services/loans/loan/", data=data)
if response.status_code == 400 :
try:
if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
print("This book doesn't need to be borrowed")
return session
else :
display_error(response, "Something went wrong when trying to borrow the book.")
except: # The response is not in JSON format
display_error(response, "The book cannot be borrowed")
data['action'] = "create_token"
response = session.post("https://archive.org/services/loans/loan/", data=data)
if "token" in response.text:
if verbose:
print("[+] Successful loan")
return session
else:
display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
def return_loan(session, book_id):
data = {
"action": "return_loan",
"identifier": book_id
}
response = session.post("https://archive.org/services/loans/loan/", data=data)
if response.status_code == 200 and response.json()["success"]:
print("[+] Book returned")
else:
display_error(response, "Something went wrong when trying to return the book")
def image_name(pages, page, directory):
return f"{directory}/{(len(str(pages)) - len(str(page))) * '0'}{page}.jpg"
def deobfuscate_image(image_data, link, obf_header):
"""
@Author: https://github.com/justimm
Decrypts the first 1024 bytes of image_data using AES-CTR.
The obfuscation_header is expected in the form "1|<base64encoded_counter>"
where the base64-decoded counter is 16 bytes.
We derive the AES key by taking the SHA-1 digest of the image URL (with protocol/host removed)
and using the first 16 bytes.
For AES-CTR, we use a 16-byte counter block. The first 8 bytes are used as a fixed prefix,
and the remaining 8 bytes (interpreted as a big-endian integer) are used as the initial counter value.
"""
try:
version, counter_b64 = obf_header.split('|')
except Exception as e:
raise ValueError("Invalid X-Obfuscate header format") from e
if version != '1':
raise ValueError("Unsupported obfuscation version: " + version)
# Derive AES key: replace protocol/host in link with '/'
aesKey = re.sub(r"^https?:\/\/.*?\/", "/", link)
sha1_digest = hashlib.sha1(aesKey.encode('utf-8')).digest()
key = sha1_digest[:16]
# Decode the counter (should be 16 bytes)
counter_bytes = base64.b64decode(counter_b64)
if len(counter_bytes) != 16:
raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_bytes)}")
prefix = counter_bytes[:8]
initial_value = int.from_bytes(counter_bytes[8:], byteorder='big')
# Create AES-CTR cipher with a 64-bit counter length.
ctr = Counter.new(64, prefix=prefix, initial_value=initial_value, little_endian=False)
cipher = AES.new(key, AES.MODE_CTR, counter=ctr)
decrypted_part = cipher.decrypt(image_data[:1024])
new_data = decrypted_part + image_data[1024:]
return new_data
def download_one_image(session, link, i, directory, book_id, pages):
headers = {
"Referer": "https://archive.org/",
"Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Dest": "image",
}
retry = True
response = None
while retry:
try:
response = session.get(link, headers=headers)
if response.status_code == 403:
session = loan(session, book_id, verbose=False)
raise Exception("Borrow again")
elif response.status_code == 200:
retry = False
except:
time.sleep(1) # Wait 1 second before retrying
image = image_name(pages, i, directory)
obf_header = response.headers.get("X-Obfuscate")
image_content = None
if obf_header:
try:
image_content = deobfuscate_image(response.content, link, obf_header)
except Exception as e:
print(f"[ERROR] Deobfuscation failed: {e}")
return
else:
image_content = response.content
with open(image, "wb") as f:
f.write(image_content)
def download(session, n_threads, directory, links, scale, book_id):
print("Downloading pages...")
links = [f"{link}&rotate=0&scale={scale}" for link in links]
pages = len(links)
tasks = []
with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
for link in links:
i = links.index(link)
tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
for task in tqdm(futures.as_completed(tasks), total=len(tasks)):
pass
images = [image_name(pages, i, directory) for i in range(len(links))]
return images
def make_pdf(pdf, title, directory):
file = title+".pdf"
# Handle the case where multiple books with the same name are downloaded
i = 1
while os.path.isfile(os.path.join(directory, file)):
file = f"{title}({i}).pdf"
i += 1
with open(os.path.join(directory, file),"wb") as f:
f.write(pdf)
print(f"[+] PDF saved as \"{file}\"")
if __name__ == "__main__":
my_parser = argparse.ArgumentParser()
my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')
if len(sys.argv) == 1:
my_parser.print_help(sys.stderr)
sys.exit(1)
args = my_parser.parse_args()
if args.url is None and args.file is None:
my_parser.error("At least one of --url and --file required")
email = args.email
password = args.password
scale = args.resolution
n_threads = args.threads
d = args.dir
    if d is None:
        d = os.getcwd()
    elif not os.path.isdir(d):
        print("Output directory does not exist!")
        sys.exit(1)
if args.url is not None:
urls = args.url
else:
if os.path.exists(args.file):
with open(args.file) as f:
urls = f.read().strip().split("\n")
else:
print(f"{args.file} does not exist!")
exit()
# Check the urls format
for url in urls:
if not url.startswith("https://archive.org/details/"):
print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
exit()
print(f"{len(urls)} Book(s) to download")
session = login(email, password)
for url in urls:
book_id = list(filter(None, url.split("/")))[3]
print("="*40)
print(f"Current book: https://archive.org/details/{book_id}")
session = loan(session, book_id)
title, links, metadata = get_book_infos(session, url)
directory = os.path.join(d, title)
# Handle the case where multiple books with the same name are downloaded
i = 1
_directory = directory
while os.path.isdir(directory):
directory = f"{_directory}({i})"
i += 1
os.makedirs(directory)
if args.meta:
print("Writing metadata.json...")
with open(f"{directory}/metadata.json",'w') as f:
json.dump(metadata,f)
images = download(session, n_threads, directory, links, scale, book_id)
if not args.jpg: # Create pdf with images and remove the images folder
import img2pdf
            # prepare PDF metadata (archive metadata is sometimes missing)
            pdfmeta = {}
            # ensure metadata values are strings
for key in ["title", "creator", "associated-names"]:
if key in metadata:
if isinstance(metadata[key], str):
pass
elif isinstance(metadata[key], list):
metadata[key] = "; ".join(metadata[key])
                    else:
                        raise TypeError(f"Unsupported metadata type for key {key!r}")
# title
if 'title' in metadata:
pdfmeta['title'] = metadata['title']
# author
if 'creator' in metadata and 'associated-names' in metadata:
pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
elif 'creator' in metadata:
pdfmeta['author'] = metadata['creator']
elif 'associated-names' in metadata:
pdfmeta['author'] = metadata['associated-names']
# date
if 'date' in metadata:
try:
pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
                except (TypeError, ValueError):
                    pass  # Missing or unparseable date: leave creationdate unset
# keywords
pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]
pdf = img2pdf.convert(images, **pdfmeta)
            make_pdf(pdf, title, args.dir if args.dir is not None else "")
try:
shutil.rmtree(directory)
            except OSError as e:
                print(f"Error: {e.filename} - {e.strerror}.")
return_loan(session, book_id)

11
tool/__init__.py Normal file
View File

@@ -0,0 +1,11 @@
"""Tool helpers.
This package contains wrappers around external tools (e.g. yt-dlp) so cmdlets can share
common defaults (cookies, timeouts, format selectors) and users can override them via
`config.conf`.
"""
from .ytdlp import YtDlpTool, YtDlpDefaults
from .playwright import PlaywrightTool, PlaywrightDefaults
__all__ = ["YtDlpTool", "YtDlpDefaults", "PlaywrightTool", "PlaywrightDefaults"]
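# A minimal usage sketch, assuming config.conf has already been parsed into a dict
# (the keys shown are illustrative):
#
#   from tool import YtDlpTool, PlaywrightTool
#
#   cfg = {"ytdlp": {"video_format": "bestvideo+bestaudio/best"},
#          "playwright": {"headless": True}}
#   ytdlp = YtDlpTool(cfg)
#   browser = PlaywrightTool(cfg)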

203
tool/playwright.py Normal file
View File

@@ -0,0 +1,203 @@
from __future__ import annotations
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Dict, Iterator, Optional
from SYS.logger import debug
try:
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
HAS_PLAYWRIGHT = True
_PLAYWRIGHT_IMPORT_ERROR: Optional[Exception] = None
except Exception as exc: # pragma: no cover
HAS_PLAYWRIGHT = False
_PLAYWRIGHT_IMPORT_ERROR = exc
PlaywrightTimeoutError = TimeoutError # type: ignore
sync_playwright = None # type: ignore
# Re-export for consumers (e.g. cmdlets catching navigation timeouts)
__all__ = ["HAS_PLAYWRIGHT", "PlaywrightTimeoutError", "PlaywrightTool", "PlaywrightDefaults"]
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
for key in path:
if not isinstance(cur, dict):
return None
cur = cur.get(key)
return cur
@dataclass(slots=True)
class PlaywrightDefaults:
browser: str = "chromium" # chromium|firefox|webkit
headless: bool = True
user_agent: str = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
)
viewport_width: int = 1280
viewport_height: int = 1200
navigation_timeout_ms: int = 90_000
ignore_https_errors: bool = True
class PlaywrightTool:
"""Small wrapper to standardize Playwright defaults and lifecycle.
This is meant to keep cmdlets/providers from duplicating:
- sync_playwright start/stop
- browser launch/context creation
- user-agent/viewport defaults
Config overrides (top-level keys):
- playwright.browser="chromium"
- playwright.headless=true
- playwright.user_agent="..."
- playwright.viewport_width=1280
- playwright.viewport_height=1200
- playwright.navigation_timeout_ms=90000
- playwright.ignore_https_errors=true
"""
def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
self._config: Dict[str, Any] = dict(config or {})
self.defaults = self._load_defaults()
def _load_defaults(self) -> PlaywrightDefaults:
cfg = self._config
tool_block = _get_nested(cfg, "tool", "playwright")
if not isinstance(tool_block, dict):
tool_block = {}
pw_block = cfg.get("playwright") if isinstance(cfg.get("playwright"), dict) else {}
if not isinstance(pw_block, dict):
pw_block = {}
def _get(name: str, fallback: Any) -> Any:
val = tool_block.get(name)
if val is None:
val = pw_block.get(name)
if val is None:
val = cfg.get(f"playwright_{name}")
if val is None:
val = _get_nested(cfg, "playwright", name)
return fallback if val is None else val
        # With slots=True the dataclass defaults are not plain class attributes,
        # so read fallbacks from a default-constructed instance.
        base = PlaywrightDefaults()
        browser = str(_get("browser", base.browser)).strip().lower() or "chromium"
        if browser not in {"chromium", "firefox", "webkit"}:
            browser = "chromium"
        def _bool(raw: Any) -> bool:
            # config.conf values may arrive as strings ("true"/"false")
            if isinstance(raw, str):
                return raw.strip().lower() not in {"", "0", "false", "no", "off"}
            return bool(raw)
        headless = _bool(_get("headless", base.headless))
        ua = str(_get("user_agent", base.user_agent))
        def _int(name: str, fallback: int) -> int:
            raw = _get(name, fallback)
            try:
                return int(raw)
            except Exception:
                return fallback
        vw = _int("viewport_width", base.viewport_width)
        vh = _int("viewport_height", base.viewport_height)
        nav_timeout = _int("navigation_timeout_ms", base.navigation_timeout_ms)
        ignore_https = _bool(_get("ignore_https_errors", base.ignore_https_errors))
return PlaywrightDefaults(
browser=browser,
headless=headless,
user_agent=ua,
viewport_width=vw,
viewport_height=vh,
navigation_timeout_ms=nav_timeout,
ignore_https_errors=ignore_https,
)
def require(self) -> None:
if HAS_PLAYWRIGHT and sync_playwright is not None:
return
detail = str(_PLAYWRIGHT_IMPORT_ERROR or "playwright is not installed")
raise RuntimeError(
"playwright is required; install with: pip install playwright; then: playwright install\n"
f"detail: {detail}"
)
    @contextmanager
    def open_page(
self,
*,
headless: Optional[bool] = None,
user_agent: Optional[str] = None,
viewport_width: Optional[int] = None,
viewport_height: Optional[int] = None,
ignore_https_errors: Optional[bool] = None,
) -> Iterator[Any]:
"""Context manager yielding a Playwright page with sane defaults."""
self.require()
h = self.defaults.headless if headless is None else bool(headless)
ua = self.defaults.user_agent if user_agent is None else str(user_agent)
vw = self.defaults.viewport_width if viewport_width is None else int(viewport_width)
vh = self.defaults.viewport_height if viewport_height is None else int(viewport_height)
ihe = self.defaults.ignore_https_errors if ignore_https_errors is None else bool(ignore_https_errors)
pw = None
browser = None
context = None
try:
assert sync_playwright is not None
pw = sync_playwright().start()
browser_type = getattr(pw, self.defaults.browser, None)
if browser_type is None:
browser_type = pw.chromium
browser = browser_type.launch(
headless=h,
args=["--disable-blink-features=AutomationControlled"],
)
context = browser.new_context(
user_agent=ua,
viewport={"width": vw, "height": vh},
ignore_https_errors=ihe,
)
page = context.new_page()
yield page
finally:
try:
if context is not None:
context.close()
except Exception:
pass
try:
if browser is not None:
browser.close()
except Exception:
pass
try:
if pw is not None:
pw.stop()
except Exception:
pass
    def goto(self, page: Any, url: str) -> None:
        """Navigate with the configured timeout, waiting for DOMContentLoaded."""
        page.goto(url, timeout=int(self.defaults.navigation_timeout_ms), wait_until="domcontentloaded")
def debug_dump(self) -> None:
try:
debug(
f"[playwright] browser={self.defaults.browser} headless={self.defaults.headless} "
f"viewport={self.defaults.viewport_width}x{self.defaults.viewport_height} "
f"nav_timeout_ms={self.defaults.navigation_timeout_ms}"
)
except Exception:
pass
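# A minimal usage sketch for PlaywrightTool (the config dict and URL are
# illustrative). open_page is a context manager, so the context, browser, and
# driver are cleaned up even if navigation raises:
#
#   tool = PlaywrightTool({"playwright": {"headless": True}})
#   tool.debug_dump()
#   with tool.open_page(viewport_width=1920, viewport_height=1080) as page:
#       tool.goto(page, "https://example.com")
#       html = page.content()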

195
tool/ytdlp.py Normal file
View File

@@ -0,0 +1,195 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence
from SYS.logger import debug
def _get_nested(config: Dict[str, Any], *path: str) -> Any:
cur: Any = config
for key in path:
if not isinstance(cur, dict):
return None
cur = cur.get(key)
return cur
def _parse_csv_list(value: Any) -> Optional[List[str]]:
if value is None:
return None
if isinstance(value, list):
out: List[str] = []
for item in value:
s = str(item).strip()
if s:
out.append(s)
return out or None
s = str(value).strip()
if not s:
return None
# allow either JSON-ish list strings or simple comma-separated values
if s.startswith("[") and s.endswith("]"):
s = s[1:-1]
parts = [p.strip() for p in s.split(",")]
parts = [p for p in parts if p]
return parts or None
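# For example, both the JSON-ish and plain CSV spellings parse the same way:
#   _parse_csv_list('[res:1080, res:720]') -> ["res:1080", "res:720"]
#   _parse_csv_list("res:1080,res:720")    -> ["res:1080", "res:720"]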
@dataclass(slots=True)
class YtDlpDefaults:
"""User-tunable defaults for yt-dlp behavior.
Recommended config.conf keys (top-level dotted keys):
- ytdlp.video_format="bestvideo+bestaudio/best"
- ytdlp.audio_format="251/140/bestaudio"
- ytdlp.format_sort="res:2160,res:1440,res:1080,res:720,res"
Cookies:
- cookies="C:\\path\\cookies.txt" (already supported by config.resolve_cookies_path)
"""
video_format: str = "bestvideo+bestaudio/best"
audio_format: str = "251/140/bestaudio"
format_sort: Optional[List[str]] = None
class YtDlpTool:
"""Centralizes yt-dlp defaults and translation helpers.
This is intentionally small and dependency-light so cmdlets can use it without
forcing a full refactor.
"""
def __init__(self, config: Optional[Dict[str, Any]] = None, *, script_dir: Optional[Path] = None) -> None:
self._config: Dict[str, Any] = dict(config or {})
# `resolve_cookies_path` expects the app root so it can fall back to ./cookies.txt.
# This file lives under ./tool/, so default to the parent directory.
self._script_dir = script_dir or Path(__file__).resolve().parent.parent
self.defaults = self._load_defaults()
self._cookiefile: Optional[Path] = self._init_cookiefile()
def _init_cookiefile(self) -> Optional[Path]:
"""Resolve cookies once at tool init (yt-dlp is the primary consumer)."""
try:
from config import resolve_cookies_path
resolved = resolve_cookies_path(self._config, script_dir=self._script_dir)
if resolved is not None and resolved.is_file():
return resolved
except Exception:
pass
return None
def _load_defaults(self) -> YtDlpDefaults:
cfg = self._config
tool_block = _get_nested(cfg, "tool", "ytdlp")
if not isinstance(tool_block, dict):
tool_block = {}
ytdlp_block = cfg.get("ytdlp") if isinstance(cfg.get("ytdlp"), dict) else {}
if not isinstance(ytdlp_block, dict):
ytdlp_block = {}
# Accept both nested and flat styles.
video_format = (
tool_block.get("video_format")
or tool_block.get("format")
or ytdlp_block.get("video_format")
or ytdlp_block.get("video")
or ytdlp_block.get("format_video")
or cfg.get("ytdlp_video_format")
)
audio_format = (
tool_block.get("audio_format")
or ytdlp_block.get("audio_format")
or ytdlp_block.get("audio")
or ytdlp_block.get("format_audio")
or cfg.get("ytdlp_audio_format")
)
# Also accept dotted keys written as nested dicts: ytdlp.format.video, ytdlp.format.audio
nested_video = _get_nested(cfg, "ytdlp", "format", "video")
nested_audio = _get_nested(cfg, "ytdlp", "format", "audio")
fmt_sort_val = (
tool_block.get("format_sort")
or ytdlp_block.get("format_sort")
or ytdlp_block.get("formatSort")
or cfg.get("ytdlp_format_sort")
or _get_nested(cfg, "ytdlp", "format", "sort")
)
fmt_sort = _parse_csv_list(fmt_sort_val)
        # With slots=True the dataclass defaults are not plain class attributes,
        # so read fallbacks from a default-constructed instance.
        base = YtDlpDefaults()
        return YtDlpDefaults(
            video_format=str(nested_video or video_format or base.video_format),
            audio_format=str(nested_audio or audio_format or base.audio_format),
            format_sort=fmt_sort,
        )
def resolve_cookiefile(self) -> Optional[Path]:
return self._cookiefile
def default_format(self, mode: str) -> str:
m = str(mode or "").lower().strip()
if m == "audio":
return self.defaults.audio_format
return self.defaults.video_format
def build_yt_dlp_cli_args(
self,
*,
url: str,
output_dir: Optional[Path] = None,
ytdl_format: Optional[str] = None,
playlist_items: Optional[str] = None,
no_playlist: bool = False,
quiet: bool = True,
extra_args: Optional[Sequence[str]] = None,
) -> List[str]:
"""Build a yt-dlp command line (argv list).
This is primarily for debug output or subprocess execution.
"""
argv: List[str] = ["yt-dlp"]
if quiet:
argv.extend(["--quiet", "--no-warnings"])
argv.append("--no-progress")
cookiefile = self.resolve_cookiefile()
if cookiefile is not None:
argv.extend(["--cookies", str(cookiefile)])
if no_playlist:
argv.append("--no-playlist")
if playlist_items:
argv.extend(["--playlist-items", str(playlist_items)])
fmt = (ytdl_format or "").strip()
if fmt:
# Use long form to avoid confusion with app-level flags.
argv.extend(["--format", fmt])
if self.defaults.format_sort:
for sort_key in self.defaults.format_sort:
argv.extend(["-S", sort_key])
if output_dir is not None:
outtmpl = str((output_dir / "%(title)s.%(ext)s").resolve())
argv.extend(["-o", outtmpl])
if extra_args:
argv.extend([str(a) for a in extra_args if str(a).strip()])
argv.append(str(url))
return argv
def debug_print_cli(self, argv: Sequence[str]) -> None:
try:
debug("yt-dlp argv: " + " ".join(str(a) for a in argv))
except Exception:
pass
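# A minimal usage sketch for YtDlpTool (config keys follow the YtDlpDefaults
# docstring; the URL and output directory are illustrative):
#
#   from pathlib import Path
#
#   ytdlp = YtDlpTool({"ytdlp": {"audio_format": "251/140/bestaudio"}})
#   argv = ytdlp.build_yt_dlp_cli_args(
#       url="https://www.youtube.com/watch?v=VIDEO_ID",
#       output_dir=Path("downloads"),
#       ytdl_format=ytdlp.default_format("audio"),
#       no_playlist=True,
#   )
#   ytdlp.debug_print_cli(argv)  # then e.g. subprocess.run(argv, check=True)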