hh
This commit is contained in:
@@ -202,6 +202,8 @@ class API_folder_store:
|
||||
|
||||
DB_NAME = "medios-macina.db"
|
||||
SCHEMA_VERSION = 4
|
||||
# Global lock across all instances to prevent 'database is locked' during concurrent operations.
|
||||
_shared_db_lock = RLock()
|
||||
|
||||
def __init__(self, library_root: Path):
|
||||
"""Initialize the database at the library root.
|
||||
@@ -212,10 +214,8 @@ class API_folder_store:
|
||||
self.library_root = expand_path(library_root).resolve()
|
||||
self.db_path = self.library_root / self.DB_NAME
|
||||
self.connection: Optional[sqlite3.Connection] = None
|
||||
# sqlite3 connections are not safe for concurrent use across threads.
|
||||
# We intentionally keep a single connection per API_folder_store instance,
|
||||
# so we must serialize all DB operations on that connection.
|
||||
self._db_lock = RLock()
|
||||
# Use the shared lock
|
||||
self._db_lock = self._shared_db_lock
|
||||
self._init_db()
|
||||
|
||||
def _normalize_input_path(self, file_path: Path) -> Path:
|
||||
|
||||
@@ -1037,7 +1037,7 @@ class Libgen(Provider):
|
||||
if md is not None:
|
||||
md["metadata_enriched_from"] = enriched_source
|
||||
|
||||
if extracted_title and ((not title)
|
||||
if extracted_title and ((not title)
|
||||
or title.startswith("http")):
|
||||
title = extracted_title
|
||||
except Exception as e:
|
||||
@@ -1069,8 +1069,6 @@ class Libgen(Provider):
|
||||
return self.download(sr, output_dir)
|
||||
except Exception:
|
||||
return None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
LogFn = Optional[Callable[[str], None]]
|
||||
@@ -1656,9 +1654,10 @@ def _resolve_download_url(
|
||||
session: requests.Session,
|
||||
url: str,
|
||||
log_info: LogFn = None,
|
||||
) -> Optional[str]:
|
||||
) -> Tuple[Optional[str], Optional[str]]:
|
||||
"""Resolve the final download URL by following the LibGen chain."""
|
||||
current_url = url
|
||||
referer = None
|
||||
visited = set()
|
||||
|
||||
def _resolve_html_links_regex(base_url: str, html: str) -> Optional[str]:
|
||||
@@ -1670,57 +1669,44 @@ def _resolve_download_url(
|
||||
if not html:
|
||||
return None
|
||||
|
||||
# LibGen chain helpers (for environments without lxml).
|
||||
# Typical chain:
|
||||
# edition.php?id=... -> file.php?id=...
|
||||
# file.php?id=... -> ads.php?md5=... (or get.php?md5=...)
|
||||
# ads.php?md5=... -> get.php?md5=...
|
||||
# get.php?md5=... -> file response
|
||||
|
||||
# Use a more relaxed regex for href that handles spaces and missing quotes.
|
||||
# Format: href [space] = [space] [quote] link [quote]
|
||||
def _find_link(pattern: str) -> Optional[str]:
|
||||
# This regex allows:
|
||||
# href="link"
|
||||
# href='link'
|
||||
# href=link
|
||||
# href = "link"
|
||||
regex = r"href\s*=\s*['\"]?(" + pattern + r")['\"]?"
|
||||
match = re.search(regex, html, flags=re.IGNORECASE)
|
||||
if match:
|
||||
u = str(match.group(1) or "").strip()
|
||||
# Strip trailing quotes if the regex over-captured (e.g. unquoted link followed by space/quote)
|
||||
u = u.split("'")[0].split('"')[0].split(">")[0].split(" ")[0].strip()
|
||||
if u and not u.lower().startswith("javascript:"):
|
||||
return urljoin(base_url, u)
|
||||
return None
|
||||
|
||||
# Handle edition -> file links.
|
||||
found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
|
||||
# Priority patterns for LibGen mirrors (e.g., library.lol, libgen.li)
|
||||
# 1. library.lol "GET" link or direct /main/
|
||||
found = _find_link(r'[^"\' >]*/main/\d+/[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# Handle series -> edition links.
|
||||
found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
|
||||
found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# Prefer explicit get.php md5 links (most common successful chain).
|
||||
# 2. get.php md5 links
|
||||
found = _find_link(r'[^"\' >]*get\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# Next: library.lol main links.
|
||||
found = _find_link(r'[^"\' >]*library\.lol[^"\' >]*')
|
||||
# 3. ads.php md5 links
|
||||
found = _find_link(r'[^"\' >]*ads\.php\?md5=[a-fA-F0-9]{32}[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# Finally: any direct file extension link.
|
||||
# 4. file.php id links
|
||||
found = _find_link(r'[^"\' >]*file\.php\?id=\d+[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# 5. edition.php id links
|
||||
found = _find_link(r'[^"\' >]*edition\.php\?id=\d+[^"\' >]*')
|
||||
if found:
|
||||
return found
|
||||
|
||||
# 6. Direct file extensions
|
||||
found = _find_link(r'[^"\' >]+\.(?:pdf|epub|mobi|djvu|azw3|cbz|cbr)(?:\?[^"\' >]*)?')
|
||||
if found:
|
||||
return found
|
||||
@@ -1744,30 +1730,33 @@ def _resolve_download_url(
|
||||
|
||||
_call(log_info, f"[resolve] Loop {idx+1} Checking: {current_url}")
|
||||
|
||||
if current_url.lower().endswith((".pdf",
|
||||
".epub",
|
||||
".mobi",
|
||||
".djvu",
|
||||
".azw3",
|
||||
".cbz",
|
||||
".cbr")):
|
||||
if current_url.lower().split("?")[0].split("#")[0].endswith(
|
||||
(".pdf", ".epub", ".mobi", ".djvu", ".azw3", ".cbz", ".cbr")
|
||||
):
|
||||
_call(log_info, f"[resolve] URL looks like direct file: {current_url}")
|
||||
return current_url
|
||||
return current_url, referer
|
||||
|
||||
try:
|
||||
with session.get(current_url, stream=True, timeout=30) as resp:
|
||||
# Pass Referer to stay in the mirror's good graces
|
||||
headers = {}
|
||||
if referer:
|
||||
headers["Referer"] = referer
|
||||
|
||||
with session.get(current_url, stream=True, timeout=30, headers=headers) as resp:
|
||||
resp.raise_for_status()
|
||||
ct = str(resp.headers.get("Content-Type", "")).lower()
|
||||
|
||||
if "text/html" not in ct:
|
||||
_call(log_info, f"[resolve] URL returned non-HTML ({ct}): {current_url}")
|
||||
return current_url
|
||||
return current_url, referer
|
||||
|
||||
# Only read if it's small enough to be a landing page
|
||||
content = resp.text
|
||||
except Exception as e:
|
||||
_call(log_info, f"[resolve] Failed to fetch {current_url}: {e}")
|
||||
return None
|
||||
return None, None
|
||||
|
||||
next_url = None
|
||||
doc = None
|
||||
if lxml_html is not None:
|
||||
try:
|
||||
@@ -1775,58 +1764,58 @@ def _resolve_download_url(
|
||||
except Exception:
|
||||
doc = None
|
||||
|
||||
if doc is None:
|
||||
if doc is not None:
|
||||
# Try to find common mirror links via XPath
|
||||
get_href = _find_href_by_text(doc, r"^GET$")
|
||||
if get_href:
|
||||
next_url = urljoin(current_url, get_href)
|
||||
|
||||
if not next_url:
|
||||
# Mirror-specific patterns
|
||||
if "series.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
|
||||
if hrefs:
|
||||
next_url = urljoin(current_url, str(hrefs[0] or ""))
|
||||
elif "edition.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
|
||||
if hrefs:
|
||||
next_url = urljoin(current_url, str(hrefs[0] or ""))
|
||||
elif "file.php" in current_url:
|
||||
libgen_href = None
|
||||
for a in doc.xpath("//a[@href]"):
|
||||
if str(a.get("title") or "").strip().lower() == "libgen":
|
||||
libgen_href = str(a.get("href") or "").strip()
|
||||
break
|
||||
if not libgen_href:
|
||||
libgen_href = _find_href_by_text(doc, r"Libgen")
|
||||
if libgen_href:
|
||||
next_url = urljoin(current_url, libgen_href)
|
||||
elif "ads.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
|
||||
if hrefs:
|
||||
next_url = urljoin(current_url, str(hrefs[0] or ""))
|
||||
|
||||
if not next_url:
|
||||
# General provider links
|
||||
for text in ["Cloudflare", "IPFS.io", "Infura"]:
|
||||
href = _find_href_by_text(doc, re.escape(text))
|
||||
if href:
|
||||
next_url = urljoin(current_url, href)
|
||||
break
|
||||
|
||||
# Fallback to regex if XPath failed or lxml is missing
|
||||
if not next_url:
|
||||
next_url = _resolve_html_links_regex(current_url, content)
|
||||
if next_url:
|
||||
current_url = next_url
|
||||
continue
|
||||
_call(
|
||||
log_info,
|
||||
"[resolve] lxml not available and regex resolver found no links"
|
||||
)
|
||||
return None
|
||||
|
||||
get_href = _find_href_by_text(doc, r"^GET$")
|
||||
if get_href:
|
||||
return urljoin(current_url, get_href)
|
||||
|
||||
if "series.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'edition.php')]/@href")
|
||||
if hrefs:
|
||||
current_url = urljoin(current_url, str(hrefs[0] or ""))
|
||||
continue
|
||||
|
||||
if "edition.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'file.php')]/@href")
|
||||
if hrefs:
|
||||
current_url = urljoin(current_url, str(hrefs[0] or ""))
|
||||
continue
|
||||
|
||||
if "file.php" in current_url:
|
||||
libgen_href = None
|
||||
for a in doc.xpath("//a[@href]"):
|
||||
if str(a.get("title") or "").strip().lower() == "libgen":
|
||||
libgen_href = str(a.get("href") or "").strip()
|
||||
break
|
||||
if not libgen_href:
|
||||
libgen_href = _find_href_by_text(doc, r"Libgen")
|
||||
if libgen_href:
|
||||
current_url = urljoin(current_url, libgen_href)
|
||||
continue
|
||||
|
||||
if "ads.php" in current_url:
|
||||
hrefs = doc.xpath("//a[contains(@href,'get.php')]/@href")
|
||||
if hrefs:
|
||||
return urljoin(current_url, str(hrefs[0] or ""))
|
||||
|
||||
for text in ["Cloudflare", "IPFS.io", "Infura"]:
|
||||
href = _find_href_by_text(doc, re.escape(text))
|
||||
if href:
|
||||
return urljoin(current_url, href)
|
||||
if next_url:
|
||||
referer = current_url
|
||||
current_url = next_url
|
||||
continue
|
||||
|
||||
_call(log_info, "[resolve] No further links found in content")
|
||||
break
|
||||
|
||||
return None
|
||||
return None, None
|
||||
|
||||
|
||||
def _guess_filename_extension(download_url: str,
|
||||
@@ -1931,7 +1920,7 @@ def download_from_mirror(
|
||||
try:
|
||||
_call(log_info, f"[download] Resolving download link from: {mirror_url}")
|
||||
|
||||
download_url = _resolve_download_url(session, mirror_url, log_info)
|
||||
download_url, referer = _resolve_download_url(session, mirror_url, log_info)
|
||||
|
||||
if not download_url:
|
||||
_call(log_error, "[download] Could not find direct download link")
|
||||
@@ -1944,7 +1933,11 @@ def download_from_mirror(
|
||||
headers: Dict[str,
|
||||
str] = {}
|
||||
|
||||
with session.get(download_url, stream=True, timeout=60) as r:
|
||||
req_headers = {}
|
||||
if referer:
|
||||
req_headers["Referer"] = referer
|
||||
|
||||
with session.get(download_url, stream=True, timeout=60, headers=req_headers) as r:
|
||||
r.raise_for_status()
|
||||
headers = dict(r.headers)
|
||||
|
||||
|
||||
@@ -288,14 +288,20 @@ class ZeroTier(Store):
|
||||
url += f"{sep}api_key={self._api_key}"
|
||||
return url
|
||||
|
||||
def download_to_temp(self, file_hash: str, temp_root: Optional[Path] = None, suffix: Optional[str] = None) -> Optional[Path]:
|
||||
def download_to_temp(
|
||||
self,
|
||||
file_hash: str,
|
||||
temp_root: Optional[Path] = None,
|
||||
suffix: Optional[str] = None,
|
||||
progress_callback: Optional[Callable[[int, int], None]] = None,
|
||||
) -> Optional[Path]:
|
||||
"""Download a file from the remote peer to a local temporary file."""
|
||||
import os
|
||||
import httpx
|
||||
import tempfile
|
||||
|
||||
if self._service == "hydrus":
|
||||
return None
|
||||
return None
|
||||
|
||||
url = self.get_file(file_hash)
|
||||
if not url or not isinstance(url, str) or not url.startswith("http"):
|
||||
@@ -314,21 +320,32 @@ class ZeroTier(Store):
|
||||
fd, tmp_path = tempfile.mkstemp(dir=str(temp_root), suffix=suffix)
|
||||
else:
|
||||
fd, tmp_path = tempfile.mkstemp(suffix=suffix)
|
||||
|
||||
os_fd = os.fdopen(fd, 'wb')
|
||||
|
||||
|
||||
os_fd = os.fdopen(fd, "wb")
|
||||
|
||||
headers = {}
|
||||
if self._api_key:
|
||||
headers["X-API-Key"] = self._api_key
|
||||
|
||||
downloaded = 0
|
||||
total = 0
|
||||
with httpx.stream("GET", url, headers=headers, timeout=self._timeout) as r:
|
||||
r.raise_for_status()
|
||||
for chunk in r.iter_bytes():
|
||||
os_fd.write(chunk)
|
||||
|
||||
total = int(r.headers.get("Content-Length", 0))
|
||||
# Use a larger chunk size for ZeroTier/P2P efficiency
|
||||
for chunk in r.iter_bytes(chunk_size=128 * 1024):
|
||||
if chunk:
|
||||
os_fd.write(chunk)
|
||||
downloaded += len(chunk)
|
||||
if progress_callback:
|
||||
try:
|
||||
progress_callback(downloaded, total)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
os_fd.close()
|
||||
return Path(tmp_path)
|
||||
|
||||
|
||||
except Exception as exc:
|
||||
debug(f"ZeroTier download_to_temp failed for {file_hash}: {exc}")
|
||||
return None
|
||||
|
||||
@@ -504,9 +504,15 @@ class Add_File(Cmdlet):
|
||||
# When add-file -store is the last stage, always show a final search-file table.
|
||||
# This is especially important for multi-item ingests (e.g., multi-clip downloads)
|
||||
# so the user always gets a selectable ResultTable.
|
||||
live_progress = None
|
||||
try:
|
||||
live_progress = ctx.get_live_progress()
|
||||
except Exception:
|
||||
live_progress = None
|
||||
|
||||
want_final_search_file = (
|
||||
bool(is_last_stage) and bool(is_storage_backend_location)
|
||||
and bool(location)
|
||||
and bool(location) and bool(live_progress)
|
||||
)
|
||||
auto_search_file_after_add = False
|
||||
|
||||
@@ -994,15 +1000,27 @@ class Add_File(Cmdlet):
|
||||
suffix = metadata.get("ext")
|
||||
|
||||
tmp_dir = Path(tempfile.mkdtemp(prefix="add-file-src-"))
|
||||
|
||||
# Pass suffix to downloader if it supports it
|
||||
|
||||
# Introspect downloader to pass supported args (suffix, progress_callback)
|
||||
import inspect
|
||||
|
||||
sig = inspect.signature(downloader)
|
||||
kwargs = {"temp_root": tmp_dir}
|
||||
if "suffix" in sig.parameters:
|
||||
downloaded = downloader(str(file_hash), temp_root=tmp_dir, suffix=suffix)
|
||||
else:
|
||||
downloaded = downloader(str(file_hash), temp_root=tmp_dir)
|
||||
|
||||
kwargs["suffix"] = suffix
|
||||
|
||||
# Hook into global PipelineProgress if available
|
||||
pp = PipelineProgress.get()
|
||||
if pp and "progress_callback" in sig.parameters:
|
||||
|
||||
def _cb(done, total):
|
||||
# Show fetch progress instead of just 'resolving'
|
||||
pp.update(downloaded=done, total=total, label="peer transfer")
|
||||
|
||||
kwargs["progress_callback"] = _cb
|
||||
|
||||
downloaded = downloader(str(file_hash), **kwargs)
|
||||
|
||||
if isinstance(downloaded, Path) and downloaded.exists():
|
||||
pipe_obj.is_temp = True
|
||||
return downloaded, tmp_dir
|
||||
|
||||
@@ -229,9 +229,11 @@ class Download_File(Cmdlet):
|
||||
|
||||
except Exception as e:
|
||||
log(f"Provider {provider_name} error handling {url}: {e}", file=sys.stderr)
|
||||
# Fallthrough to direct download?
|
||||
# If a provider explicitly claimed it but failed, we'll try direct download as a last resort.
|
||||
pass
|
||||
|
||||
if not handled:
|
||||
debug(f"Provider {provider_name} matched URL but failed to download. Skipping direct fallback to avoid landing pages.")
|
||||
continue
|
||||
|
||||
# Direct Download Fallback
|
||||
result_obj = _download_direct_file(
|
||||
|
||||
Reference in New Issue
Block a user