re
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import html as html_std
|
||||
import logging
|
||||
import re
|
||||
import requests
|
||||
@@ -22,6 +23,595 @@ except ImportError:
|
||||
lxml_html = None
|
||||
|
||||
|
||||
def _strip_html_to_text(raw: str) -> str:
|
||||
s = html_std.unescape(str(raw or ""))
|
||||
s = re.sub(r"(?i)<br\s*/?>", "\n", s)
|
||||
# Help keep lists readable when they are link-heavy.
|
||||
s = re.sub(r"(?i)</a>", ", ", s)
|
||||
s = re.sub(r"<[^>]+>", " ", s)
|
||||
s = re.sub(r"\s+", " ", s)
|
||||
return s.strip()
|
||||
|
||||
|
||||
def _strip_html_to_lines(raw: str) -> List[str]:
|
||||
"""Convert a small HTML snippet to a list of meaningful text lines.
|
||||
|
||||
Unlike `_strip_html_to_text`, this preserves `<br>` as line breaks so we can
|
||||
parse LibGen ads.php tag blocks that use `<br>` separators.
|
||||
"""
|
||||
|
||||
s = html_std.unescape(str(raw or ""))
|
||||
s = re.sub(r"(?is)<script\b.*?</script>", " ", s)
|
||||
s = re.sub(r"(?is)<style\b.*?</style>", " ", s)
|
||||
s = re.sub(r"(?i)<br\s*/?>", "\n", s)
|
||||
s = re.sub(r"(?i)</p\s*>", "\n", s)
|
||||
s = re.sub(r"(?i)</tr\s*>", "\n", s)
|
||||
# Help keep link-heavy lists readable.
|
||||
s = re.sub(r"(?i)</a>", ", ", s)
|
||||
s = re.sub(r"<[^>]+>", " ", s)
|
||||
s = s.replace("\r\n", "\n").replace("\r", "\n")
|
||||
|
||||
out: List[str] = []
|
||||
for line in s.split("\n"):
|
||||
t = re.sub(r"\s+", " ", str(line or "")).strip()
|
||||
if t:
|
||||
out.append(t)
|
||||
return out
|
||||
|
||||
|
||||
def _libgen_md5_from_url(url: str) -> str:
|
||||
try:
|
||||
p = urlparse(str(url or ""))
|
||||
q = p.query or ""
|
||||
except Exception:
|
||||
q = ""
|
||||
m = re.search(r"(?:^|[&?])md5=([a-fA-F0-9]{32})(?:&|$)", q)
|
||||
return str(m.group(1)).lower() if m else ""
|
||||
|
||||
|
||||
def _libgen_ads_url_for_target(url: str) -> str:
    """Build the ``ads.php`` URL on the same host as *url* (best effort).

    The md5 is taken from the query string of *url*. An empty string is
    returned when no md5 is present, when the URL has no host part, or when
    the URL cannot be parsed. A missing scheme defaults to ``https``.
    """
    digest = _libgen_md5_from_url(url)
    if not digest:
        return ""

    try:
        parts = urlparse(str(url or ""))
        host = parts.netloc
        if host:
            proto = parts.scheme or "https"
            return f"{proto}://{host}/ads.php?md5={digest}"
        return ""
    except Exception:
        return ""
|
||||
|
||||
|
||||
def _parse_libgen_ads_tags_html(html: str) -> Dict[str, Any]:
    """Parse the metadata block embedded on a LibGen ads.php page.

    Some mirrors render all metadata as a single `<td>` with `<br>` separators:
        title: ...<br>author(s): ...<br>isbn: ...

    The `<td>` whose lines look most like ``key: value`` metadata is selected;
    if no cell qualifies, the whole page is treated as a list of lines.

    Returns a metadata dict similar to `_parse_libgen_details_html` (subset:
    ``title``, ``publisher``, ``year``, ``language``, ``authors``, ``isbn``,
    ``tags``), plus `_raw_fields` with every captured key.
    """
    page = str(html or "")
    field_words = ("title", "author", "publisher", "year", "isbn", "language", "series", "tags")

    def _metadata_score(lines: List[str]) -> int:
        # Count lines that contain a colon and mention a known field name.
        return sum(
            1
            for text in lines
            if ":" in text and any(word in text.lower() for word in field_words)
        )

    best_lines: List[str] = []
    best_score = 0
    for cell in re.findall(r"(?is)<td\b[^>]*>(.*?)</td>", page):
        cell_lines = _strip_html_to_lines(cell)
        if not cell_lines:
            continue
        cell_score = _metadata_score(cell_lines)
        if cell_score > best_score:
            best_score, best_lines = cell_score, cell_lines

    # Fallback: treat the entire page as a line list.
    if not best_lines:
        best_lines = _strip_html_to_lines(page)

    key_aliases = {
        "authors": "author",
        "author(s)": "author",
        "author(s).": "author",
        "author(s):": "author",
        "tag": "tags",
        "tags": "tags",
    }

    def _norm_key(k: str) -> str:
        canonical = re.sub(r"\s+", " ", str(k or "").strip().lower())
        return key_aliases.get(canonical, canonical)

    raw_fields: Dict[str, str] = {}
    pending_key: Optional[str] = None

    for entry in best_lines:
        line = str(entry or "").strip()
        if not line:
            continue

        key_part, colon, value_part = line.partition(":")
        if colon:
            key = _norm_key(key_part)
            value = str(value_part or "").strip()
            if value:
                raw_fields[key] = value
                pending_key = None
            else:
                # Key without inline value: the next plain line supplies it.
                pending_key = key
            continue

        # Continuation line: if the previous key had no inline value, use this.
        if pending_key:
            raw_fields[pending_key] = line
            pending_key = None

    out: Dict[str, Any] = {"_raw_fields": dict(raw_fields)}

    def _field(name: str) -> str:
        return str(raw_fields.get(name) or "").strip()

    for simple in ("title", "publisher", "year", "language"):
        text = _field(simple)
        if text:
            out[simple] = text

    authors_raw = _field("author")
    if authors_raw:
        out["authors"] = _split_listish_text(authors_raw)

    # ISBN: extract all tokens (some pages include multiple).
    isbn_raw = _field("isbn")
    if isbn_raw:
        isbns = _extract_isbns(isbn_raw)
        if isbns:
            out["isbn"] = isbns

    tags_raw = _field("tags")
    if tags_raw:
        # Keep these as freeform tags (split on commas/semicolons/pipes).
        out["tags"] = _split_listish_text(tags_raw)

    return out
|
||||
|
||||
|
||||
def _extract_anchor_texts(raw_html: str) -> List[str]:
    """Collect the visible text of every ``<a>...</a>`` element in *raw_html*.

    Empty texts are skipped and duplicates are removed case-insensitively,
    keeping the first spelling encountered and the original order.
    """
    seen_lower: set[str] = set()
    unique: List[str] = []
    for match in re.finditer(r"(?is)<a\b[^>]*>(.*?)</a>", str(raw_html or "")):
        text = _strip_html_to_text(match.group(1))
        if not text:
            continue
        label = text.strip()
        if not label:
            continue
        folded = label.lower()
        if folded not in seen_lower:
            seen_lower.add(folded)
            unique.append(label)
    return unique
|
||||
|
||||
|
||||
def _split_listish_text(value: str) -> List[str]:
|
||||
s = str(value or "").strip()
|
||||
if not s:
|
||||
return []
|
||||
parts = re.split(r"\s*(?:,|;|\|)\s*", s)
|
||||
out: List[str] = []
|
||||
for p in parts:
|
||||
p = str(p or "").strip()
|
||||
if p:
|
||||
out.append(p)
|
||||
return out
|
||||
|
||||
|
||||
def _extract_isbns(text: str) -> List[str]:
|
||||
s = str(text or "")
|
||||
candidates = re.findall(r"\b[0-9Xx][0-9Xx\-\s]{8,20}[0-9Xx]\b", s)
|
||||
out: List[str] = []
|
||||
for c in candidates:
|
||||
n = re.sub(r"[^0-9Xx]", "", c).upper()
|
||||
if len(n) not in (10, 13):
|
||||
continue
|
||||
if n not in out:
|
||||
out.append(n)
|
||||
# Also handle already-clean tokens.
|
||||
for c in re.findall(r"\b(?:97[89])?\d{9}[\dXx]\b", s):
|
||||
n = str(c).upper()
|
||||
if n not in out:
|
||||
out.append(n)
|
||||
return out
|
||||
|
||||
|
||||
def _libgen_id_from_url(url: str) -> str:
|
||||
# Handles edition.php?id=..., file.php?id=...
|
||||
m = re.search(r"(?:\?|&)id=(\d+)", str(url or ""), flags=re.IGNORECASE)
|
||||
return str(m.group(1)) if m else ""
|
||||
|
||||
|
||||
def _prefer_isbn(isbns: List[str]) -> str:
|
||||
vals = [str(x or "").strip() for x in (isbns or []) if str(x or "").strip()]
|
||||
# Prefer ISBN-13, then ISBN-10.
|
||||
for v in vals:
|
||||
if len(v) == 13:
|
||||
return v
|
||||
for v in vals:
|
||||
if len(v) == 10:
|
||||
return v
|
||||
return vals[0] if vals else ""
|
||||
|
||||
|
||||
def _enrich_book_tags_from_isbn(isbn: str, *, config: Optional[Dict[str, Any]] = None) -> Tuple[List[str], str]:
|
||||
"""Return (tags, source_name) for the given ISBN.
|
||||
|
||||
Priority:
|
||||
1) OpenLibrary API-by-ISBN scrape (fast, structured)
|
||||
2) isbnsearch.org scrape via MetadataProvider
|
||||
"""
|
||||
|
||||
isbn_clean = re.sub(r"[^0-9Xx]", "", str(isbn or "")).upper()
|
||||
if len(isbn_clean) not in (10, 13):
|
||||
return [], ""
|
||||
|
||||
# 1) OpenLibrary API lookup by ISBN (short timeout, silent failure).
|
||||
try:
|
||||
url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json"
|
||||
resp = requests.get(url, timeout=4)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
if isinstance(data, dict) and data:
|
||||
book_data = next(iter(data.values()), None)
|
||||
else:
|
||||
book_data = None
|
||||
|
||||
if isinstance(book_data, dict):
|
||||
tags: List[str] = []
|
||||
|
||||
def _add(t: str) -> None:
|
||||
s = str(t or "").strip()
|
||||
if s:
|
||||
tags.append(s)
|
||||
|
||||
if book_data.get("title"):
|
||||
_add(f"title:{book_data['title']}")
|
||||
|
||||
authors = book_data.get("authors")
|
||||
if isinstance(authors, list):
|
||||
for a in authors[:3]:
|
||||
if isinstance(a, dict) and a.get("name"):
|
||||
_add(f"author:{a['name']}")
|
||||
|
||||
if book_data.get("publish_date"):
|
||||
_add(f"publish_date:{book_data['publish_date']}")
|
||||
|
||||
publishers = book_data.get("publishers")
|
||||
if isinstance(publishers, list) and publishers:
|
||||
pub0 = publishers[0]
|
||||
if isinstance(pub0, dict) and pub0.get("name"):
|
||||
_add(f"publisher:{pub0['name']}")
|
||||
|
||||
desc = book_data.get("description")
|
||||
if isinstance(desc, dict) and "value" in desc:
|
||||
desc = desc.get("value")
|
||||
if desc:
|
||||
desc_str = str(desc).strip()
|
||||
if desc_str:
|
||||
_add(f"description:{desc_str[:200]}")
|
||||
|
||||
pages = book_data.get("number_of_pages")
|
||||
if isinstance(pages, int) and pages > 0:
|
||||
_add(f"pages:{pages}")
|
||||
|
||||
identifiers = book_data.get("identifiers")
|
||||
if isinstance(identifiers, dict):
|
||||
|
||||
def _first(value: Any) -> Any:
|
||||
if isinstance(value, list) and value:
|
||||
return value[0]
|
||||
return value
|
||||
|
||||
for key, ns in (
|
||||
("openlibrary", "openlibrary"),
|
||||
("lccn", "lccn"),
|
||||
("oclc", "oclc"),
|
||||
("goodreads", "goodreads"),
|
||||
("librarything", "librarything"),
|
||||
("doi", "doi"),
|
||||
("internet_archive", "internet_archive"),
|
||||
):
|
||||
val = _first(identifiers.get(key))
|
||||
if val:
|
||||
_add(f"{ns}:{val}")
|
||||
|
||||
if not any(str(t).lower().startswith("isbn:") for t in tags):
|
||||
tags.insert(0, f"isbn:{isbn_clean}")
|
||||
|
||||
# De-dupe case-insensitively, preserve order.
|
||||
seen: set[str] = set()
|
||||
out: List[str] = []
|
||||
for t in tags:
|
||||
k = str(t).strip().lower()
|
||||
if not k or k in seen:
|
||||
continue
|
||||
seen.add(k)
|
||||
out.append(str(t).strip())
|
||||
|
||||
if out:
|
||||
return out, "openlibrary"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 2) isbnsearch metadata provider fallback.
|
||||
try:
|
||||
from Provider.metadata_provider import get_metadata_provider
|
||||
|
||||
provider = get_metadata_provider("isbnsearch", config or {})
|
||||
if provider is None:
|
||||
return [], ""
|
||||
items = provider.search(isbn_clean, limit=1)
|
||||
if not items:
|
||||
return [], ""
|
||||
tags = provider.to_tags(items[0])
|
||||
if not any(str(t).lower().startswith("isbn:") for t in tags):
|
||||
tags = [f"isbn:{isbn_clean}"] + [str(t) for t in tags]
|
||||
return [str(t) for t in tags if str(t).strip()], provider.name
|
||||
except Exception:
|
||||
return [], ""
|
||||
|
||||
|
||||
def _fetch_libgen_details_html(url: str, *, timeout: Optional[Tuple[float, float]] = None) -> Optional[str]:
    """Fetch *url* and return its body when the response is HTML.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    session is used as a context manager so it is closed on every exit path,
    including errors and non-HTML responses.

    Args:
        url: Page to fetch.
        timeout: ``(connect, read)`` timeout pair; defaults to
            ``(DEFAULT_CONNECT_TIMEOUT, DEFAULT_READ_TIMEOUT)``.

    Returns:
        The response text, or ``None`` when the request fails, the status is
        an error, or the ``Content-Type`` header does not contain
        ``text/html``.
    """
    try:
        if timeout is None:
            timeout = (DEFAULT_CONNECT_TIMEOUT, DEFAULT_READ_TIMEOUT)
        with requests.Session() as session:
            session.headers.update(
                {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
                }
            )
            # stream=True defers the body download until the content type is checked.
            with session.get(str(url), stream=True, timeout=timeout) as resp:
                resp.raise_for_status()
                ct = str(resp.headers.get("Content-Type", "")).lower()
                if "text/html" not in ct:
                    return None
                return resp.text
    except Exception:
        # Best-effort fetch: any failure simply means "no details available".
        return None
|
||||
|
||||
|
||||
def _parse_libgen_details_html(html: str) -> Dict[str, Any]:
    """Parse LibGen details-page HTML (edition.php/file.php) into a metadata dict.

    Two layouts are recognised: two-cell ``Label | Value`` table rows, and
    ``<strong>Label:</strong> value`` blocks. Table rows win when both supply
    the same label. Each field is stored under a normalised key (lowercase,
    non-alphanumerics collapsed to ``_``); when the value contains links, the
    list of link texts is stored instead of the flattened text. A few
    well-known keys are then normalised and the untouched label/value pairs
    are returned under ``_raw_fields``.

    Best-effort and intentionally tolerant of mirror variations.
    """
    out: Dict[str, Any] = {}
    raw_fields: Dict[str, str] = {}
    page = str(html or "")

    def _record(label: str, value_html: str, value_text: str) -> None:
        # Remember the raw pair, then store it under its normalised key.
        raw_fields[label] = value_text
        norm = re.sub(r"[^a-z0-9]+", "_", label.strip().lower()).strip("_")
        if not norm:
            return
        # Prefer anchors for multi-valued fields.
        anchors = _extract_anchor_texts(value_html)
        out[norm] = anchors if anchors else value_text

    # Fast path: try to pull simple Label/Value table rows.
    row_pattern = r"(?is)<tr\b[^>]*>\s*<t[dh]\b[^>]*>\s*([^<]{1,80}?)\s*:??\s*</t[dh]>\s*<t[dh]\b[^>]*>(.*?)</t[dh]>\s*</tr>"
    for row in re.finditer(row_pattern, page):
        row_label = _strip_html_to_text(row.group(1))
        row_value_html = str(row.group(2) or "")
        if not row_label:
            continue
        row_value_text = _strip_html_to_text(row_value_html)
        if row_value_text:
            _record(row_label, row_value_html, row_value_text)

    # Some libgen.gl edition pages group metadata as repeated blocks like:
    #   <strong>Title:</strong>
    #   The Title
    # We'll parse those too (best-effort, no DOM required).
    strong_matches = list(re.finditer(r"(?is)<strong\b[^>]*>(.*?)</strong>", page))
    # Each value runs up to the next <strong>, or to the end of the document.
    chunk_ends = [nxt.start() for nxt in strong_matches[1:]] + [len(page)]
    for strong, chunk_end in zip(strong_matches, chunk_ends):
        strong_label = str(_strip_html_to_text(strong.group(1)) or "").strip()
        if not strong_label:
            continue

        # Normalize label (strip trailing colon if present).
        if strong_label.endswith(":"):
            strong_label = strong_label[:-1].strip()

        # If we already have a value for this label from a table row, keep it.
        if strong_label in raw_fields:
            continue

        chunk_html = page[strong.end():chunk_end]
        chunk_text = _strip_html_to_text(chunk_html)
        if chunk_text:
            _record(strong_label, chunk_html, chunk_text)

    # Normalize keys of interest.
    def _first_str(v: Any) -> str:
        if isinstance(v, list) and v:
            return str(v[0] or "").strip()
        return str(v or "").strip()

    title = _first_str(out.get("title"))
    if title:
        out["title"] = title

    authors = out.get("author_s") or out.get("authors") or out.get("author")
    if isinstance(authors, str):
        authors_list = _split_listish_text(authors)
    elif isinstance(authors, list):
        authors_list = [str(x).strip() for x in authors if str(x).strip()]
    else:
        authors_list = []
    if authors_list:
        out["authors"] = authors_list

    for scalar_key in ("publisher", "year", "language"):
        scalar_value = _first_str(out.get(scalar_key))
        if scalar_value:
            out[scalar_key] = scalar_value

    oclc = _first_str(out.get("oclc_worldcat")) or _first_str(out.get("oclc"))
    if oclc:
        m_oclc = re.search(r"\b\d{5,15}\b", oclc)
        out["oclc"] = str(m_oclc.group(0)) if m_oclc else oclc

    tags_val = out.get("tags")
    if isinstance(tags_val, list):
        tags_list = [str(x).strip() for x in tags_val if str(x).strip()]
    elif isinstance(tags_val, str):
        tags_list = _split_listish_text(tags_val)
    else:
        tags_list = []
    if tags_list:
        out["tags"] = tags_list

    isbn_val = out.get("isbn")
    if isinstance(isbn_val, list):
        isbn_text = " ".join([str(x) for x in isbn_val if x])
    else:
        isbn_text = str(isbn_val or "")
    isbns = _extract_isbns(isbn_text)
    if isbns:
        out["isbn"] = isbns

    edition_id = _first_str(out.get("edition_id"))
    if edition_id:
        m_eid = re.search(r"\b\d+\b", edition_id)
        out["edition_id"] = str(m_eid.group(0)) if m_eid else edition_id

    if raw_fields:
        out["_raw_fields"] = raw_fields

    return out
|
||||
|
||||
|
||||
def _libgen_metadata_to_tags(meta: Dict[str, Any]) -> List[str]:
|
||||
tags: List[str] = []
|
||||
seen: set[str] = set()
|
||||
|
||||
def _add(t: str) -> None:
|
||||
s = str(t or "").strip()
|
||||
if not s:
|
||||
return
|
||||
k = s.lower()
|
||||
if k in seen:
|
||||
return
|
||||
seen.add(k)
|
||||
tags.append(s)
|
||||
|
||||
title = str(meta.get("title") or "").strip()
|
||||
if title:
|
||||
_add(f"title:{title}")
|
||||
|
||||
for a in meta.get("authors") or []:
|
||||
a = str(a or "").strip()
|
||||
if a:
|
||||
_add(f"author:{a}")
|
||||
|
||||
publisher = str(meta.get("publisher") or "").strip()
|
||||
if publisher:
|
||||
_add(f"publisher:{publisher}")
|
||||
|
||||
year = str(meta.get("year") or "").strip()
|
||||
if year:
|
||||
_add(f"year:{year}")
|
||||
|
||||
language = str(meta.get("language") or "").strip()
|
||||
if language:
|
||||
_add(f"language:{language}")
|
||||
|
||||
for isbn in meta.get("isbn") or []:
|
||||
isbn = str(isbn or "").strip().replace("-", "")
|
||||
if isbn:
|
||||
_add(f"isbn:{isbn}")
|
||||
|
||||
oclc = str(meta.get("oclc") or "").strip()
|
||||
if oclc:
|
||||
_add(f"oclc:{oclc}")
|
||||
|
||||
edition_id = str(meta.get("edition_id") or "").strip()
|
||||
if edition_id:
|
||||
_add(f"libgen_edition_id:{edition_id}")
|
||||
|
||||
# Freeform tags (no "tags:" prefix).
|
||||
for t in meta.get("tags") or []:
|
||||
t = str(t or "").strip()
|
||||
if t:
|
||||
_add(t)
|
||||
|
||||
# Any additional structured fields we captured are preserved under a libgen_ namespace.
|
||||
raw_fields = meta.get("_raw_fields")
|
||||
if isinstance(raw_fields, dict):
|
||||
for k, v in raw_fields.items():
|
||||
lk = str(k or "").strip().lower()
|
||||
if lk in {"title", "author(s)", "authors", "author", "publisher", "year", "isbn", "language", "oclc/worldcat", "tags", "edition id"}:
|
||||
continue
|
||||
vv = str(v or "").strip()
|
||||
if not vv:
|
||||
continue
|
||||
ns = re.sub(r"[^a-z0-9]+", "_", lk).strip("_")
|
||||
if ns:
|
||||
_add(f"libgen_{ns}:{vv}")
|
||||
|
||||
return tags
|
||||
|
||||
|
||||
class Libgen(Provider):
|
||||
# Domains that should be routed to this provider when the user supplies a URL.
|
||||
# (Used by ProviderCore.registry.match_provider_name_for_url)
|
||||
@@ -146,6 +736,15 @@ class Libgen(Provider):
|
||||
md5 = str(md.get("md5") or "").strip()
|
||||
extension = str(md.get("extension") or "").strip().lstrip(".")
|
||||
|
||||
# If the user passed ads.php/get.php directly, capture md5 from the URL so
|
||||
# filenames are stable (avoid always writing `libgen.pdf`).
|
||||
if (not md5) and isinstance(target, str) and target.startswith("http"):
|
||||
md5 = _libgen_md5_from_url(target)
|
||||
if md5:
|
||||
md["md5"] = md5
|
||||
|
||||
# Defer LibGen details-page metadata and ISBN enrichment until AFTER the file is downloaded.
|
||||
|
||||
if (not target) or target.startswith("libgen:"):
|
||||
if md5 and re.fullmatch(r"[a-fA-F0-9]{32}", md5):
|
||||
target = urljoin(MIRRORS[0], f"/ads.php?md5={md5}")
|
||||
@@ -153,7 +752,10 @@ class Libgen(Provider):
|
||||
if not target:
|
||||
return None
|
||||
|
||||
base_name = sanitize_filename(title or md5 or "libgen")
|
||||
if title and title.startswith("http"):
|
||||
title = ""
|
||||
|
||||
base_name = sanitize_filename(title or md5 or (f"libgen_{_libgen_id_from_url(target)}" if _libgen_id_from_url(target) else "libgen"))
|
||||
out_path = output_dir / base_name
|
||||
if extension:
|
||||
out_path = out_path.with_suffix(f".{extension}")
|
||||
@@ -190,6 +792,122 @@ class Libgen(Provider):
|
||||
ok, final_path = download_from_mirror(target, out_path, progress_callback=progress_callback)
|
||||
progress_bar.finish()
|
||||
if ok and final_path:
|
||||
# After the download completes, best-effort fetch details metadata (title + ISBN)
|
||||
# and then enrich tags via OpenLibrary/isbnsearch. This ensures enrichment never
|
||||
# blocks the download itself.
|
||||
try:
|
||||
if isinstance(target, str) and target.startswith("http"):
|
||||
low = target.lower()
|
||||
# Preferred: ads.php pages often embed a complete tag block.
|
||||
# Parse it post-download (best-effort) and do NOT perform external
|
||||
# enrichment (OpenLibrary/isbnsearch) unless the user later chooses to.
|
||||
if ("/ads.php" in low) or ("/get.php" in low):
|
||||
ads_url = target if "/ads.php" in low else _libgen_ads_url_for_target(target)
|
||||
if ads_url:
|
||||
html = _fetch_libgen_details_html(ads_url, timeout=(DEFAULT_CONNECT_TIMEOUT, 4.0))
|
||||
if html:
|
||||
meta = _parse_libgen_ads_tags_html(html)
|
||||
extracted_title = str(meta.get("title") or "").strip()
|
||||
if extracted_title:
|
||||
md["title"] = extracted_title
|
||||
result.tag.add(f"title:{extracted_title}")
|
||||
if (not title) or title.startswith("http"):
|
||||
title = extracted_title
|
||||
|
||||
authors = meta.get("authors") if isinstance(meta.get("authors"), list) else []
|
||||
for a in (authors or []):
|
||||
aa = str(a or "").strip()
|
||||
if aa:
|
||||
result.tag.add(f"author:{aa}")
|
||||
|
||||
publisher = str(meta.get("publisher") or "").strip()
|
||||
if publisher:
|
||||
md["publisher"] = publisher
|
||||
result.tag.add(f"publisher:{publisher}")
|
||||
|
||||
year = str(meta.get("year") or "").strip()
|
||||
if year:
|
||||
md["year"] = year
|
||||
result.tag.add(f"year:{year}")
|
||||
|
||||
language = str(meta.get("language") or "").strip()
|
||||
if language:
|
||||
md["language"] = language
|
||||
result.tag.add(f"language:{language}")
|
||||
|
||||
isbns = meta.get("isbn") if isinstance(meta.get("isbn"), list) else []
|
||||
isbns = [str(x).strip() for x in (isbns or []) if str(x).strip()]
|
||||
if isbns:
|
||||
md["isbn"] = isbns
|
||||
for isbn_val in isbns:
|
||||
result.tag.add(f"isbn:{isbn_val}")
|
||||
|
||||
free_tags = meta.get("tags") if isinstance(meta.get("tags"), list) else []
|
||||
for t in (free_tags or []):
|
||||
tt = str(t or "").strip()
|
||||
if tt:
|
||||
result.tag.add(tt)
|
||||
|
||||
# Preserve any other extracted fields (namespaced).
|
||||
raw_fields = meta.get("_raw_fields")
|
||||
if isinstance(raw_fields, dict):
|
||||
for k, v in raw_fields.items():
|
||||
lk = str(k or "").strip().lower()
|
||||
if lk in {"title", "author", "authors", "publisher", "year", "isbn", "language", "tags"}:
|
||||
continue
|
||||
vv = str(v or "").strip()
|
||||
if not vv:
|
||||
continue
|
||||
ns = re.sub(r"[^a-z0-9]+", "_", lk).strip("_")
|
||||
if ns:
|
||||
result.tag.add(f"libgen_{ns}:{vv}")
|
||||
|
||||
# Legacy: edition/file/series details pages (title + ISBN) + external enrichment.
|
||||
if ("/edition.php" in low) or ("/file.php" in low) or ("/series.php" in low):
|
||||
html = _fetch_libgen_details_html(target)
|
||||
if html:
|
||||
meta = _parse_libgen_details_html(html)
|
||||
|
||||
if not meta.get("edition_id"):
|
||||
eid = _libgen_id_from_url(target)
|
||||
if eid:
|
||||
meta["edition_id"] = eid
|
||||
|
||||
extracted_title = str(meta.get("title") or "").strip()
|
||||
extracted_isbns = meta.get("isbn") if isinstance(meta.get("isbn"), list) else []
|
||||
extracted_isbns = [str(x).strip() for x in (extracted_isbns or []) if str(x).strip()]
|
||||
|
||||
if extracted_title:
|
||||
md["title"] = extracted_title
|
||||
result.tag.add(f"title:{extracted_title}")
|
||||
if extracted_isbns:
|
||||
md["isbn"] = extracted_isbns
|
||||
for isbn_val in extracted_isbns:
|
||||
isbn_norm = str(isbn_val).strip().replace("-", "")
|
||||
if isbn_norm:
|
||||
result.tag.add(f"isbn:{isbn_norm}")
|
||||
if meta.get("edition_id"):
|
||||
md["edition_id"] = str(meta.get("edition_id"))
|
||||
|
||||
preferred_isbn = _prefer_isbn(extracted_isbns)
|
||||
if preferred_isbn:
|
||||
enriched_tags, enriched_source = _enrich_book_tags_from_isbn(
|
||||
preferred_isbn,
|
||||
config=getattr(self, "config", None),
|
||||
)
|
||||
if enriched_tags:
|
||||
try:
|
||||
result.tag.update(set(enriched_tags))
|
||||
except Exception:
|
||||
pass
|
||||
if enriched_source:
|
||||
md["metadata_enriched_from"] = enriched_source
|
||||
|
||||
if extracted_title and ((not title) or title.startswith("http")):
|
||||
title = extracted_title
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return Path(final_path)
|
||||
return None
|
||||
except Exception:
|
||||
@@ -751,6 +1469,34 @@ def _resolve_download_url(
|
||||
if not html:
|
||||
return None
|
||||
|
||||
# LibGen chain helpers (for environments without lxml).
|
||||
# Typical chain:
|
||||
# edition.php?id=... -> file.php?id=...
|
||||
# file.php?id=... -> ads.php?md5=... (or get.php?md5=...)
|
||||
# ads.php?md5=... -> get.php?md5=...
|
||||
# get.php?md5=... -> file response
|
||||
|
||||
# Handle edition -> file links.
|
||||
m = re.search(r'href=["\']([^"\']*file\.php\?id=\d+[^"\']*)["\']', html, flags=re.IGNORECASE)
|
||||
if m:
|
||||
href = str(m.group(1) or "").strip()
|
||||
if href and not href.lower().startswith("javascript:"):
|
||||
return urljoin(base_url, href)
|
||||
|
||||
# Handle series -> edition links.
|
||||
m = re.search(r'href=["\']([^"\']*edition\.php\?id=\d+[^"\']*)["\']', html, flags=re.IGNORECASE)
|
||||
if m:
|
||||
href = str(m.group(1) or "").strip()
|
||||
if href and not href.lower().startswith("javascript:"):
|
||||
return urljoin(base_url, href)
|
||||
|
||||
# Handle file -> ads/get links (sometimes present as the "Libgen" mirror).
|
||||
m = re.search(r'href=["\']([^"\']*ads\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', html, flags=re.IGNORECASE)
|
||||
if m:
|
||||
href = str(m.group(1) or "").strip()
|
||||
if href and not href.lower().startswith("javascript:"):
|
||||
return urljoin(base_url, href)
|
||||
|
||||
# Prefer explicit get.php md5 links (most common successful chain).
|
||||
m = re.search(r'href=["\']([^"\']*get\.php\?md5=[a-fA-F0-9]{32}[^"\']*)["\']', html, flags=re.IGNORECASE)
|
||||
if m:
|
||||
|
||||
@@ -2,6 +2,8 @@ from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Type, cast
|
||||
import html as html_std
|
||||
import re
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
@@ -279,6 +281,199 @@ class GoogleBooksMetadataProvider(MetadataProvider):
|
||||
return tags
|
||||
|
||||
|
||||
class ISBNsearchMetadataProvider(MetadataProvider):
|
||||
"""Metadata provider that scrapes isbnsearch.org by ISBN.
|
||||
|
||||
This is a best-effort HTML scrape. It expects the query to be an ISBN.
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str: # type: ignore[override]
|
||||
return "isbnsearch"
|
||||
|
||||
@staticmethod
|
||||
def _strip_html_to_text(raw: str) -> str:
|
||||
s = html_std.unescape(str(raw or ""))
|
||||
s = re.sub(r"(?i)<br\s*/?>", "\n", s)
|
||||
s = re.sub(r"<[^>]+>", " ", s)
|
||||
s = re.sub(r"\s+", " ", s)
|
||||
return s.strip()
|
||||
|
||||
@staticmethod
|
||||
def _clean_isbn(query: str) -> str:
|
||||
s = str(query or "").strip()
|
||||
if not s:
|
||||
return ""
|
||||
s = s.replace("isbn:", "").replace("ISBN:", "")
|
||||
s = re.sub(r"[^0-9Xx]", "", s).upper()
|
||||
if len(s) in (10, 13):
|
||||
return s
|
||||
# Try to locate an ISBN-like token inside the query.
|
||||
m = re.search(r"\b(?:97[89])?\d{9}[\dXx]\b", s)
|
||||
return str(m.group(0)).upper() if m else ""
|
||||
|
||||
def search(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
|
||||
_ = limit
|
||||
isbn = self._clean_isbn(query)
|
||||
if not isbn:
|
||||
return []
|
||||
|
||||
url = f"https://isbnsearch.org/isbn/{isbn}"
|
||||
try:
|
||||
resp = requests.get(url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
html = str(resp.text or "")
|
||||
if not html:
|
||||
return []
|
||||
except Exception as exc:
|
||||
log(f"ISBNsearch scrape failed: {exc}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
title = ""
|
||||
m_title = re.search(r"(?is)<h1\b[^>]*>(.*?)</h1>", html)
|
||||
if m_title:
|
||||
title = self._strip_html_to_text(m_title.group(1))
|
||||
|
||||
raw_fields: Dict[str, str] = {}
|
||||
strong_matches = list(re.finditer(r"(?is)<strong\b[^>]*>(.*?)</strong>", html))
|
||||
for idx, m in enumerate(strong_matches):
|
||||
label_raw = self._strip_html_to_text(m.group(1))
|
||||
label = str(label_raw or "").strip()
|
||||
if not label:
|
||||
continue
|
||||
if label.endswith(":"):
|
||||
label = label[:-1].strip()
|
||||
|
||||
chunk_start = m.end()
|
||||
# Stop at next <strong> or end of document.
|
||||
chunk_end = strong_matches[idx + 1].start() if (idx + 1) < len(strong_matches) else len(html)
|
||||
chunk = html[chunk_start:chunk_end]
|
||||
# Prefer stopping within the same paragraph when possible.
|
||||
m_end = re.search(r"(?is)(</p>|<br\s*/?>)", chunk)
|
||||
if m_end:
|
||||
chunk = chunk[: m_end.start()]
|
||||
|
||||
val_text = self._strip_html_to_text(chunk)
|
||||
if not val_text:
|
||||
continue
|
||||
raw_fields[label] = val_text
|
||||
|
||||
def _get(*labels: str) -> str:
|
||||
for lab in labels:
|
||||
for k, v in raw_fields.items():
|
||||
if str(k).strip().lower() == str(lab).strip().lower():
|
||||
return str(v or "").strip()
|
||||
return ""
|
||||
|
||||
# Map common ISBNsearch labels.
|
||||
author_text = _get("Author", "Authors", "Author(s)")
|
||||
publisher = _get("Publisher")
|
||||
published = _get("Published", "Publication Date", "Publish Date")
|
||||
language = _get("Language")
|
||||
pages = _get("Pages")
|
||||
isbn_13 = _get("ISBN-13", "ISBN13")
|
||||
isbn_10 = _get("ISBN-10", "ISBN10")
|
||||
|
||||
year = ""
|
||||
if published:
|
||||
m_year = re.search(r"\b(\d{4})\b", published)
|
||||
year = str(m_year.group(1)) if m_year else ""
|
||||
|
||||
authors: List[str] = []
|
||||
if author_text:
|
||||
# Split on common separators; keep multi-part names intact.
|
||||
for part in re.split(r"\s*(?:,|;|\band\b|\&|\|)\s*", author_text, flags=re.IGNORECASE):
|
||||
p = str(part or "").strip()
|
||||
if p:
|
||||
authors.append(p)
|
||||
|
||||
# Prefer parsed title, but fall back to og:title if needed.
|
||||
if not title:
|
||||
m_og = re.search(r"(?is)<meta\b[^>]*property=['\"]og:title['\"][^>]*content=['\"](.*?)['\"][^>]*>", html)
|
||||
if m_og:
|
||||
title = self._strip_html_to_text(m_og.group(1))
|
||||
|
||||
# Ensure ISBN tokens are normalized.
|
||||
isbn_tokens: List[str] = []
|
||||
for token in [isbn_13, isbn_10, isbn]:
|
||||
t = self._clean_isbn(token)
|
||||
if t and t not in isbn_tokens:
|
||||
isbn_tokens.append(t)
|
||||
|
||||
item: Dict[str, Any] = {
|
||||
"title": title or "",
|
||||
# Keep UI columns compatible with the generic metadata table.
|
||||
"artist": ", ".join(authors) if authors else "",
|
||||
"album": publisher or "",
|
||||
"year": year or "",
|
||||
"provider": self.name,
|
||||
"authors": authors,
|
||||
"publisher": publisher or "",
|
||||
"language": language or "",
|
||||
"pages": pages or "",
|
||||
"identifiers": {
|
||||
"isbn_13": next((t for t in isbn_tokens if len(t) == 13), None),
|
||||
"isbn_10": next((t for t in isbn_tokens if len(t) == 10), None),
|
||||
},
|
||||
"raw_fields": raw_fields,
|
||||
}
|
||||
|
||||
# Only return usable items.
|
||||
if not item.get("title") and not any(item["identifiers"].values()):
|
||||
return []
|
||||
|
||||
return [item]
|
||||
|
||||
def to_tags(self, item: Dict[str, Any]) -> List[str]:
    """Turn one metadata item into a flat list of ``namespace:value`` tags.

    Tags are produced in this order: ``title:``, one ``author:`` per entry
    of ``item["authors"]`` (only when that value is a list), ``publisher:``,
    ``year:``, ``language:``, one ``isbn:`` for each of the ``isbn_13`` /
    ``isbn_10`` identifiers that is set, and finally ``source:<self.name>``.

    Blank values are skipped. The result is de-duplicated case-insensitively,
    keeping the first spelling seen and the original order.
    """

    def _field_text(key: str) -> str:
        # Missing / None / falsy values collapse to an empty string.
        return str(item.get(key) or "").strip()

    candidates: List[str] = []

    title_text = _field_text("title")
    if title_text:
        candidates.append(f"title:{title_text}")

    author_entries = item.get("authors") or []
    if isinstance(author_entries, list):
        cleaned_names = (str(entry or "").strip() for entry in author_entries)
        candidates.extend(f"author:{name}" for name in cleaned_names if name)

    # These three fields share the same "namespace == item key" shape.
    for field_name in ("publisher", "year", "language"):
        field_value = _field_text(field_name)
        if field_value:
            candidates.append(f"{field_name}:{field_value}")

    id_map = item.get("identifiers") or {}
    if isinstance(id_map, dict):
        for id_key in ("isbn_13", "isbn_10"):
            id_value = id_map.get(id_key)
            if id_value:
                candidates.append(f"isbn:{id_value}")

    candidates.append(f"source:{self.name}")

    # Dicts keep insertion order, and setdefault keeps the first spelling,
    # which gives an order-preserving, case-insensitive de-duplication.
    unique_by_lower: Dict[str, str] = {}
    for candidate in candidates:
        cleaned = str(candidate or "").strip()
        if cleaned:
            unique_by_lower.setdefault(cleaned.lower(), cleaned)
    return list(unique_by_lower.values())
|
||||
|
||||
|
||||
class MusicBrainzMetadataProvider(MetadataProvider):
|
||||
"""Metadata provider for MusicBrainz recordings."""
|
||||
|
||||
@@ -517,6 +712,7 @@ _METADATA_PROVIDERS: Dict[str, Type[MetadataProvider]] = {
|
||||
"openlibrary": OpenLibraryMetadataProvider,
|
||||
"googlebooks": GoogleBooksMetadataProvider,
|
||||
"google": GoogleBooksMetadataProvider,
|
||||
"isbnsearch": ISBNsearchMetadataProvider,
|
||||
"musicbrainz": MusicBrainzMetadataProvider,
|
||||
"ytdlp": YtdlpMetadataProvider,
|
||||
}
|
||||
|
||||
@@ -190,6 +190,7 @@ def _archive_id_from_url(url: str) -> str:
|
||||
u = str(url or "").strip()
|
||||
if not u:
|
||||
return ""
|
||||
|
||||
try:
|
||||
p = urlparse(u)
|
||||
host = (p.hostname or "").lower().strip()
|
||||
@@ -215,6 +216,175 @@ def _archive_id_from_url(url: str) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _coerce_archive_field_list(value: Any) -> List[str]:
|
||||
"""Coerce an Archive.org metadata field to a list of strings."""
|
||||
if value is None:
|
||||
return []
|
||||
if isinstance(value, list):
|
||||
out: List[str] = []
|
||||
for v in value:
|
||||
try:
|
||||
s = str(v).strip()
|
||||
except Exception:
|
||||
continue
|
||||
if s:
|
||||
out.append(s)
|
||||
return out
|
||||
if isinstance(value, (tuple, set)):
|
||||
out = []
|
||||
for v in value:
|
||||
try:
|
||||
s = str(v).strip()
|
||||
except Exception:
|
||||
continue
|
||||
if s:
|
||||
out.append(s)
|
||||
return out
|
||||
try:
|
||||
s = str(value).strip()
|
||||
except Exception:
|
||||
return []
|
||||
return [s] if s else []
|
||||
|
||||
|
||||
def _archive_item_metadata_to_tags(archive_id: str, item_metadata: Dict[str, Any]) -> List[str]:
    """Translate an Archive.org ``metadata`` object into tag strings.

    Best-effort and conservative: only title, creators/authors, publisher,
    date/year, language, subjects, ISBNs and identifiers are mapped. Tags are
    de-duplicated case-insensitively and returned in the order produced.
    A non-dict ``item_metadata`` is treated as empty.
    """
    own_id = str(archive_id or "").strip()
    fields = item_metadata if isinstance(item_metadata, dict) else {}

    collected: List[str] = []
    emitted: set[str] = set()

    def _emit(tag: str) -> None:
        # Append a tag unless it is blank or already present (ignoring case).
        try:
            text = str(tag).strip()
        except Exception:
            return
        key = text.lower()
        if text and key not in emitted:
            emitted.add(key)
            collected.append(text)

    def _values(field: str) -> List[str]:
        return _coerce_archive_field_list(fields.get(field))

    def _strip_hyphens(raw: str) -> str:
        return str(raw or "").replace("-", "").strip()

    if own_id:
        _emit(f"internet_archive:{own_id}")

    # Title: first value only.
    for title_value in _values("title")[:1]:
        _emit(f"title:{title_value}")

    # Authors: "creator" entries first, then "author", capped at three total.
    for person in (_values("creator") + _values("author"))[:3]:
        _emit(f"author:{person}")

    for publisher_value in _values("publisher")[:3]:
        _emit(f"publisher:{publisher_value}")

    # Both "date" and "year" feed the same publish_date namespace.
    for date_field in ("date", "year"):
        for when in _values(date_field)[:1]:
            _emit(f"publish_date:{when}")

    for language_value in _values("language")[:3]:
        _emit(f"language:{language_value}")

    # Subjects are emitted without a namespace (follows existing OpenLibrary
    # behavior) and clipped to 200 characters.
    for subject in _values("subject")[:15]:
        _emit(subject[:200])

    for raw_isbn in _values("isbn")[:10]:
        isbn_value = _strip_hyphens(raw_isbn)
        if isbn_value:
            _emit(f"isbn:{isbn_value}")

    # (lowercase prefix, maxsplit that isolates the value, tag namespace)
    prefix_rules = (
        ("urn:isbn:", 2, "isbn"),
        ("isbn:", 1, "isbn"),
        ("urn:oclc:", 2, "oclc"),
        ("oclc:", 1, "oclc"),
        ("urn:lccn:", 2, "lccn"),
        ("lccn:", 1, "lccn"),
        ("doi:", 1, "doi"),
    )

    generic_count = 0
    for raw_ident in _values("identifier") + _values("external-identifier"):
        ident_text = str(raw_ident or "").strip()
        if not ident_text:
            continue
        lowered = ident_text.lower()

        rule = next((r for r in prefix_rules if lowered.startswith(r[0])), None)
        if rule is not None:
            _, max_split, namespace = rule
            tail = ident_text.split(":", max_split)[-1]
            value = _strip_hyphens(tail) if namespace == "isbn" else tail.strip()
            if value:
                _emit(f"{namespace}:{value}")
            continue

        # The item's own id is already covered by the internet_archive: tag.
        if own_id and lowered == own_id.lower():
            continue
        # At most five unrecognized identifiers are considered.
        if generic_count >= 5:
            continue
        _emit(f"identifier:{ident_text[:200]}")
        generic_count += 1

    return collected
|
||||
|
||||
|
||||
def _fetch_archive_item_metadata(archive_id: str, *, timeout: int = 8) -> Dict[str, Any]:
|
||||
ident = str(archive_id or "").strip()
|
||||
if not ident:
|
||||
return {}
|
||||
resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout))
|
||||
resp.raise_for_status()
|
||||
data = resp.json() if resp is not None else {}
|
||||
if not isinstance(data, dict):
|
||||
return {}
|
||||
meta = data.get("metadata")
|
||||
return meta if isinstance(meta, dict) else {}
|
||||
|
||||
|
||||
class OpenLibrary(Provider):
|
||||
# Domains that should be routed to this provider when the user supplies a URL.
|
||||
# (Used by ProviderCore.registry.match_provider_name_for_url)
|
||||
@@ -1046,6 +1216,25 @@ class OpenLibrary(Provider):
|
||||
log("[openlibrary] No archive identifier available; cannot download", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets.
|
||||
try:
|
||||
archive_meta = _fetch_archive_item_metadata(archive_id)
|
||||
tags = _archive_item_metadata_to_tags(archive_id, archive_meta)
|
||||
if tags:
|
||||
try:
|
||||
result.tag.update(tags)
|
||||
except Exception:
|
||||
# Fallback for callers that pass plain dicts.
|
||||
pass
|
||||
if isinstance(meta, dict):
|
||||
meta["archive_id"] = archive_id
|
||||
if archive_meta:
|
||||
meta["archive_metadata"] = archive_meta
|
||||
result.full_metadata = meta
|
||||
except Exception:
|
||||
# Never block downloads on metadata fetch.
|
||||
pass
|
||||
|
||||
safe_title = sanitize_filename(result.title)
|
||||
if not safe_title or "http" in safe_title.lower():
|
||||
safe_title = sanitize_filename(archive_id) or "archive"
|
||||
|
||||
@@ -306,36 +306,41 @@ class Soulseek(Provider):
|
||||
client = SoulSeekClient(settings)
|
||||
|
||||
with _suppress_aioslsk_noise():
|
||||
try:
|
||||
await client.start()
|
||||
await client.login()
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Login failed: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
async with _suppress_aioslsk_asyncio_task_noise():
|
||||
try:
|
||||
await client.start()
|
||||
await client.login()
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Login failed: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
try:
|
||||
search_request = await client.searches.search(query)
|
||||
await self._collect_results(search_request, timeout=timeout)
|
||||
return self._flatten_results(search_request)[:limit]
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Search error: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
finally:
|
||||
# Best-effort: try to cancel/close the search request before stopping
|
||||
# the client to reduce stray reply spam.
|
||||
try:
|
||||
if "search_request" in locals() and search_request is not None:
|
||||
cancel = getattr(search_request, "cancel", None)
|
||||
if callable(cancel):
|
||||
maybe = cancel()
|
||||
if asyncio.iscoroutine(maybe):
|
||||
await maybe
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
await client.stop()
|
||||
except Exception:
|
||||
pass
|
||||
search_request = await client.searches.search(query)
|
||||
await self._collect_results(search_request, timeout=timeout)
|
||||
return self._flatten_results(search_request)[:limit]
|
||||
except Exception as exc:
|
||||
log(f"[soulseek] Search error: {type(exc).__name__}: {exc}", file=sys.stderr)
|
||||
return []
|
||||
finally:
|
||||
# Best-effort: try to cancel/close the search request before stopping
|
||||
# the client to reduce stray reply spam.
|
||||
try:
|
||||
if "search_request" in locals() and search_request is not None:
|
||||
cancel = getattr(search_request, "cancel", None)
|
||||
if callable(cancel):
|
||||
maybe = cancel()
|
||||
if asyncio.iscoroutine(maybe):
|
||||
await maybe
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
await client.stop()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
await asyncio.sleep(0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _flatten_results(self, search_request: Any) -> List[dict]:
|
||||
flat: List[dict] = []
|
||||
@@ -558,93 +563,101 @@ async def download_soulseek_file(
|
||||
async def _attempt_once(attempt_num: int) -> tuple[Optional[Path], Any, int, float]:
|
||||
client = SoulSeekClient(settings)
|
||||
with _suppress_aioslsk_noise():
|
||||
try:
|
||||
async with _suppress_aioslsk_asyncio_task_noise():
|
||||
await client.start()
|
||||
await client.login()
|
||||
debug(f"[soulseek] Logged in as {login_user}")
|
||||
|
||||
log(
|
||||
f"[soulseek] Download attempt {attempt_num}: {username} :: {local_filename}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
debug(f"[soulseek] Requesting download from {username}: {filename}")
|
||||
|
||||
transfer = await client.transfers.add(Transfer(username, filename, TransferDirection.DOWNLOAD))
|
||||
transfer.local_path = str(output_path)
|
||||
await client.transfers.queue(transfer)
|
||||
|
||||
start_time = time.time()
|
||||
last_progress_time = start_time
|
||||
progress_bar = ProgressBar()
|
||||
|
||||
while not transfer.is_finalized():
|
||||
elapsed = time.time() - start_time
|
||||
if elapsed > timeout:
|
||||
log(f"[soulseek] Download timeout after {timeout}s", file=sys.stderr)
|
||||
bytes_done = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
state_val = getattr(getattr(transfer, "state", None), "VALUE", None)
|
||||
progress_bar.finish()
|
||||
return None, state_val, bytes_done, elapsed
|
||||
|
||||
bytes_done = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
total_bytes = int(getattr(transfer, "filesize", 0) or 0)
|
||||
now = time.time()
|
||||
if now - last_progress_time >= 0.5:
|
||||
progress_bar.update(
|
||||
downloaded=bytes_done,
|
||||
total=total_bytes if total_bytes > 0 else None,
|
||||
label="download",
|
||||
file=sys.stderr,
|
||||
)
|
||||
last_progress_time = now
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
final_state = getattr(getattr(transfer, "state", None), "VALUE", None)
|
||||
downloaded_path = Path(transfer.local_path) if getattr(transfer, "local_path", None) else output_path
|
||||
final_elapsed = time.time() - start_time
|
||||
|
||||
# Clear in-place progress bar.
|
||||
progress_bar.finish()
|
||||
|
||||
# If a file was written, treat it as success even if state is odd.
|
||||
async with _suppress_aioslsk_asyncio_task_noise():
|
||||
try:
|
||||
if downloaded_path.exists() and downloaded_path.stat().st_size > 0:
|
||||
if final_state != TransferState.COMPLETE:
|
||||
log(
|
||||
f"[soulseek] Transfer finalized as {final_state}, but file exists ({downloaded_path.stat().st_size} bytes). Keeping file.",
|
||||
await client.start()
|
||||
await client.login()
|
||||
debug(f"[soulseek] Logged in as {login_user}")
|
||||
|
||||
log(
|
||||
f"[soulseek] Download attempt {attempt_num}: {username} :: {local_filename}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
debug(f"[soulseek] Requesting download from {username}: {filename}")
|
||||
|
||||
transfer = await client.transfers.add(Transfer(username, filename, TransferDirection.DOWNLOAD))
|
||||
transfer.local_path = str(output_path)
|
||||
await client.transfers.queue(transfer)
|
||||
|
||||
start_time = time.time()
|
||||
last_progress_time = start_time
|
||||
progress_bar = ProgressBar()
|
||||
|
||||
while not transfer.is_finalized():
|
||||
elapsed = time.time() - start_time
|
||||
if elapsed > timeout:
|
||||
log(f"[soulseek] Download timeout after {timeout}s", file=sys.stderr)
|
||||
bytes_done = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
state_val = getattr(getattr(transfer, "state", None), "VALUE", None)
|
||||
progress_bar.finish()
|
||||
return None, state_val, bytes_done, elapsed
|
||||
|
||||
bytes_done = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
total_bytes = int(getattr(transfer, "filesize", 0) or 0)
|
||||
now = time.time()
|
||||
if now - last_progress_time >= 0.5:
|
||||
progress_bar.update(
|
||||
downloaded=bytes_done,
|
||||
total=total_bytes if total_bytes > 0 else None,
|
||||
label="download",
|
||||
file=sys.stderr,
|
||||
)
|
||||
last_progress_time = now
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
final_state = getattr(getattr(transfer, "state", None), "VALUE", None)
|
||||
downloaded_path = (
|
||||
Path(transfer.local_path)
|
||||
if getattr(transfer, "local_path", None)
|
||||
else output_path
|
||||
)
|
||||
final_elapsed = time.time() - start_time
|
||||
|
||||
# Clear in-place progress bar.
|
||||
progress_bar.finish()
|
||||
|
||||
# If a file was written, treat it as success even if state is odd.
|
||||
try:
|
||||
if downloaded_path.exists() and downloaded_path.stat().st_size > 0:
|
||||
if final_state != TransferState.COMPLETE:
|
||||
log(
|
||||
f"[soulseek] Transfer finalized as {final_state}, but file exists ({downloaded_path.stat().st_size} bytes). Keeping file.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return downloaded_path, final_state, int(downloaded_path.stat().st_size), final_elapsed
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if final_state == TransferState.COMPLETE and downloaded_path.exists():
|
||||
debug(f"[soulseek] Download complete: {downloaded_path}")
|
||||
return downloaded_path, final_state, int(downloaded_path.stat().st_size), final_elapsed
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if final_state == TransferState.COMPLETE and downloaded_path.exists():
|
||||
debug(f"[soulseek] Download complete: {downloaded_path}")
|
||||
return downloaded_path, final_state, int(downloaded_path.stat().st_size), final_elapsed
|
||||
fail_bytes = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
fail_total = int(getattr(transfer, "filesize", 0) or 0)
|
||||
reason = getattr(transfer, "reason", None)
|
||||
log(
|
||||
f"[soulseek] Download failed: state={final_state} bytes={fail_bytes}/{fail_total} reason={reason}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
fail_bytes = int(getattr(transfer, "bytes_transfered", 0) or 0)
|
||||
fail_total = int(getattr(transfer, "filesize", 0) or 0)
|
||||
reason = getattr(transfer, "reason", None)
|
||||
log(
|
||||
f"[soulseek] Download failed: state={final_state} bytes={fail_bytes}/{fail_total} reason={reason}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
# Clean up 0-byte placeholder.
|
||||
try:
|
||||
if downloaded_path.exists() and downloaded_path.stat().st_size == 0:
|
||||
downloaded_path.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
return None, final_state, fail_bytes, final_elapsed
|
||||
|
||||
finally:
|
||||
try:
|
||||
await client.stop()
|
||||
except Exception:
|
||||
pass
|
||||
# Clean up 0-byte placeholder.
|
||||
try:
|
||||
if downloaded_path.exists() and downloaded_path.stat().st_size == 0:
|
||||
downloaded_path.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
return None, final_state, fail_bytes, final_elapsed
|
||||
finally:
|
||||
try:
|
||||
await client.stop()
|
||||
except Exception:
|
||||
pass
|
||||
# Let cancellation/cleanup callbacks run while our exception handler is still installed.
|
||||
try:
|
||||
await asyncio.sleep(0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Retry a couple times only for fast 0-byte failures (common transient case).
|
||||
max_attempts = 3
|
||||
|
||||
Reference in New Issue
Block a user