Files
Medios-Macina/Provider/vimm.py

186 lines
6.0 KiB
Python
Raw Normal View History

2026-01-05 07:51:19 -08:00
"""Vimm provider skeleton (lxml + HTTPClient).
This is a lightweight, resilient provider implementation intended as a
starting point for implementing a full Vimm (vimm.net) provider.
It prefers server-rendered HTML parsing via lxml and uses the repo's
`HTTPClient` helper for robust HTTP calls (timeouts/retries).
Selectors in `search()` are intentionally permissive heuristics; update the
XPaths to match the real site HTML when you have an actual fixture.
"""
from __future__ import annotations
import re
import sys
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, quote_plus
from lxml import html as lxml_html
from API.HTTP import HTTPClient
from ProviderCore.base import Provider, SearchResult
from SYS.logger import log, debug
class Vimm(Provider):
    """Provider for vimm.net vault listings (skeleton).

    - Uses lxml for parsing
    - No authentication required

    The XPath selectors used while searching are permissive heuristics, not
    selectors verified against the live site; adjust them once a real HTML
    fixture is available.
    """

    URL = ("https://vimm.net/vault/",)
    URL_DOMAINS = ("vimm.net",)

    # Leading number followed by an optional unit. The optional "i" accepts
    # IEC spellings ("MiB") in addition to the plain ones ("MB").
    _SIZE_RE = re.compile(
        r"(?P<val>[\d\.]+)\s*(?P<unit>(?:[KMGT]i?)?B)?",
        flags=re.I,
    )

    # Every unit is treated as a binary multiple (1 KB == 1024 bytes).
    _SIZE_MULTIPLIERS = {
        "B": 1,
        "KB": 1024,
        "MB": 1024 ** 2,
        "GB": 1024 ** 3,
        "TB": 1024 ** 4,
    }

    # Candidate XPaths for result containers, tried in order; the first one
    # that yields any node wins (tweak to match the real DOM).
    _CONTAINER_XPATHS = (
        '//div[contains(@class,"list-item")]',
        '//div[contains(@class,"result")]',
        '//li[contains(@class,"item")]',
        '//tr[contains(@class,"result")]',
        '//article',
    )

    def validate(self) -> bool:
        """Report whether the provider is usable.

        Returns:
            Always ``True``: this provider has no required configuration.
            Consider adding more checks if that changes.
        """
        return True

    def _parse_size_bytes(self, size_str: str) -> Optional[int]:
        """Convert a human-readable size such as ``"1.5 MB"`` into bytes.

        Thousands separators (commas) are removed before parsing. Units are
        matched case-insensitively and interpreted as powers of 1024; a
        number without a unit is taken to be a byte count.

        Args:
            size_str: Text containing a size. The first number found is used.

        Returns:
            The size in bytes truncated to an integer, or ``None`` when the
            input is empty or contains no parseable number.
        """
        if not size_str:
            return None
        try:
            text = str(size_str).strip().replace(",", "")
            match = self._SIZE_RE.search(text)
            if match is None:
                return None
            value = float(match.group("val"))
            # Fold IEC spellings onto the table keys: "GIB" -> "GB".
            unit = (match.group("unit") or "B").upper().replace("I", "")
            return int(value * self._SIZE_MULTIPLIERS.get(unit, 1))
        except (ValueError, OverflowError, TypeError):
            # e.g. "1.2.3" or "." match the number pattern but are not floats.
            return None

    def _find_result_nodes(self, doc: Any) -> List[Any]:
        """Return the elements that look like individual search results.

        Tries each candidate container XPath in order and returns the nodes of
        the first one that matches anything. If none match, falls back to the
        direct ``div`` children of a list/results wrapper.

        Args:
            doc: Parsed lxml document for the listing page.

        Returns:
            A (possibly empty) list of lxml elements.
        """
        for xp in self._CONTAINER_XPATHS:
            try:
                found = doc.xpath(xp)
            except Exception:
                # Best-effort: a failing selector just means "try the next".
                continue
            if found:
                debug(f"[vimm] using xpath {xp} -> {len(found)} nodes")
                return found

        # Fallback: try generic children under a list area.
        try:
            return (
                doc.xpath('//div[contains(@id,"list")]/div')
                or doc.xpath('//div[contains(@class,"results")]/div')
                or []
            )
        except Exception:
            return []

    def _extract_title_and_href(self, node: Any) -> tuple:
        """Pick the most meaningful anchor inside *node*.

        Anchors are searched from most to least specific: a ``title``-classed
        anchor, an ``h2`` anchor, any anchor pointing into ``/vault/``, then
        any anchor at all. When no usable anchor text is found the node's own
        text content is used as the title.

        Args:
            node: lxml element for one search result.

        Returns:
            ``(title, href)`` where ``href`` may be ``None``.
        """
        title = None
        href = None
        try:
            anchors = (
                node.xpath('.//a[contains(@class,"title")]')
                or node.xpath('.//h2/a')
                or node.xpath('.//a[contains(@href,"/vault/")]')
                or node.xpath('.//a')
            )
            if anchors:
                first = anchors[0]
                title = first.text_content().strip()
                href = first.get('href')
        except Exception:
            # Discard partial results so title and href stay consistent.
            title = None
            href = None
        if not title:
            title = (node.text_content() or "").strip()
        return title, href

    def _first_text(self, node: Any, *xpaths: str) -> str:
        """Return the stripped first hit of the first XPath that matches.

        Args:
            node: lxml element to query.
            *xpaths: Text-selecting XPath expressions, tried in order.

        Returns:
            The first matching text, or ``""`` when nothing matches or a
            query fails (extraction of optional fields is best-effort).
        """
        try:
            for xp in xpaths:
                hits = node.xpath(xp)
                if hits:
                    return str(hits[0]).strip()
        except Exception:
            return ""
        return ""

    def _build_result(self, node: Any, base: str) -> SearchResult:
        """Build a `SearchResult` from one result element.

        Args:
            node: lxml element for one search result.
            base: URL against which relative links are resolved.

        Returns:
            The populated `SearchResult`; the node's serialized HTML is kept
            under ``full_metadata["raw"]``.
        """
        title, href = self._extract_title_and_href(node)
        path = urljoin(base, href) if href else ""

        # Extract size & platform heuristics.
        size_text = self._first_text(
            node,
            './/*[contains(@class,"size")]/text()',
            './/span[contains(text(),"MB") or contains(text(),"GB")]/text()',
        )
        platform = self._first_text(
            node,
            './/*[contains(@class,"platform")]/text()',
        )

        columns = []
        if platform:
            columns.append(("Platform", platform))
        if size_text:
            columns.append(("Size", size_text))

        return SearchResult(
            table="vimm",
            title=str(title or "").strip(),
            path=str(path or ""),
            detail="",
            annotations=[],
            media_kind="file",
            size_bytes=self._parse_size_bytes(size_text),
            tag={"vimm"},
            columns=columns,
            full_metadata={"raw": lxml_html.tostring(node, encoding="unicode")},
        )

    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """Search the vault listing and return up to *limit* results.

        Fetches the listing page for *query*, parses it with lxml and turns
        each result-like element into a `SearchResult`. Fetch and parse
        failures are logged to stderr and produce an empty list; a node that
        cannot be converted is skipped.

        Args:
            query: Search text. Blank or empty queries return no results.
            limit: Maximum number of results. Values ``<= 0`` return an empty
                list without performing any request.
            filters: Accepted for interface compatibility; currently unused.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            A list of at most *limit* `SearchResult` objects.

        Raises:
            TypeError, ValueError: If *limit* cannot be converted to ``int``.
        """
        q = (query or "").strip()
        if not q:
            return []

        max_results = int(limit)
        if max_results <= 0:
            # Nothing could be returned, so skip the network round-trip.
            return []

        # Build search/list URL
        base = "https://vimm.net/vault/"
        url = f"{base}?p=list&q={quote_plus(q)}"

        try:
            with HTTPClient(timeout=20.0) as client:
                resp = client.get(url)
                content = resp.content
        except Exception as exc:
            log(f"[vimm] HTTP fetch failed: {exc}", file=sys.stderr)
            return []

        try:
            doc = lxml_html.fromstring(content)
        except Exception as exc:
            log(f"[vimm] HTML parse failed: {exc}", file=sys.stderr)
            return []

        results: List[SearchResult] = []
        for node in self._find_result_nodes(doc)[:max_results]:
            try:
                results.append(self._build_result(node, base))
            except Exception as exc:
                # One malformed row must not discard the remaining results.
                debug(f"[vimm] skipping unparseable result node: {exc}")
                continue
        return results