2025-12-19 02:29:42 -08:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
import re
|
2025-12-19 03:25:52 -08:00
|
|
|
import sys
|
|
|
|
|
import time
|
2025-12-19 02:29:42 -08:00
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
|
|
from ProviderCore.base import Provider, SearchResult
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _looks_like_telegram_message_url(url: str) -> bool:
|
|
|
|
|
try:
|
|
|
|
|
parsed = urlparse(str(url))
|
|
|
|
|
except Exception:
|
|
|
|
|
return False
|
|
|
|
|
host = (parsed.hostname or "").lower().strip()
|
|
|
|
|
if host in {"t.me", "telegram.me"}:
|
|
|
|
|
return True
|
|
|
|
|
if host.endswith(".t.me"):
|
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_telegram_message_url(url: str) -> Tuple[str, int]:
|
|
|
|
|
"""Parse a Telegram message URL into (entity, message_id).
|
|
|
|
|
|
|
|
|
|
Supported:
|
|
|
|
|
- https://t.me/<username>/<msg_id>
|
|
|
|
|
- https://t.me/s/<username>/<msg_id>
|
|
|
|
|
- https://t.me/c/<internal_channel_id>/<msg_id>
|
|
|
|
|
"""
|
|
|
|
|
parsed = urlparse(str(url))
|
|
|
|
|
path = (parsed.path or "").strip("/")
|
|
|
|
|
if not path:
|
|
|
|
|
raise ValueError(f"Invalid Telegram URL: {url}")
|
|
|
|
|
|
|
|
|
|
parts = [p for p in path.split("/") if p]
|
|
|
|
|
if not parts:
|
|
|
|
|
raise ValueError(f"Invalid Telegram URL: {url}")
|
|
|
|
|
|
|
|
|
|
# Strip preview prefix
|
|
|
|
|
if parts and parts[0].lower() == "s":
|
|
|
|
|
parts = parts[1:]
|
|
|
|
|
|
|
|
|
|
if len(parts) < 2:
|
|
|
|
|
raise ValueError(f"Invalid Telegram URL (expected /<chat>/<msg>): {url}")
|
|
|
|
|
|
|
|
|
|
chat = parts[0]
|
|
|
|
|
msg_raw = parts[1]
|
|
|
|
|
|
|
|
|
|
# t.me/c/<id>/<msg>
|
|
|
|
|
if chat.lower() == "c":
|
|
|
|
|
if len(parts) < 3:
|
|
|
|
|
raise ValueError(f"Invalid Telegram /c/ URL: {url}")
|
|
|
|
|
chat = f"c:{parts[1]}"
|
|
|
|
|
msg_raw = parts[2]
|
|
|
|
|
|
|
|
|
|
m = re.fullmatch(r"\d+", str(msg_raw).strip())
|
|
|
|
|
if not m:
|
|
|
|
|
raise ValueError(f"Invalid Telegram message id in URL: {url}")
|
|
|
|
|
|
|
|
|
|
return str(chat), int(msg_raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Telegram(Provider):
    """Telegram provider using Telethon.

    Config:
      [provider=telegram]
      app_id=
      api_hash=
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Read ``app_id`` / ``api_hash`` from ``config["provider"]["telegram"]``.

        Missing, ``None`` or non-mapping sections are treated as empty so that
        construction never fails; ``validate()`` reports whether the
        credentials are usable.
        """
        super().__init__(config)
        telegram_conf: Any = None
        if isinstance(self.config, dict):
            provider_conf = self.config.get("provider")
            if hasattr(provider_conf, "get"):
                telegram_conf = provider_conf.get("telegram")
        if not hasattr(telegram_conf, "get"):
            telegram_conf = {}
        self._app_id = telegram_conf.get("app_id")
        self._api_hash = telegram_conf.get("api_hash")

    def validate(self) -> bool:
        """Return True when Telethon is importable and credentials are usable.

        Credentials are checked through ``_credentials()`` so that this method
        and the download path always agree on what counts as valid.
        """
        try:
            __import__("telethon")
        except Exception:
            return False

        try:
            app_id, api_hash = self._credentials()
        except Exception:
            return False
        return bool(app_id and api_hash)

    def _session_base_path(self) -> Path:
        """Return the session base path passed to ``TelegramClient``.

        The path is ``<root>/Log/medeia_macina/telegram`` where ``<root>`` is
        ``parents[1]`` of this file's resolved location. Creating the
        directory is best-effort; a failure is ignored here.
        """
        root = Path(__file__).resolve().parents[1]
        session_dir = root / "Log" / "medeia_macina"
        try:
            session_dir.mkdir(parents=True, exist_ok=True)
        except Exception:
            # Best-effort: a real problem will surface when the client starts.
            pass
        return session_dir / "telegram"

    def _credentials(self) -> Tuple[int, str]:
        """Return ``(app_id, api_hash)`` parsed from the configuration.

        Raises:
            Exception: if ``app_id`` is missing or not an integer string, or
                if ``api_hash`` is missing/blank.
        """
        raw_app_id = self._app_id
        if raw_app_id in (None, ""):
            raise Exception("Telegram app_id missing")
        try:
            app_id = int(str(raw_app_id).strip())
        except Exception as exc:
            raise Exception("Telegram app_id invalid") from exc

        api_hash = str(self._api_hash or "").strip()
        if not api_hash:
            raise Exception("Telegram api_hash missing")
        return app_id, api_hash

    def _ensure_event_loop(self) -> None:
        """Telethon sync wrapper requires an event loop to exist in this thread.

        A new loop is installed when the thread has none or when the current
        one has already been closed (a closed loop cannot run coroutines).
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = None
        if loop is None or loop.is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())

    @staticmethod
    def _format_bytes(num: Optional[int]) -> str:
        """Format a byte count as ``B``/``KB``/``MB``/``GB``/``TB`` (1024 steps).

        Returns ``"?B"`` for ``None`` or for a value that cannot be converted.
        """
        try:
            if num is None:
                return "?B"
            n = float(num)
            suffixes = ["B", "KB", "MB", "GB", "TB"]
            for s in suffixes:
                if n < 1024 or s == suffixes[-1]:
                    if s == "B":
                        return f"{int(n)}{s}"
                    return f"{n:.1f}{s}"
                n /= 1024
        except Exception:
            pass
        return "?B"

    @staticmethod
    def _chat_details(message: Any) -> Tuple[str, str, Optional[int]]:
        """Best-effort ``(title, username, id)`` read from ``message.chat``."""
        title = ""
        username = ""
        chat_id: Optional[int] = None
        try:
            chat_obj = getattr(message, "chat", None)
            if chat_obj is not None:
                maybe_title = getattr(chat_obj, "title", None)
                maybe_username = getattr(chat_obj, "username", None)
                maybe_id = getattr(chat_obj, "id", None)
                if isinstance(maybe_title, str):
                    title = maybe_title.strip()
                if isinstance(maybe_username, str):
                    username = maybe_username.strip()
                if maybe_id is not None:
                    chat_id = int(maybe_id)
        except Exception:
            # Metadata only: keep whatever was collected so far.
            pass
        return title, username, chat_id

    @staticmethod
    def _file_details(message: Any) -> Tuple[str, str, Optional[int]]:
        """Best-effort ``(name, mime_type, size)`` read from ``message.file``."""
        name = ""
        mime = ""
        size: Optional[int] = None
        try:
            file_obj = getattr(message, "file", None)
            maybe_name = getattr(file_obj, "name", None)
            maybe_mime = getattr(file_obj, "mime_type", None)
            maybe_size = getattr(file_obj, "size", None)
            if isinstance(maybe_name, str):
                name = maybe_name.strip()
            if isinstance(maybe_mime, str):
                mime = maybe_mime.strip()
            if maybe_size is not None:
                size = int(maybe_size)
        except Exception:
            # Metadata only: keep whatever was collected so far.
            pass
        return name, mime, size

    @staticmethod
    def _message_details(message: Any) -> Tuple[Optional[int], Optional[str], str]:
        """Best-effort ``(id, ISO date, caption)`` read from the message."""
        caption = ""
        try:
            maybe_caption = getattr(message, "message", None)
            if isinstance(maybe_caption, str):
                caption = maybe_caption.strip()
        except Exception:
            pass

        try:
            msg_id: Optional[int] = int(getattr(message, "id", 0) or 0)
        except Exception:
            msg_id = None

        date_iso: Optional[str] = None
        try:
            msg_date = getattr(message, "date", None)
            if msg_date is not None and hasattr(msg_date, "isoformat"):
                date_iso = msg_date.isoformat()
        except Exception:
            date_iso = None
        return msg_id, date_iso, caption

    def _download_message_media_sync(self, *, url: str, output_dir: Path) -> Tuple[Path, Dict[str, Any]]:
        """Download the media attached to the message at *url*.

        Args:
            url: Telegram message URL, parsed by ``_parse_telegram_message_url``.
            output_dir: Directory to download into; created if it does not exist.

        Returns:
            ``(downloaded_path, info)`` where ``info`` describes the provider,
            source URL, chat, message and file.

        Raises:
            ValueError: if the URL cannot be parsed.
            Exception: if Telethon is unavailable, an event loop is already
                running, credentials are missing, the message or its media
                is missing, the download yields nothing, or Telegram reports
                an RPC error.
        """
        try:
            from telethon import errors
            from telethon.sync import TelegramClient
            from telethon.tl.types import PeerChannel
        except Exception as exc:
            raise Exception(f"Telethon not available: {exc}") from exc

        self._ensure_event_loop()
        loop = asyncio.get_event_loop()
        if getattr(loop, "is_running", lambda: False)():
            raise Exception("Telegram provider cannot run while an event loop is already running")

        def _resolve(value):
            # Client calls may hand back either a plain result or a coroutine;
            # drive coroutines to completion on this thread's loop.
            if asyncio.iscoroutine(value):
                return loop.run_until_complete(value)
            return value

        app_id, api_hash = self._credentials()
        session_base = self._session_base_path()
        chat, message_id = _parse_telegram_message_url(url)

        client = TelegramClient(str(session_base), app_id, api_hash)
        try:
            # This prompts on first run for phone/code and persists the session.
            _resolve(client.start())

            if chat.startswith("c:"):
                try:
                    channel_id = int(chat.split(":", 1)[1])
                except ValueError as exc:
                    raise ValueError(f"Invalid Telegram channel id in URL: {url}") from exc
                entity: Any = PeerChannel(channel_id)
            else:
                entity = chat
                if isinstance(entity, str) and entity and not entity.startswith("@"):
                    entity = "@" + entity

            # Use the list form to be robust across Telethon sync/async stubs.
            messages = _resolve(client.get_messages(entity, ids=[message_id]))
            if isinstance(messages, (list, tuple)):
                message = messages[0] if messages else None
            else:
                try:
                    # TotalList is list-like
                    message = messages[0]  # type: ignore[index]
                except Exception:
                    message = None
            if not message:
                raise Exception("Telegram message not found")
            if not getattr(message, "media", None):
                raise Exception("Telegram message has no media")

            chat_title, chat_username, chat_id = self._chat_details(message)
            msg_id, date_iso, caption = self._message_details(message)
            file_name, file_mime, file_size = self._file_details(message)

            # Telethon treats a non-existent path as a file name rather than
            # a directory, so make sure the target directory exists first.
            target_dir = Path(output_dir)
            if not target_dir.exists():
                try:
                    target_dir.mkdir(parents=True, exist_ok=True)
                except OSError as exc:
                    raise Exception(f"Cannot create output directory {target_dir}: {exc}") from exc

            # Progress callback: prints to stderr so it doesn't interfere with pipeline stdout.
            # "t" starts at -inf so the very first update is never throttled.
            progress_state: Dict[str, Any] = {"t": float("-inf"), "shown": False}

            def _progress(current: int, total: int) -> None:
                try:
                    now = time.monotonic()
                    # Throttle to avoid spamming.
                    if now - float(progress_state["t"]) < 0.25 and current < total:
                        return
                    progress_state["t"] = now

                    pct = ""
                    try:
                        if total and total > 0:
                            pct = f" {min(100.0, (current / total) * 100.0):5.1f}%"
                    except Exception:
                        pct = ""

                    line = (
                        f"[telegram] Downloading{pct} "
                        f"({self._format_bytes(current)}/{self._format_bytes(total)})"
                    )
                    sys.stderr.write("\r" + line)
                    sys.stderr.flush()
                    progress_state["shown"] = True
                except Exception:
                    # Progress output must never break the download.
                    return

            try:
                downloaded = _resolve(
                    client.download_media(message, file=str(output_dir), progress_callback=_progress)
                )
            finally:
                # Terminate the "\r" progress line on success and on failure,
                # so later output does not overwrite or run into it.
                if progress_state["shown"]:
                    try:
                        sys.stderr.write("\n")
                        sys.stderr.flush()
                    except Exception:
                        pass

            if not downloaded:
                raise Exception("Telegram download returned no file")
            downloaded_path = Path(str(downloaded))

            info: Dict[str, Any] = {
                "provider": "telegram",
                "source_url": url,
                "chat": {
                    "key": chat,
                    "title": chat_title,
                    "username": chat_username,
                    "id": chat_id,
                },
                "message": {
                    "id": msg_id,
                    "date": date_iso,
                    "caption": caption,
                },
                "file": {
                    "name": file_name,
                    "mime_type": file_mime,
                    "size": file_size,
                    "downloaded_path": str(downloaded_path),
                },
            }
            return downloaded_path, info
        except errors.RPCError as exc:
            raise Exception(f"Telegram RPC error: {exc}") from exc
        finally:
            try:
                _resolve(client.disconnect())
            except Exception:
                # Disconnect is best-effort; never mask the primary error.
                pass

    def download_url(self, url: str, output_dir: Path) -> Tuple[Path, Dict[str, Any]]:
        """Download a Telegram message URL and return (path, metadata).

        Raises:
            ValueError: if *url* is not on a recognised Telegram host.
        """
        if not _looks_like_telegram_message_url(url):
            raise ValueError("Not a Telegram URL")
        return self._download_message_media_sync(url=url, output_dir=output_dir)

    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
        """Download the media referenced by ``result.path``.

        Returns the downloaded path, or ``None`` when the result has no path
        or the path is not a Telegram URL. Download errors propagate.
        """
        url = str(getattr(result, "path", "") or "")
        if not url:
            return None
        if not _looks_like_telegram_message_url(url):
            return None

        path, _info = self._download_message_media_sync(url=url, output_dir=output_dir)
        return path
|