Files
Medios-Macina/Provider/telegram.py
2025-12-19 03:25:52 -08:00

330 lines
9.0 KiB
Python

from __future__ import annotations
import asyncio
import re
import sys
import time
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse
from ProviderCore.base import Provider, SearchResult
def _looks_like_telegram_message_url(url: str) -> bool:
try:
parsed = urlparse(str(url))
except Exception:
return False
host = (parsed.hostname or "").lower().strip()
if host in {"t.me", "telegram.me"}:
return True
if host.endswith(".t.me"):
return True
return False
def _parse_telegram_message_url(url: str) -> Tuple[str, int]:
"""Parse a Telegram message URL into (entity, message_id).
Supported:
- https://t.me/<username>/<msg_id>
- https://t.me/s/<username>/<msg_id>
- https://t.me/c/<internal_channel_id>/<msg_id>
"""
parsed = urlparse(str(url))
path = (parsed.path or "").strip("/")
if not path:
raise ValueError(f"Invalid Telegram URL: {url}")
parts = [p for p in path.split("/") if p]
if not parts:
raise ValueError(f"Invalid Telegram URL: {url}")
# Strip preview prefix
if parts and parts[0].lower() == "s":
parts = parts[1:]
if len(parts) < 2:
raise ValueError(f"Invalid Telegram URL (expected /<chat>/<msg>): {url}")
chat = parts[0]
msg_raw = parts[1]
# t.me/c/<id>/<msg>
if chat.lower() == "c":
if len(parts) < 3:
raise ValueError(f"Invalid Telegram /c/ URL: {url}")
chat = f"c:{parts[1]}"
msg_raw = parts[2]
m = re.fullmatch(r"\d+", str(msg_raw).strip())
if not m:
raise ValueError(f"Invalid Telegram message id in URL: {url}")
return str(chat), int(msg_raw)
class Telegram(Provider):
"""Telegram provider using Telethon.
Config:
[provider=telegram]
app_id=
api_hash=
"""
def __init__(self, config: Optional[Dict[str, Any]] = None):
super().__init__(config)
telegram_conf = self.config.get("provider", {}).get("telegram", {}) if isinstance(self.config, dict) else {}
self._app_id = telegram_conf.get("app_id")
self._api_hash = telegram_conf.get("api_hash")
def validate(self) -> bool:
try:
__import__("telethon")
except Exception:
return False
try:
app_id = int(self._app_id) if self._app_id not in (None, "") else None
except Exception:
app_id = None
api_hash = str(self._api_hash).strip() if self._api_hash not in (None, "") else ""
return bool(app_id and api_hash)
def _session_base_path(self) -> Path:
root = Path(__file__).resolve().parents[1]
session_dir = root / "Log" / "medeia_macina"
try:
session_dir.mkdir(parents=True, exist_ok=True)
except Exception:
pass
return session_dir / "telegram"
def _credentials(self) -> Tuple[int, str]:
raw_app_id = self._app_id
if raw_app_id in (None, ""):
raise Exception("Telegram app_id missing")
try:
app_id = int(str(raw_app_id).strip())
except Exception:
raise Exception("Telegram app_id invalid")
api_hash = str(self._api_hash or "").strip()
if not api_hash:
raise Exception("Telegram api_hash missing")
return app_id, api_hash
def _ensure_event_loop(self) -> None:
"""Telethon sync wrapper requires an event loop to exist in this thread."""
try:
asyncio.get_event_loop()
except RuntimeError:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
def _download_message_media_sync(self, *, url: str, output_dir: Path) -> Tuple[Path, Dict[str, Any]]:
try:
from telethon import errors
from telethon.sync import TelegramClient
from telethon.tl.types import PeerChannel
except Exception as exc:
raise Exception(f"Telethon not available: {exc}")
self._ensure_event_loop()
loop = asyncio.get_event_loop()
if getattr(loop, "is_running", lambda: False)():
raise Exception("Telegram provider cannot run while an event loop is already running")
def _resolve(value):
if asyncio.iscoroutine(value):
return loop.run_until_complete(value)
return value
app_id, api_hash = self._credentials()
session_base = self._session_base_path()
chat, message_id = _parse_telegram_message_url(url)
def _format_bytes(num: Optional[int]) -> str:
try:
if num is None:
return "?B"
n = float(num)
suffixes = ["B", "KB", "MB", "GB", "TB"]
for s in suffixes:
if n < 1024 or s == suffixes[-1]:
if s == "B":
return f"{int(n)}{s}"
return f"{n:.1f}{s}"
n /= 1024
except Exception:
return "?B"
client = TelegramClient(str(session_base), app_id, api_hash)
try:
# This prompts on first run for phone/code and persists the session.
_resolve(client.start())
if chat.startswith("c:"):
channel_id = int(chat.split(":", 1)[1])
entity = PeerChannel(channel_id)
else:
entity = chat
if isinstance(entity, str) and entity and not entity.startswith("@"):
entity = "@" + entity
# Use the list form to be robust across Telethon sync/async stubs.
messages = _resolve(client.get_messages(entity, ids=[message_id]))
message = None
if isinstance(messages, (list, tuple)):
message = messages[0] if messages else None
else:
try:
# TotalList is list-like
message = messages[0] # type: ignore[index]
except Exception:
message = None
if not message:
raise Exception("Telegram message not found")
if not getattr(message, "media", None):
raise Exception("Telegram message has no media")
chat_title = ""
chat_username = ""
chat_id = None
try:
chat_obj = getattr(message, "chat", None)
if chat_obj is not None:
maybe_title = getattr(chat_obj, "title", None)
maybe_username = getattr(chat_obj, "username", None)
maybe_id = getattr(chat_obj, "id", None)
if isinstance(maybe_title, str):
chat_title = maybe_title.strip()
if isinstance(maybe_username, str):
chat_username = maybe_username.strip()
if maybe_id is not None:
chat_id = int(maybe_id)
except Exception:
pass
caption = ""
try:
maybe_caption = getattr(message, "message", None)
if isinstance(maybe_caption, str):
caption = maybe_caption.strip()
except Exception:
pass
msg_id = None
msg_date = None
try:
msg_id = int(getattr(message, "id", 0) or 0)
except Exception:
msg_id = None
try:
msg_date = getattr(message, "date", None)
except Exception:
msg_date = None
file_name = ""
file_mime = ""
file_size = None
try:
file_obj = getattr(message, "file", None)
maybe_name = getattr(file_obj, "name", None)
maybe_mime = getattr(file_obj, "mime_type", None)
maybe_size = getattr(file_obj, "size", None)
if isinstance(maybe_name, str):
file_name = maybe_name.strip()
if isinstance(maybe_mime, str):
file_mime = maybe_mime.strip()
if maybe_size is not None:
file_size = int(maybe_size)
except Exception:
pass
# Progress callback: prints to stderr so it doesn't interfere with pipeline stdout.
last_print = {"t": 0.0}
def _progress(current: int, total: int) -> None:
try:
now = time.monotonic()
# Throttle to avoid spamming.
if now - float(last_print.get("t", 0.0)) < 0.25 and current < total:
return
last_print["t"] = now
pct = ""
try:
if total and total > 0:
pct = f" {min(100.0, (current / total) * 100.0):5.1f}%"
except Exception:
pct = ""
line = f"[telegram] Downloading{pct} ({_format_bytes(current)}/{_format_bytes(total)})"
sys.stderr.write("\r" + line)
sys.stderr.flush()
except Exception:
return
downloaded = _resolve(client.download_media(message, file=str(output_dir), progress_callback=_progress))
try:
sys.stderr.write("\n")
sys.stderr.flush()
except Exception:
pass
if not downloaded:
raise Exception("Telegram download returned no file")
downloaded_path = Path(str(downloaded))
date_iso = None
try:
if msg_date is not None and hasattr(msg_date, "isoformat"):
date_iso = msg_date.isoformat() # type: ignore[union-attr]
except Exception:
date_iso = None
info: Dict[str, Any] = {
"provider": "telegram",
"source_url": url,
"chat": {
"key": chat,
"title": chat_title,
"username": chat_username,
"id": chat_id,
},
"message": {
"id": msg_id,
"date": date_iso,
"caption": caption,
},
"file": {
"name": file_name,
"mime_type": file_mime,
"size": file_size,
"downloaded_path": str(downloaded_path),
},
}
return downloaded_path, info
except errors.RPCError as exc:
raise Exception(f"Telegram RPC error: {exc}")
finally:
try:
_resolve(client.disconnect())
except Exception:
pass
def download_url(self, url: str, output_dir: Path) -> Tuple[Path, Dict[str, Any]]:
"""Download a Telegram message URL and return (path, metadata)."""
if not _looks_like_telegram_message_url(url):
raise ValueError("Not a Telegram URL")
return self._download_message_media_sync(url=url, output_dir=output_dir)
def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
url = str(getattr(result, "path", "") or "")
if not url:
return None
if not _looks_like_telegram_message_url(url):
return None
path, _info = self._download_message_media_sync(url=url, output_dir=output_dir)
return path