hj
Some checks failed
smoke-mm / Install & smoke test mm --help (push) Has been cancelled

This commit is contained in:
2025-12-26 21:04:09 -08:00
parent 9310478a37
commit a595453a9b
7 changed files with 611 additions and 5 deletions

521
Provider/internetarchive.py Normal file
View File

@@ -0,0 +1,521 @@
from __future__ import annotations
import importlib
import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
from ProviderCore.base import Provider, SearchResult
from ProviderCore.download import sanitize_filename
from SYS.logger import log
def _ia() -> Any:
try:
return importlib.import_module("internetarchive")
except Exception as exc:
raise Exception(f"internetarchive module not available: {exc}")
def _pick_provider_config(config: Any) -> Dict[str, Any]:
if not isinstance(config, dict):
return {}
provider = config.get("provider")
if not isinstance(provider, dict):
return {}
entry = provider.get("internetarchive")
if isinstance(entry, dict):
return entry
return {}
def _looks_fielded_query(q: str) -> bool:
low = (q or "").lower()
return (":" in low) or (" and " in low) or (" or " in low) or (" not " in low) or ("(" in low)
def _extract_identifier_from_any(value: str) -> str:
raw = str(value or "").strip()
if not raw:
return ""
if raw.lower().startswith("ia:"):
return raw.split(":", 1)[1].strip()
if raw.startswith("http://") or raw.startswith("https://"):
try:
from urllib.parse import urlparse
p = urlparse(raw)
host = (p.hostname or "").lower().strip()
path = (p.path or "").strip("/")
except Exception:
return ""
if not host.endswith("archive.org"):
return ""
parts = [x for x in path.split("/") if x]
# /details/<identifier>
if len(parts) >= 2 and parts[0].lower() == "details":
return str(parts[1]).strip()
# /download/<identifier>/<filename>
if len(parts) >= 2 and parts[0].lower() == "download":
return str(parts[1]).strip()
return ""
# Assume bare identifier
return raw
def _extract_download_filename_from_url(url: str) -> str:
raw = str(url or "").strip()
if not raw:
return ""
if not (raw.startswith("http://") or raw.startswith("https://")):
return ""
try:
from urllib.parse import urlparse
p = urlparse(raw)
host = (p.hostname or "").lower().strip()
path = (p.path or "").strip("/")
except Exception:
return ""
if not host.endswith("archive.org"):
return ""
parts = [x for x in path.split("/") if x]
# /download/<identifier>/<filename>
if len(parts) >= 3 and parts[0].lower() == "download":
return str(parts[2]).strip()
return ""
def _normalize_identifier(s: str) -> str:
text = str(s or "").strip().lower()
if not text:
return ""
# Internet Archive identifiers are fairly permissive; keep alnum, '_', '-', '.' and collapse the rest.
text = re.sub(r"[^a-z0-9_.-]+", "-", text)
text = re.sub(r"-+", "-", text).strip("-._")
if len(text) > 80:
text = text[:80].rstrip("-._")
return text
def _best_file_candidate(files: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
if not files:
return None
def _is_metadata(f: Dict[str, Any]) -> bool:
source = str(f.get("source") or "").strip().lower()
fmt = str(f.get("format") or "").strip().lower()
if source == "metadata":
return True
if fmt in {"metadata", "archive bittorrent"}:
return True
if fmt.startswith("thumbnail"):
return True
return False
def _size(f: Dict[str, Any]) -> int:
try:
return int(f.get("size") or 0)
except Exception:
return 0
candidates = [f for f in files if not _is_metadata(f)]
if not candidates:
candidates = list(files)
# Prefer originals.
originals = [f for f in candidates if str(f.get("source") or "").strip().lower() == "original"]
pool = originals if originals else candidates
pool = [f for f in pool if str(f.get("name") or "").strip()]
if not pool:
return None
pool.sort(key=_size, reverse=True)
return pool[0]
class InternetArchive(Provider):
    """Internet Archive provider using the `internetarchive` Python module.

    Supports:
    - search-provider -provider internetarchive <query>
    - download-file / provider.download() from search results
    - add-file -provider internetarchive (uploads)

    Config (optional):
    [provider=internetarchive]
    access_key="..."   # optional (upload)
    secret_key="..."   # optional (upload)
    collection="..."   # optional (upload)
    mediatype="..."    # optional (upload)
    """
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        super().__init__(config)
        # Optional S3-style credentials and upload defaults from the
        # [provider=internetarchive] config section.
        conf = _pick_provider_config(self.config)
        self._access_key = conf.get("access_key")
        self._secret_key = conf.get("secret_key")
        self._collection = conf.get("collection") or conf.get("default_collection")
        self._mediatype = conf.get("mediatype") or conf.get("default_mediatype")
    def validate(self) -> bool:
        """Return True when the `internetarchive` module is importable."""
        try:
            _ia()
            return True
        except Exception:
            return False
    @staticmethod
    def _media_kind_from_mediatype(mediatype: str) -> str:
        """Map an Archive.org mediatype string to this app's media-kind label.

        Unknown or empty mediatypes fall back to "file".
        """
        mt = str(mediatype or "").strip().lower()
        if mt in {"texts"}:
            return "book"
        if mt in {"audio", "etree"}:
            return "audio"
        if mt in {"movies"}:
            return "video"
        if mt in {"image"}:
            return "image"
        return "file"
    def search(
        self,
        query: str,
        limit: int = 50,
        filters: Optional[Dict[str, Any]] = None,
        **_kwargs: Any,
    ) -> List[SearchResult]:
        """Search archive.org items and return up to *limit* SearchResults.

        A plain (non-fielded) query is wrapped as ``title:("...")``;
        queries that already look fielded are passed through unchanged.
        ``filters`` is accepted for interface compatibility but unused.

        Raises:
            Exception: when the library or its search API is unavailable,
            or the remote search fails.
        """
        ia = _ia()
        search_items = getattr(ia, "search_items", None)
        if not callable(search_items):
            raise Exception("internetarchive.search_items is not available")
        q = str(query or "").strip()
        if not q:
            return []
        # If the user supplied a plain string, default to title search.
        if not _looks_fielded_query(q) and q not in {"*", "*.*"}:
            q = f'title:("{q}")'
        fields = [
            "identifier",
            "title",
            "mediatype",
            "creator",
            "date",
            "downloads",
            "collection",
        ]
        try:
            search: Any = search_items(q, fields=fields)
        except Exception as exc:
            raise Exception(f"Internet Archive search failed: {exc}")
        out: List[SearchResult] = []
        # search_items returns a lazy iterator; stop pulling rows once the
        # limit is reached rather than materializing the whole result set.
        for row in search:
            if len(out) >= int(limit or 50):
                break
            if not isinstance(row, dict):
                continue
            identifier = str(row.get("identifier") or "").strip()
            if not identifier:
                continue
            title = str(row.get("title") or identifier).strip() or identifier
            mediatype = str(row.get("mediatype") or "").strip()
            # NOTE(review): IA can return `creator` as a list of names, in
            # which case str() renders the list literal — confirm against
            # live results and join if needed.
            creator = str(row.get("creator") or "").strip()
            date = str(row.get("date") or "").strip()
            annotations: List[str] = []
            if mediatype:
                annotations.append(mediatype)
            if date:
                annotations.append(date)
            if creator:
                annotations.append(creator)
            detail_parts: List[str] = []
            if creator:
                detail_parts.append(creator)
            if date:
                detail_parts.append(date)
            path = f"https://archive.org/details/{identifier}"
            sr = SearchResult(
                table="internetarchive",
                title=title,
                path=path,
                detail=" · ".join(detail_parts),
                annotations=annotations,
                media_kind=self._media_kind_from_mediatype(mediatype),
                size_bytes=None,
                tag=set(),
                columns=[
                    ("identifier", identifier),
                    ("mediatype", mediatype),
                    ("date", date),
                ],
                full_metadata=dict(row),
            )
            out.append(sr)
        return out
    def download_url(self, url: str, output_dir: Path) -> Optional[Path]:
        """Download an Internet Archive URL.

        Supports:
        - https://archive.org/details/<identifier>
        - https://archive.org/download/<identifier>/<filename>
        """
        # Wrap the raw URL in a minimal SearchResult and delegate to download().
        sr = SearchResult(table="internetarchive", title=str(url), path=str(url), full_metadata={})
        return self.download(sr, output_dir)
    def download(self, result: SearchResult, output_dir: Path) -> Optional[Path]:
        """Download one file from the item referenced by *result*.

        Returns the local Path of the downloaded file, or None when no
        identifier could be resolved or the file cannot be located on
        disk afterwards.

        Raises:
            Exception: when the library APIs are unavailable, the item
            lookup fails, the item has no downloadable files, or the
            download itself fails.
        """
        ia = _ia()
        get_item = getattr(ia, "get_item", None)
        download_fn = getattr(ia, "download", None)
        if not callable(get_item):
            raise Exception("internetarchive.get_item is not available")
        if not callable(download_fn):
            raise Exception("internetarchive.download is not available")
        identifier = _extract_identifier_from_any(str(getattr(result, "path", "") or ""))
        if not identifier:
            return None
        # A /download/<id>/<file> URL pins the exact file to fetch;
        # otherwise the best candidate is picked from the item listing.
        requested_filename = ""
        try:
            requested_filename = _extract_download_filename_from_url(str(result.path))
        except Exception:
            requested_filename = ""
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
        except Exception:
            pass
        try:
            item: Any = get_item(identifier)
        except Exception as exc:
            raise Exception(f"Internet Archive item lookup failed: {exc}")
        # Collect the item's file listing.  The attribute shape varies
        # across internetarchive versions: try `item.files` (list of
        # dicts) first, then fall back to `item.get_files()` objects.
        files: List[Dict[str, Any]] = []
        try:
            raw_files = getattr(item, "files", None)
            if isinstance(raw_files, list):
                for f in raw_files:
                    if isinstance(f, dict):
                        files.append(f)
        except Exception:
            files = []
        if not files:
            try:
                for f in item.get_files():
                    name = getattr(f, "name", None)
                    if not name and isinstance(f, dict):
                        name = f.get("name")
                    if not name:
                        continue
                    files.append({
                        "name": str(name),
                        "size": getattr(f, "size", None),
                        "format": getattr(f, "format", None),
                        "source": getattr(f, "source", None),
                    })
            except Exception:
                files = []
        chosen_name = ""
        if requested_filename:
            chosen_name = requested_filename
        else:
            chosen = _best_file_candidate(files)
            if chosen is not None:
                chosen_name = str(chosen.get("name") or "").strip()
        if not chosen_name:
            raise Exception("Internet Archive item has no downloadable files")
        # Download the selected file.
        try:
            download_fn(
                identifier,
                files=[chosen_name],
                destdir=str(output_dir),
                no_directory=True,
                ignore_existing=True,
                verbose=False,
            )
        except TypeError:
            # Older versions may not support some flags.
            download_fn(
                identifier,
                files=[chosen_name],
                destdir=str(output_dir),
            )
        except Exception as exc:
            raise Exception(f"Internet Archive download failed: {exc}")
        # Resolve downloaded path (library behavior varies by version/flags).
        candidates = [
            output_dir / chosen_name,
            output_dir / identifier / chosen_name,
        ]
        for p in candidates:
            try:
                if p.exists():
                    return p
            except Exception:
                continue
        # As a last resort, try to find by basename.
        try:
            for root in (output_dir, output_dir / identifier):
                if root.exists() and root.is_dir():
                    for child in root.iterdir():
                        if child.is_file() and child.name == chosen_name:
                            return child
        except Exception:
            pass
        return None
    def upload(self, file_path: str, **kwargs: Any) -> str:
        """Upload a file to Internet Archive.

        If a piped item includes a tag `ia:<identifier>`, uploads to that
        identifier.  Otherwise creates a new identifier derived from the
        filename/title and hash.  Returns the item URL.

        Raises:
            FileNotFoundError: when *file_path* does not exist.
            Exception: when the upload API is unavailable, no identifier
            can be derived, or the upload fails.
        """
        ia = _ia()
        upload_fn = getattr(ia, "upload", None)
        if not callable(upload_fn):
            raise Exception("internetarchive.upload is not available")
        p = Path(str(file_path))
        if not p.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Best-effort: pull title/hash/tags from the piped object, if any.
        pipe_obj = kwargs.get("pipe_obj")
        title = ""
        file_hash = ""
        tags: List[str] = []
        try:
            if pipe_obj is not None:
                title = str(getattr(pipe_obj, "title", "") or "").strip()
                file_hash = str(getattr(pipe_obj, "hash", "") or "").strip()
                tags_val = getattr(pipe_obj, "tag", None)
                if isinstance(tags_val, list):
                    tags = [str(t) for t in tags_val if t]
        except Exception:
            title = ""
            file_hash = ""
            tags = []
        # An explicit `ia:`/`internetarchive:` tag targets an existing item.
        identifier = ""
        for t in tags:
            low = str(t or "").strip()
            if low.lower().startswith("ia:"):
                identifier = low.split(":", 1)[1].strip()
                break
            if low.lower().startswith("internetarchive:"):
                identifier = low.split(":", 1)[1].strip()
                break
        if not identifier:
            # Derive a fresh identifier: slugified title/filename plus a
            # short hash suffix to reduce collision risk.
            base_title = title or p.stem
            slug = _normalize_identifier(base_title)
            suffix = ""
            if file_hash:
                suffix = str(file_hash)[:10]
            if slug and suffix:
                identifier = f"{slug}-{suffix}"
            elif slug:
                identifier = slug
            elif suffix:
                identifier = f"medeia-{suffix}"
            else:
                identifier = _normalize_identifier(p.stem) or "medeia-upload"
        # Normalize even tag-supplied identifiers so they are IA-safe.
        identifier = _normalize_identifier(identifier)
        if not identifier:
            raise Exception("Could not determine Internet Archive identifier")
        meta: Dict[str, Any] = {}
        if title:
            meta["title"] = title
        else:
            meta["title"] = p.stem
        if isinstance(self._collection, str) and self._collection.strip():
            meta["collection"] = self._collection.strip()
        if isinstance(self._mediatype, str) and self._mediatype.strip():
            meta["mediatype"] = self._mediatype.strip()
        # Build upload options; credentials are optional if the user has internetarchive configured globally.
        upload_kwargs: Dict[str, Any] = {"metadata": meta}
        ak = os.getenv("IA_ACCESS_KEY") or self._access_key
        sk = os.getenv("IA_SECRET_KEY") or self._secret_key
        if isinstance(ak, str) and ak.strip():
            upload_kwargs["access_key"] = ak.strip()
        if isinstance(sk, str) and sk.strip():
            upload_kwargs["secret_key"] = sk.strip()
        # Use a friendly uploaded filename.
        upload_name = sanitize_filename(p.name)
        files = {upload_name: str(p)}
        try:
            resp: Any = upload_fn(identifier, files=files, **upload_kwargs)
        except TypeError:
            # Older versions may require positional args.
            resp = upload_fn(identifier, files, meta)
        except Exception as exc:
            log(f"[internetarchive] Upload error: {exc}", file=sys.stderr)
            raise
        # Drain generator responses to catch failures.
        try:
            if resp is not None:
                for r in resp:
                    if isinstance(r, dict) and r.get("success") is False:
                        raise Exception(str(r.get("error") or r))
        except Exception as exc:
            raise Exception(f"Internet Archive upload failed: {exc}")
        item_url = f"https://archive.org/details/{identifier}"
        # Best-effort: record the item URL against the piped object's
        # store entry; failures here never fail the upload.
        try:
            if pipe_obj is not None:
                from Store import Store
                Store(self.config, suppress_debug=True).try_add_url_for_pipe_object(pipe_obj, item_url)
        except Exception:
            pass
        return item_url

View File

@@ -13,6 +13,74 @@ from urllib.parse import urlparse
from ProviderCore.base import Provider, SearchResult
# Matches Telethon's default timestamped media filename stems, e.g.
# "photo_2025-12-27_02-58-09" or "video_2025-01-02_03-04-05 (1)".
# Named groups: prefix (media kind), date, time.
_TELEGRAM_DEFAULT_TIMESTAMP_STEM_RE = re.compile(
    r"^(?P<prefix>photo|video|document|audio|voice|animation)_(?P<date>\d{4}-\d{2}-\d{2})_(?P<time>\d{2}-\d{2}-\d{2})(?: \(\d+\))?$",
    flags=re.IGNORECASE,
)
def _unique_path(path: Path) -> Path:
try:
if not path.exists():
return path
except Exception:
return path
stem = path.stem
suffix = path.suffix
parent = path.parent
for i in range(1, 10_000):
candidate = parent / f"{stem} ({i}){suffix}"
try:
if not candidate.exists():
return candidate
except Exception:
return candidate
return parent / f"{stem} (copy){suffix}"
def _maybe_strip_telegram_timestamped_default_filename(*, downloaded_path: Path) -> Path:
    """Normalize Telethon's default timestamped names.

    Examples:
        photo_2025-12-27_02-58-09.jpg -> photo.jpg

    Only files whose stem matches the default-timestamp pattern are
    renamed; a "(N)" counter is appended on collision.  On any failure
    the original path is returned untouched (best-effort).
    """
    try:
        stem = downloaded_path.stem
        suffix = downloaded_path.suffix
    except Exception:
        return downloaded_path
    if not suffix:
        return downloaded_path
    match = _TELEGRAM_DEFAULT_TIMESTAMP_STEM_RE.fullmatch(str(stem))
    if match is None:
        return downloaded_path
    kind = str(match.group("prefix") or "").strip().lower()
    if not kind:
        return downloaded_path
    renamed = downloaded_path.with_name(f"{kind}{suffix}")
    if renamed == downloaded_path:
        return downloaded_path
    target = _unique_path(renamed)
    try:
        if downloaded_path.exists():
            try:
                downloaded_path.rename(target)
                return target
            except Exception:
                # rename() can fail across devices; fall back to a move.
                shutil.move(str(downloaded_path), str(target))
                return target
    except Exception:
        return downloaded_path
    return downloaded_path
def _looks_like_telegram_message_url(url: str) -> bool:
try:
parsed = urlparse(str(url))
@@ -945,6 +1013,13 @@ class Telegram(Provider):
raise Exception("Telegram download returned no file")
downloaded_path = Path(str(downloaded))
# Telethon's default media filenames include timestamps (e.g. photo_YYYY-MM-DD_HH-MM-SS.jpg).
# Strip those timestamps ONLY when Telegram didn't provide an explicit filename.
if not file_name:
downloaded_path = _maybe_strip_telegram_timestamped_default_filename(
downloaded_path=downloaded_path,
)
date_iso = None
try:
if msg_date is not None and hasattr(msg_date, "isoformat"):

View File

@@ -24,6 +24,7 @@ from Provider.youtube import YouTube
from Provider.fileio import FileIO
from Provider.zeroxzero import ZeroXZero
from Provider.loc import LOC
from Provider.internetarchive import InternetArchive
_PROVIDERS: Dict[str, Type[Provider]] = {
@@ -31,6 +32,7 @@ _PROVIDERS: Dict[str, Type[Provider]] = {
"alldebrid": AllDebrid,
"libgen": Libgen,
"openlibrary": OpenLibrary,
"internetarchive": InternetArchive,
"soulseek": Soulseek,
"bandcamp": Bandcamp,
"youtube": YouTube,

View File

@@ -49,7 +49,7 @@ class Add_File(Cmdlet):
arg=[
SharedArgs.PATH,
SharedArgs.STORE,
CmdletArg(name="provider", type="string", required=False, description="File hosting provider (e.g., 0x0)", alias="prov"),
CmdletArg(name="provider", type="string", required=False, description="File hosting provider (e.g., 0x0, file.io, internetarchive)", alias="prov"),
CmdletArg(
name="room",
type="string",
@@ -66,6 +66,9 @@ class Add_File(Cmdlet):
" <path>: Copy file to specified directory",
"- File provider options (use -provider):",
" 0x0: Upload to 0x0.st for temporary hosting",
" file.io: Upload to file.io for temporary hosting",
" matrix: Upload to a Matrix room (requires Matrix config)",
" internetarchive: Upload to archive.org (optional tag: ia:<identifier> to upload into an existing item)",
],
exec=self.run,
)

View File

@@ -31,10 +31,10 @@ class Search_Provider(Cmdlet):
def __init__(self):
super().__init__(
name="search-provider",
summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid, loc)",
summary="Search external providers (bandcamp, libgen, soulseek, youtube, alldebrid, loc, internetarchive)",
usage="search-provider -provider <provider> <query> [-limit N] [-open ID]",
arg=[
CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid, loc"),
CmdletArg("provider", type="string", required=True, description="Provider name: bandcamp, libgen, soulseek, youtube, alldebrid, loc, internetarchive"),
CmdletArg("query", type="string", required=True, description="Search query (supports provider-specific syntax)"),
CmdletArg("limit", type="int", description="Maximum results to return (default: 50)"),
CmdletArg("open", type="int", description="(alldebrid) Open folder/magnet by ID and list its files"),
@@ -54,12 +54,15 @@ class Search_Provider(Cmdlet):
" Example: search-provider -provider soulseek \"pink floyd\"",
"- youtube: Search YouTube for videos",
" Example: search-provider -provider youtube \"tutorial\"",
"- internetarchive: Search archive.org items (advancedsearch syntax)",
" Example: search-provider -provider internetarchive \"title:(lincoln) AND mediatype:texts\"",
"",
"Query syntax:",
"- bandcamp: Use 'artist:Name' to search by artist",
"- libgen: Supports isbn:, author:, title: prefixes",
"- soulseek: Plain text search",
"- youtube: Plain text search",
"- internetarchive: Archive.org advancedsearch query syntax",
"",
"Results can be piped to other cmdlet:",
" search-provider -provider bandcamp \"artist:grace\" | @1 | download-data",

View File

@@ -5,7 +5,7 @@ Medios-Macina is a CLI media manager and toolkit focused on downloading, tagging
## Features
- **Flexible syntax structure:** chain commands with `|` and select options from tables with `@N`.
- **Multiple file stores:** *HYDRUSNETWORK, FOLDER*
- **Provider plugin integration:** *YOUTUBE, OPENLIBRARY/ARCHIVE.ORG, SOULSEEK, LIBGEN, ALLDEBRID, TELEGRAM, BANDCAMP*
- **Provider plugin integration:** *YOUTUBE, OPENLIBRARY, INTERNETARCHIVE, SOULSEEK, LIBGEN, ALLDEBRID, TELEGRAM, BANDCAMP*
- **Module Mixing:** *[Playwright](https://github.com/microsoft/playwright), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [aioslsk](https://github.com/JurgenR/aioslsk), [telethon](https://github.com/LonamiWebs/Telethon), [typer](https://github.com/fastapi/typer)*
- **MPV Manager:** Play audio, video, and even images in a custom-designed MPV with trimming, screenshotting, and more built right in!
@@ -77,7 +77,8 @@ search-store "ext:mp3"
- **HydrusNetwork**: use for database-backed media storage and advanced tagging (requires running Hydrus client/server).
- **Local folder**: copy files to a configured directory (fast and simple).
- **YouTube / yt-dlp**: robust media downloader for YouTube and many hosts.
- **OpenLibrary / Archive.org**: scripted metadata scraping and optional downloads.
- **OpenLibrary**: book metadata, borrowing, and Archive.org downloads.
- **Internet Archive**: search/download/upload via the `internetarchive` module.
- **Soulseek, LibGen, All-Debrid, Others**: provider support is modular—add or configure providers in `config.conf`.
## Troubleshooting & tips 🛠️

View File

@@ -9,6 +9,7 @@ yt-dlp[default]>=2023.11.0
requests>=2.31.0
httpx>=0.25.0
telethon>=1.36.0
internetarchive>=4.1.0
# Document and data handling
pypdf>=3.0.0