From edc33f452887033baf5f415e7ba1e90d71df3bf9 Mon Sep 17 00:00:00 2001 From: Nose Date: Tue, 6 Jan 2026 16:19:29 -0800 Subject: [PATCH] ssd --- .gitignore | 3 +- API/HTTP.py | 5 + API/data/alldebrid.json | 224 +++++-------- Provider/metadata_provider.py | 522 +++++++++++++++++++++++++++++ Provider/openlibrary.py | 497 ++++------------------------ SYS/json_table.py | 110 +++++++ SYS/result_table.py | 72 ++-- SYS/result_table_adapters.py | 11 +- cmdlet/download_file.py | 601 ++++++++++++++++++++-------------- cmdlet/get_tag.py | 28 +- 10 files changed, 1192 insertions(+), 881 deletions(-) create mode 100644 SYS/json_table.py diff --git a/.gitignore b/.gitignore index d77c823..21ca1ce 100644 --- a/.gitignore +++ b/.gitignore @@ -236,5 +236,4 @@ scripts/mm.ps1 scripts/mm .style.yapf .yapfignore - - +tmp_* \ No newline at end of file diff --git a/API/HTTP.py b/API/HTTP.py index b95e44c..becbf47 100644 --- a/API/HTTP.py +++ b/API/HTTP.py @@ -144,6 +144,11 @@ def _resolve_verify_value(verify_ssl: bool) -> Union[bool, str]: return True + +def get_requests_verify_value(verify_ssl: bool = True) -> Union[bool, str]: + """Expose the verified value for reuse outside of HTTPClient (requests sessions).""" + return _resolve_verify_value(verify_ssl) + # Default configuration DEFAULT_TIMEOUT = 30.0 DEFAULT_RETRIES = 3 diff --git a/API/data/alldebrid.json b/API/data/alldebrid.json index 36560d5..7194bbf 100644 --- a/API/data/alldebrid.json +++ b/API/data/alldebrid.json @@ -92,7 +92,7 @@ "(hitfile\\.net/[a-z0-9A-Z]{4,9})" ], "regexp": "(hitf\\.(to|cc)/([a-z0-9A-Z]{4,9}))|(htfl\\.(net|to|cc)/([a-z0-9A-Z]{4,9}))|(hitfile\\.(net)/download/free/([a-z0-9A-Z]{4,9}))|((hitfile\\.net/[a-z0-9A-Z]{4,9}))", - "status": true + "status": false }, "mega": { "name": "mega", @@ -389,7 +389,7 @@ "(filespace\\.com/[a-zA-Z0-9]{12})" ], "regexp": "(filespace\\.com/fd/([a-zA-Z0-9]{12}))|((filespace\\.com/[a-zA-Z0-9]{12}))", - "status": true + "status": false }, "filezip": { "name": "filezip", @@ -412,7 +412,7 @@ "(gigapeta\\.com/dl/[0-9a-zA-Z]{13,15})" ], "regexp": "(gigapeta\\.com/dl/[0-9a-zA-Z]{13,15})", - "status": false + "status": true }, "google": { "name": "google", @@ -507,7 +507,7 @@ "mediafire\\.com/(\\?|download/|file/|download\\.php\\?)([0-9a-z]{15})" ], "regexp": "mediafire\\.com/(\\?|download/|file/|download\\.php\\?)([0-9a-z]{15})", - "status": false + "status": true }, "mexashare": { "name": "mexashare", @@ -1650,18 +1650,6 @@ ], "regexp": "https?://music\\.apple\\.com/[\\w-]+/post/(\\d+)" }, - "appledaily": { - "name": "appledaily", - "type": "free", - "domains": [ - "ent.appledaily.com.tw", - "appledaily.com.tw" - ], - "regexps": [ - "https?://(www|ent)\\.appledaily\\.com\\.tw/[^/]+/[^/]+/[^/]+/(\\d+)/(\\d+)(/.*)?" - ], - "regexp": "https?://(www|ent)\\.appledaily\\.com\\.tw/[^/]+/[^/]+/[^/]+/(\\d+)/(\\d+)(/.*)?" 
- }, "applepodcasts": { "name": "applepodcasts", "type": "free", @@ -2058,9 +2046,9 @@ "https?://(?:([^.]+)\\.)?bandcamp\\.com/album/([^/?#&]+)", "https?://([^/]+)\\.bandcamp\\.com/track/([^/?#&]+)", "https?://(?!www\\.)([^.]+)\\.bandcamp\\.com(?:/music)?/?(?:[#?]|$)", - "https?://(?:www\\.)?bandcamp\\.com/?\\?(?:.*?&)?show=(\\d+)" + "https?://(?:www\\.)?bandcamp\\.com/radio/?\\?(?:[^#]+&)?show=(\\d+)" ], - "regexp": "(https?://(?:([^.]+)\\.)?bandcamp\\.com/album/([^/?#&]+))|(https?://([^/]+)\\.bandcamp\\.com/track/([^/?#&]+))|(https?://(?!www\\.)([^.]+)\\.bandcamp\\.com(?:/music)?/?(?:[#?]|$))|(https?://(?:www\\.)?bandcamp\\.com/?\\?(?:.*?&)?show=(\\d+))" + "regexp": "(https?://(?:([^.]+)\\.)?bandcamp\\.com/album/([^/?#&]+))|(https?://([^/]+)\\.bandcamp\\.com/track/([^/?#&]+))|(https?://(?!www\\.)([^.]+)\\.bandcamp\\.com(?:/music)?/?(?:[#?]|$))|(https?://(?:www\\.)?bandcamp\\.com/radio/?\\?(?:[^#]+&)?show=(\\d+))" }, "bandlab": { "name": "bandlab", @@ -3296,23 +3284,25 @@ "type": "free", "domains": [ "cda.pl", - "ebd.cda.pl" + "ebd.cda.pl", + "m.cda.pl" ], "regexps": [ - "https?://(?:(?:www\\.)?cda\\.pl/video|ebd\\.cda\\.pl/[0-9]+x[0-9]+)/([0-9a-z]+)" + "https?://(?:(?:(?:www|m)\\.)?cda\\.pl/video|ebd\\.cda\\.pl/[0-9]+x[0-9]+)/([0-9a-z]+)" ], - "regexp": "https?://(?:(?:www\\.)?cda\\.pl/video|ebd\\.cda\\.pl/[0-9]+x[0-9]+)/([0-9a-z]+)" + "regexp": "https?://(?:(?:(?:www|m)\\.)?cda\\.pl/video|ebd\\.cda\\.pl/[0-9]+x[0-9]+)/([0-9a-z]+)" }, "cdafolder": { "name": "cdafolder", "type": "free", "domains": [ - "cda.pl" + "cda.pl", + "m.cda.pl" ], "regexps": [ - "https?://(?:www\\.)?cda\\.pl/([\\w-]+)/folder/(\\d+)" + "https?://(?:(?:www|m)\\.)?cda\\.pl/([\\w-]+)/folder/(\\d+)" ], - "regexp": "https?://(?:www\\.)?cda\\.pl/([\\w-]+)/folder/(\\d+)" + "regexp": "https?://(?:(?:www|m)\\.)?cda\\.pl/([\\w-]+)/folder/(\\d+)" }, "cellebrite": { "name": "cellebrite", @@ -3768,6 +3758,17 @@ ], "regexp": "https?://www\\.craftsy\\.com/class/([\\w-]+)" }, + "croatian.film": { + "name": "croatian.film", + "type": "free", + "domains": [ + "croatian.film" + ], + "regexps": [ + "https://?(?:www\\.)?croatian\\.film/[a-z]{2}/[^/?#]+/(\\d+)" + ], + "regexp": "https://?(?:www\\.)?croatian\\.film/[a-z]{2}/[^/?#]+/(\\d+)" + }, "crooksandliars": { "name": "crooksandliars", "type": "free", @@ -4379,9 +4380,9 @@ "dropbox.com" ], "regexps": [ - "https?://(?:www\\.)?dropbox\\.com/(?:(?:e/)?scl/fi|sh?)/(\\w+)" + "https?://(?:www\\.)?dropbox\\.com/(?:(?:e/)?scl/f[io]|sh?)/(\\w+)" ], - "regexp": "https?://(?:www\\.)?dropbox\\.com/(?:(?:e/)?scl/fi|sh?)/(\\w+)" + "regexp": "https?://(?:www\\.)?dropbox\\.com/(?:(?:e/)?scl/f[io]|sh?)/(\\w+)" }, "dropout": { "name": "dropout", @@ -5088,6 +5089,17 @@ ], "regexp": "https?://www\\.fifa\\.com/fifaplus/\\w{2}/watch/([^#?]+/)?(\\w+)" }, + "filmarchiv": { + "name": "filmarchiv", + "type": "free", + "domains": [ + "filmarchiv.at" + ], + "regexps": [ + "https?://(?:www\\.)?filmarchiv\\.at/de/filmarchiv-on/video/(f_[0-9a-zA-Z]{5,})" + ], + "regexp": "https?://(?:www\\.)?filmarchiv\\.at/de/filmarchiv-on/video/(f_[0-9a-zA-Z]{5,})" + }, "filmon": { "name": "filmon", "type": "free", @@ -7954,37 +7966,6 @@ ], "regexp": "https?://(?:w(?:ww)?\\.)?mgtv\\.com/[bv]/(?:[^/]+/)*(\\d+)\\.html" }, - "manototv": { - "name": "manototv", - "type": "free", - "domains": [ - "manototv.com" - ], - "regexps": [ - "https?://(?:www\\.)?manototv\\.com/episode/([0-9]+)" - ], - "regexp": "https?://(?:www\\.)?manototv\\.com/episode/([0-9]+)" - }, - "manototvlive": { - "name": "manototvlive", - "type": "free", - 
"domains": [], - "regexps": [ - "https?://(?:www\\.)?manototv\\.com/live/" - ], - "regexp": "https?://(?:www\\.)?manototv\\.com/live/" - }, - "manototvshow": { - "name": "manototvshow", - "type": "free", - "domains": [ - "manototv.com" - ], - "regexps": [ - "https?://(?:www\\.)?manototv\\.com/show/([0-9]+)" - ], - "regexp": "https?://(?:www\\.)?manototv\\.com/show/([0-9]+)" - }, "manyvids": { "name": "manyvids", "type": "free", @@ -9321,9 +9302,10 @@ "https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!myshows|library|videos)([\\w-]+)/?(?:$|[?#])", "https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!(?:myshows|library|videos)/)([\\w-]+)/([\\w-]+)/?(?:$|[?#])", "https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/videos/([\\w-]+)", + "https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/([\\w-]+)/season/([\\w-]+)", "https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(myshows|library/latest-videos)/?(?:$|[?#])" ], - "regexp": "(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!myshows|library|videos)([\\w-]+)/?(?:$|[?#]))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!(?:myshows|library|videos)/)([\\w-]+)/([\\w-]+)/?(?:$|[?#]))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/videos/([\\w-]+))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(myshows|library/latest-videos)/?(?:$|[?#]))" + "regexp": "(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!myshows|library|videos)([\\w-]+)/?(?:$|[?#]))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(?!(?:myshows|library|videos)/)([\\w-]+)/([\\w-]+)/?(?:$|[?#]))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/videos/([\\w-]+))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/([\\w-]+)/season/([\\w-]+))|(https?://(?:www\\.|beta\\.)?(?:watchnebula\\.com|nebula\\.app|nebula\\.tv)/(myshows|library/latest-videos)/?(?:$|[?#]))" }, "nekohacker": { "name": "nekohacker", @@ -9402,10 +9384,10 @@ "https?://(?:y\\.)?music\\.163\\.com/(?:[#m]/)?song\\?.*?\\bid=([0-9]+)", "https?://music\\.163\\.com/(?:#/)?(?:playlist|discover/toplist)\\?id=([0-9]+)", "https?://music\\.163\\.com/(?:#/)?mv\\?id=([0-9]+)", - "https?://music\\.163\\.com/(?:#/)?program\\?id=([0-9]+)", + "https?://music\\.163\\.com/(?:#/)?(?:dj|program)\\?id=([0-9]+)", "https?://music\\.163\\.com/(?:#/)?artist\\?id=([0-9]+)" ], - "regexp": "(https?://music\\.163\\.com/(?:#/)?album\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?djradio\\?id=([0-9]+))|(https?://(?:y\\.)?music\\.163\\.com/(?:[#m]/)?song\\?.*?\\bid=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?(?:playlist|discover/toplist)\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?mv\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?program\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?artist\\?id=([0-9]+))" + "regexp": "(https?://music\\.163\\.com/(?:#/)?album\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?djradio\\?id=([0-9]+))|(https?://(?:y\\.)?music\\.163\\.com/(?:[#m]/)?song\\?.*?\\bid=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?(?:playlist|discover/toplist)\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?mv\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?(?:dj|program)\\?id=([0-9]+))|(https?://music\\.163\\.com/(?:#/)?artist\\?id=([0-9]+))" }, "netplustv": { "name": "netplustv", @@ 
-9478,9 +9460,9 @@ "netzkino.de" ], "regexps": [ - "https?://(?:www\\.)?netzkino\\.de/\\#!/[^/]+/([^/]+)" + "https?://(?:www\\.)?netzkino\\.de/details/([^/?#]+)" ], - "regexp": "https?://(?:www\\.)?netzkino\\.de/\\#!/[^/]+/([^/]+)" + "regexp": "https?://(?:www\\.)?netzkino\\.de/details/([^/?#]+)" }, "newgrounds": { "name": "newgrounds", @@ -9519,37 +9501,6 @@ ], "regexp": "https?://(?:www\\.)?newsy\\.com/stories/([^/?#$&]+)" }, - "nextmedia": { - "name": "nextmedia", - "type": "free", - "domains": [ - "hk.apple.nextmedia.com" - ], - "regexps": [ - "https?://hk\\.apple\\.nextmedia\\.com/[^/]+/[^/]+/(\\d+)/(\\d+)" - ], - "regexp": "https?://hk\\.apple\\.nextmedia\\.com/[^/]+/[^/]+/(\\d+)/(\\d+)" - }, - "nextmediaactionnews": { - "name": "nextmediaactionnews", - "type": "free", - "domains": [ - "hk.dv.nextmedia.com" - ], - "regexps": [ - "https?://hk\\.dv\\.nextmedia\\.com/actionnews/[^/]+/(\\d+)/(\\d+)/\\d+" - ], - "regexp": "https?://hk\\.dv\\.nextmedia\\.com/actionnews/[^/]+/(\\d+)/(\\d+)/\\d+" - }, - "nexttv": { - "name": "nexttv", - "type": "free", - "domains": [], - "regexps": [ - "https?://(?:www\\.)?nexttv\\.com\\.tw/(?:[^/]+/)+(\\d+)" - ], - "regexp": "https?://(?:www\\.)?nexttv\\.com\\.tw/(?:[^/]+/)+(\\d+)" - }, "nexx": { "name": "nexx", "type": "free", @@ -9809,7 +9760,7 @@ "name": "nitter", "type": "free", "domains": [ - "nitter.priv.pw" + "nitter.projectsegfau.lt" ], "regexps": [ "https?://(?:3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad\\.onion|nitter\\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\\.onion|nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd\\.onion|npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid\\.onion|nitter\\.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd\\.onion|i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad\\.onion|26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid\\.onion|vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad\\.onion|iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd\\.onion|erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad\\.onion|ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd\\.onion|jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid\\.onion|nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad\\.onion|nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd\\.onion|nitter\\.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd\\.onion|ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad\\.onion|ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd\\.onion|nitter\\.i2p|u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa\\.b32\\.i2p|nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd\\.onion|nitter\\.lacontrevoie\\.fr|nitter\\.fdn\\.fr|nitter\\.1d4\\.us|nitter\\.kavin\\.rocks|nitter\\.unixfox\\.eu|nitter\\.domain\\.glass|nitter\\.namazso\\.eu|birdsite\\.xanny\\.family|nitter\\.moomoo\\.me|bird\\.trom\\.tf|nitter\\.it|twitter\\.censors\\.us|nitter\\.grimneko\\.de|twitter\\.076\\.ne\\.jp|nitter\\.fly\\.dev|notabird\\.site|nitter\\.weiler\\.rocks|nitter\\.sethforprivacy\\.com|nitter\\.cutelab\\.space|nitter\\.nl|nitter\\.mint\\.lgbt|nitter\\.bus\\-hit\\.me|nitter\\.esmailelbob\\.xyz|tw\\.artemislena\\.eu|nitter\\.winscloud\\.net|nitter\\.tiekoetter\\.com|nitter\\.spaceint\\.fr|nitter\\.privacy\\.com\\.de|nitter\\.poast\\.org|nitter\\.bird\\.froth\\.zone|nitter\\.dcs0\\.hu|twitter\\.dr460nf1r3\\.org|nitter\\.garudalinux\\.org|twitter\\.femboy\\.hu|nitter\\.cz|nitter\\.privacydev\\.net|nitter\\.evil\\.site|tweet\\.lambda\\.dance|nitte
r\\.kylrth\\.com|nitter\\.foss\\.wtf|nitter\\.priv\\.pw|nitter\\.tokhmi\\.xyz|nitter\\.catalyst\\.sx|unofficialbird\\.com|nitter\\.projectsegfau\\.lt|nitter\\.eu\\.projectsegfau\\.lt|singapore\\.unofficialbird\\.com|canada\\.unofficialbird\\.com|india\\.unofficialbird\\.com|nederland\\.unofficialbird\\.com|uk\\.unofficialbird\\.com|n\\.l5\\.ca|nitter\\.slipfox\\.xyz|nitter\\.soopy\\.moe|nitter\\.qwik\\.space|read\\.whatever\\.social|nitter\\.rawbit\\.ninja|nt\\.vern\\.cc|ntr\\.odyssey346\\.dev|nitter\\.ir|nitter\\.privacytools\\.io|nitter\\.sneed\\.network|n\\.sneed\\.network|nitter\\.manasiwibi\\.com|nitter\\.smnz\\.de|nitter\\.twei\\.space|nitter\\.inpt\\.fr|nitter\\.d420\\.de|nitter\\.caioalonso\\.com|nitter\\.at|nitter\\.drivet\\.xyz|nitter\\.pw|nitter\\.nicfab\\.eu|bird\\.habedieeh\\.re|nitter\\.hostux\\.net|nitter\\.adminforge\\.de|nitter\\.platypush\\.tech|nitter\\.mask\\.sh|nitter\\.pufe\\.org|nitter\\.us\\.projectsegfau\\.lt|nitter\\.arcticfoxes\\.net|t\\.com\\.sb|nitter\\.kling\\.gg|nitter\\.ktachibana\\.party|nitter\\.riverside\\.rocks|nitter\\.girlboss\\.ceo|nitter\\.lunar\\.icu|twitter\\.moe\\.ngo|nitter\\.freedit\\.eu|ntr\\.frail\\.duckdns\\.org|nitter\\.librenode\\.org|n\\.opnxng\\.com|nitter\\.plus\\.st|nitter\\.ethibox\\.fr|nitter\\.net|is\\-nitter\\.resolv\\.ee|lu\\-nitter\\.resolv\\.ee|nitter\\.13ad\\.de|nitter\\.40two\\.app|nitter\\.cattube\\.org|nitter\\.cc|nitter\\.dark\\.fail|nitter\\.himiko\\.cloud|nitter\\.koyu\\.space|nitter\\.mailstation\\.de|nitter\\.mastodont\\.cat|nitter\\.tedomum\\.net|nitter\\.tokhmi\\.xyz|nitter\\.weaponizedhumiliation\\.com|nitter\\.vxempire\\.xyz|tweet\\.lambda\\.dance|nitter\\.ca|nitter\\.42l\\.fr|nitter\\.pussthecat\\.org|nitter\\.nixnet\\.services|nitter\\.eu|nitter\\.actionsack\\.com|nitter\\.hu|twitr\\.gq|nittereu\\.moomoo\\.me|bird\\.from\\.tf|twitter\\.grimneko\\.de|nitter\\.alefvanoon\\.xyz|n\\.hyperborea\\.cloud|twitter\\.mstdn\\.social|nitter\\.silkky\\.cloud|nttr\\.stream|fuckthesacklers\\.network|nitter\\.govt\\.land|nitter\\.datatunnel\\.xyz|de\\.nttr\\.stream|twtr\\.bch\\.bar|nitter\\.exonip\\.de|nitter\\.mastodon\\.pro|nitter\\.notraxx\\.ch|nitter\\.skrep\\.in|nitter\\.snopyta\\.org)/(.+)/status/([0-9]+)(#.)?" 
@@ -10613,6 +10564,17 @@ ], "regexp": "(https?://(?:www\\.)?palcomp3\\.com(?:\\.br)?/([^/?&#]+))|(https?://(?:www\\.)?palcomp3\\.com(?:\\.br)?/([^/]+)/([^/?&#]+))|(https?://(?:www\\.)?palcomp3\\.com(?:\\.br)?/([^/]+)/([^/?&#]+)/?#clipe)" }, + "pandatv": { + "name": "pandatv", + "type": "free", + "domains": [ + "pandalive.co.kr" + ], + "regexps": [ + "https?://(?:www\\.|m\\.)?pandalive\\.co\\.kr/play/(\\w+)" + ], + "regexp": "https?://(?:www\\.|m\\.)?pandalive\\.co\\.kr/play/(\\w+)" + }, "panopto": { "name": "panopto", "type": "free", @@ -10704,10 +10666,10 @@ "parti.com" ], "regexps": [ - "https?://(?:www\\.)?parti\\.com/creator/([\\w]+)/([\\w/-]+)", + "https?://(?:www\\.)?parti\\.com/(?!video/)([\\w/-]+)", "https?://(?:www\\.)?parti\\.com/video/(\\d+)" ], - "regexp": "(https?://(?:www\\.)?parti\\.com/creator/([\\w]+)/([\\w/-]+))|(https?://(?:www\\.)?parti\\.com/video/(\\d+))" + "regexp": "(https?://(?:www\\.)?parti\\.com/(?!video/)([\\w/-]+))|(https?://(?:www\\.)?parti\\.com/video/(\\d+))" }, "patreon": { "name": "patreon", @@ -12963,15 +12925,6 @@ ], "regexp": "https?://(?:www\\.)?sciencechannel\\.com/video/([^/]+/[^/?#]+)" }, - "screen.yahoo": { - "name": "screen.yahoo", - "type": "free", - "domains": [], - "regexps": [ - "yvsearch(|[1-9][0-9]*|all):([\\s\\S]+)" - ], - "regexp": "yvsearch(|[1-9][0-9]*|all):([\\s\\S]+)" - }, "screen9": { "name": "screen9", "type": "free", @@ -13060,28 +13013,6 @@ ], "regexp": "https?://(?:www\\.)?scrolller\\.com/([\\w-]+)" }, - "scte": { - "name": "scte", - "type": "free", - "domains": [ - "learning.scte.org" - ], - "regexps": [ - "https?://learning\\.scte\\.org/mod/scorm/view\\.php?.*?\\bid=(\\d+)" - ], - "regexp": "https?://learning\\.scte\\.org/mod/scorm/view\\.php?.*?\\bid=(\\d+)" - }, - "sctecourse": { - "name": "sctecourse", - "type": "free", - "domains": [ - "learning.scte.org" - ], - "regexps": [ - "https?://learning\\.scte\\.org/(?:mod/sub)?course/view\\.php?.*?\\bid=(\\d+)" - ], - "regexp": "https?://learning\\.scte\\.org/(?:mod/sub)?course/view\\.php?.*?\\bid=(\\d+)" - }, "sejm": { "name": "sejm", "type": "free", @@ -14199,6 +14130,19 @@ ], "regexp": "https?://www\\.taptap\\.io/post/(\\d+)" }, + "tarangplus": { + "name": "tarangplus", + "type": "free", + "domains": [ + "tarangplus.in" + ], + "regexps": [ + "https?://(?:www\\.)?tarangplus\\.in/([^#?/]+)/([^#?/]+)/episodes/?(?:$|[?#])", + "https?://(?:www\\.)?tarangplus\\.in/([^#?/]+)/all/?(?:$|[?#])", + "https?://(?:www\\.)?tarangplus\\.in/(?:movies|[^#?/]+/[^#?/]+)/(?!episodes)([^#?/]+)" + ], + "regexp": "(https?://(?:www\\.)?tarangplus\\.in/([^#?/]+)/([^#?/]+)/episodes/?(?:$|[?#]))|(https?://(?:www\\.)?tarangplus\\.in/([^#?/]+)/all/?(?:$|[?#]))|(https?://(?:www\\.)?tarangplus\\.in/(?:movies|[^#?/]+/[^#?/]+)/(?!episodes)([^#?/]+))" + }, "tass": { "name": "tass", "type": "free", @@ -15152,10 +15096,10 @@ "tubitv.com" ], "regexps": [ - "https?://(?:www\\.)?tubitv\\.com/(video|movies|tv-shows)/(\\d+)", + "https?://(?:www\\.)?tubitv\\.com/(?:[a-z]{2}-[a-z]{2}/)?(video|movies|tv-shows)/(\\d+)", "https?://(?:www\\.)?tubitv\\.com/series/\\d+/([^/?#]+)(?:/season-(\\d+))?" 
], - "regexp": "(https?://(?:www\\.)?tubitv\\.com/(video|movies|tv-shows)/(\\d+))|(https?://(?:www\\.)?tubitv\\.com/series/\\d+/([^/?#]+)(?:/season-(\\d+))?)" + "regexp": "(https?://(?:www\\.)?tubitv\\.com/(?:[a-z]{2}-[a-z]{2}/)?(video|movies|tv-shows)/(\\d+))|(https?://(?:www\\.)?tubitv\\.com/series/\\d+/([^/?#]+)(?:/season-(\\d+))?)" }, "tumblr": { "name": "tumblr", @@ -15304,10 +15248,10 @@ "tv5unis.ca" ], "regexps": [ - "https?://(?:www\\.)?tv5unis\\.ca/videos/([^/]+)(?:/saisons/(\\d+)/episodes/(\\d+))?/?(?:[?#&]|$)", - "https?://(?:www\\.)?tv5unis\\.ca/videos/[^/]+/(\\d+)" + "https?://(?:www\\.)?tv5unis\\.ca/videos/([^/?#]+)(?:/saisons/(\\d+)/episodes/(\\d+))?/?(?:[?#&]|$)", + "https?://(?:www\\.)?tv5unis\\.ca/videos/[^/?#]+/(\\d+)" ], - "regexp": "(https?://(?:www\\.)?tv5unis\\.ca/videos/([^/]+)(?:/saisons/(\\d+)/episodes/(\\d+))?/?(?:[?#&]|$))|(https?://(?:www\\.)?tv5unis\\.ca/videos/[^/]+/(\\d+))" + "regexp": "(https?://(?:www\\.)?tv5unis\\.ca/videos/([^/?#]+)(?:/saisons/(\\d+)/episodes/(\\d+))?/?(?:[?#&]|$))|(https?://(?:www\\.)?tv5unis\\.ca/videos/[^/?#]+/(\\d+))" }, "tv8.it": { "name": "tv8.it", @@ -17315,8 +17259,6 @@ "name": "yahoo", "type": "free", "domains": [ - "screen.yahoo.com", - "uk.screen.yahoo.com", "news.yahoo.com", "yahoo.com", "gma.yahoo.com", @@ -17329,9 +17271,10 @@ ], "regexps": [ "(https?://(?:([a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\\.)?(?:[\\da-zA-Z_-]+\\.)?yahoo\\.com/(?:[^/]+/)*([^?&#]*-[0-9]+(?:-[a-z]+)?)\\.html)", - "https?://news\\.yahoo\\.co\\.jp/(?:articles|feature)/([a-zA-Z0-9]+)" + "https?://news\\.yahoo\\.co\\.jp/(?:articles|feature)/([a-zA-Z0-9]+)", + "yvsearch(|[1-9][0-9]*|all):([\\s\\S]+)" ], - "regexp": "((https?://(?:([a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\\.)?(?:[\\da-zA-Z_-]+\\.)?yahoo\\.com/(?:[^/]+/)*([^?&#]*-[0-9]+(?:-[a-z]+)?)\\.html))|(https?://news\\.yahoo\\.co\\.jp/(?:articles|feature)/([a-zA-Z0-9]+))" + "regexp": "((https?://(?:([a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\\.)?(?:[\\da-zA-Z_-]+\\.)?yahoo\\.com/(?:[^/]+/)*([^?&#]*-[0-9]+(?:-[a-z]+)?)\\.html))|(https?://news\\.yahoo\\.co\\.jp/(?:articles|feature)/([a-zA-Z0-9]+))|(yvsearch(|[1-9][0-9]*|all):([\\s\\S]+))" }, "yandexdisk": { "name": "yandexdisk", @@ -17718,14 +17661,17 @@ "name": "zdf", "type": "free", "domains": [ - "zdf.de" + "zdf.de", + "zdfheute.de", + "logo.de" ], "regexps": [ "https?://www\\.zdf\\.de/(?:[^/?#]+/)*([^/?#]+)", "https?://(?:www\\.)?zdf\\.de/(?:video|play)/(?:[^/?#]+/)*([^/?#]+)", - "https?://(?:www\\.)?zdf\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html" + "https?://(?:www\\.)?zdf\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html", + "https?://(?:www\\.)?(?:zdfheute|logo)\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html" ], - "regexp": "(https?://www\\.zdf\\.de/(?:[^/?#]+/)*([^/?#]+))|(https?://(?:www\\.)?zdf\\.de/(?:video|play)/(?:[^/?#]+/)*([^/?#]+))|(https?://(?:www\\.)?zdf\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html)" + "regexp": "(https?://www\\.zdf\\.de/(?:[^/?#]+/)*([^/?#]+))|(https?://(?:www\\.)?zdf\\.de/(?:video|play)/(?:[^/?#]+/)*([^/?#]+))|(https?://(?:www\\.)?zdf\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html)|(https?://(?:www\\.)?(?:zdfheute|logo)\\.de/(?:[^/?#]+/)*([^/?#]+)\\.html)" }, "zee5": { "name": "zee5", diff --git a/Provider/metadata_provider.py b/Provider/metadata_provider.py index 2a1b0d3..4a088e5 100644 --- a/Provider/metadata_provider.py +++ b/Provider/metadata_provider.py @@ -8,6 +8,9 @@ import requests import sys import json import subprocess + +from API.HTTP import HTTPClient +from ProviderCore.base import SearchResult try: # Optional dependency for IMDb scraping from imdbinfo.services import 
search_title # type: ignore except ImportError: # pragma: no cover - optional @@ -15,6 +18,7 @@ except ImportError: # pragma: no cover - optional from SYS.logger import log, debug from SYS.metadata import imdb_tag +from SYS.json_table import normalize_record try: # Optional dependency import musicbrainzngs # type: ignore @@ -892,6 +896,524 @@ class YtdlpMetadataProvider(MetadataProvider): return out +def _coerce_archive_field_list(value: Any) -> List[str]: + """Coerce an Archive.org metadata field to a list of strings.""" + + if value is None: + return [] + if isinstance(value, list): + out: List[str] = [] + for v in value: + try: + s = str(v).strip() + except Exception: + continue + if s: + out.append(s) + return out + if isinstance(value, (tuple, set)): + out = [] + for v in value: + try: + s = str(v).strip() + except Exception: + continue + if s: + out.append(s) + return out + try: + s = str(value).strip() + except Exception: + return [] + return [s] if s else [] + + +def archive_item_metadata_to_tags(archive_id: str, + item_metadata: Dict[str, Any]) -> List[str]: + """Coerce Archive.org metadata into a stable set of bibliographic tags.""" + + archive_id_clean = str(archive_id or "").strip() + meta = item_metadata if isinstance(item_metadata, dict) else {} + + tags: List[str] = [] + seen: set[str] = set() + + def _add(tag: str) -> None: + try: + t = str(tag).strip() + except Exception: + return + if not t: + return + if t.lower() in seen: + return + seen.add(t.lower()) + tags.append(t) + + if archive_id_clean: + _add(f"internet_archive:{archive_id_clean}") + + for title in _coerce_archive_field_list(meta.get("title"))[:1]: + _add(f"title:{title}") + + creators: List[str] = [] + creators.extend(_coerce_archive_field_list(meta.get("creator"))) + creators.extend(_coerce_archive_field_list(meta.get("author"))) + for creator in creators[:3]: + _add(f"author:{creator}") + + for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]: + _add(f"publisher:{publisher}") + + for date_val in _coerce_archive_field_list(meta.get("date"))[:1]: + _add(f"publish_date:{date_val}") + for year_val in _coerce_archive_field_list(meta.get("year"))[:1]: + _add(f"publish_date:{year_val}") + + for lang in _coerce_archive_field_list(meta.get("language"))[:3]: + _add(f"language:{lang}") + + for subj in _coerce_archive_field_list(meta.get("subject"))[:15]: + if len(subj) > 200: + subj = subj[:200] + _add(subj) + + def _clean_isbn(raw: str) -> str: + return str(raw or "").replace("-", "").strip() + + for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]: + isbn_clean = _clean_isbn(isbn) + if isbn_clean: + _add(f"isbn:{isbn_clean}") + + identifiers: List[str] = [] + identifiers.extend(_coerce_archive_field_list(meta.get("identifier"))) + identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier"))) + added_other = 0 + for ident in identifiers: + ident_s = str(ident or "").strip() + if not ident_s: + continue + low = ident_s.lower() + + if low.startswith("urn:isbn:"): + val = _clean_isbn(ident_s.split(":", 2)[-1]) + if val: + _add(f"isbn:{val}") + continue + if low.startswith("isbn:"): + val = _clean_isbn(ident_s.split(":", 1)[-1]) + if val: + _add(f"isbn:{val}") + continue + if low.startswith("urn:oclc:"): + val = ident_s.split(":", 2)[-1].strip() + if val: + _add(f"oclc:{val}") + continue + if low.startswith("oclc:"): + val = ident_s.split(":", 1)[-1].strip() + if val: + _add(f"oclc:{val}") + continue + if low.startswith("urn:lccn:"): + val = ident_s.split(":", 2)[-1].strip() + 
if val: + _add(f"lccn:{val}") + continue + if low.startswith("lccn:"): + val = ident_s.split(":", 1)[-1].strip() + if val: + _add(f"lccn:{val}") + continue + if low.startswith("doi:"): + val = ident_s.split(":", 1)[-1].strip() + if val: + _add(f"doi:{val}") + continue + + if archive_id_clean and low == archive_id_clean.lower(): + continue + if added_other >= 5: + continue + if len(ident_s) > 200: + ident_s = ident_s[:200] + _add(f"identifier:{ident_s}") + added_other += 1 + + return tags + + +def fetch_archive_item_metadata(archive_id: str, + *, + timeout: int = 8) -> Dict[str, Any]: + ident = str(archive_id or "").strip() + if not ident: + return {} + resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout)) + resp.raise_for_status() + data = resp.json() if resp is not None else {} + if not isinstance(data, dict): + return {} + meta = data.get("metadata") + return meta if isinstance(meta, dict) else {} + + +def scrape_isbn_metadata(isbn: str) -> List[str]: + """Scrape metadata tags for an ISBN using OpenLibrary's books API.""" + + new_tags: List[str] = [] + + isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip() + if not isbn_clean: + return [] + + url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json.loads(response.content.decode("utf-8")) + except Exception as exc: + log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr) + return [] + + if not data: + log(f"No ISBN metadata found for: {isbn}") + return [] + + book_data = next(iter(data.values()), None) + if not isinstance(book_data, dict): + return [] + + if "title" in book_data: + new_tags.append(f"title:{book_data['title']}") + + authors = book_data.get("authors") + if isinstance(authors, list): + for author in authors[:3]: + if isinstance(author, dict) and author.get("name"): + new_tags.append(f"author:{author['name']}") + + if book_data.get("publish_date"): + new_tags.append(f"publish_date:{book_data['publish_date']}") + + publishers = book_data.get("publishers") + if isinstance(publishers, list) and publishers: + pub = publishers[0] + if isinstance(pub, dict) and pub.get("name"): + new_tags.append(f"publisher:{pub['name']}") + + if "description" in book_data: + desc = book_data.get("description") + if isinstance(desc, dict) and "value" in desc: + desc = desc.get("value") + if desc: + desc_str = str(desc).strip() + if desc_str: + new_tags.append(f"description:{desc_str[:200]}") + + page_count = book_data.get("number_of_pages") + if isinstance(page_count, int) and page_count > 0: + new_tags.append(f"pages:{page_count}") + + identifiers = book_data.get("identifiers") + if isinstance(identifiers, dict): + + def _first(value: Any) -> Any: + if isinstance(value, list) and value: + return value[0] + return value + + for key, ns in ( + ("openlibrary", "openlibrary"), + ("lccn", "lccn"), + ("oclc", "oclc"), + ("goodreads", "goodreads"), + ("librarything", "librarything"), + ("doi", "doi"), + ("internet_archive", "internet_archive"), + ): + val = _first(identifiers.get(key)) + if val: + new_tags.append(f"{ns}:{val}") + + debug(f"Found {len(new_tags)} tag(s) from ISBN lookup") + return new_tags + + +def scrape_openlibrary_metadata(olid: str) -> List[str]: + """Scrape metadata tags for an OpenLibrary ID using the edition JSON endpoint.""" + + new_tags: List[str] = [] + + olid_text = str(olid or "").strip() + if not olid_text: + return [] + + 
olid_norm = olid_text + try: + if not olid_norm.startswith("OL"): + olid_norm = f"OL{olid_norm}" + if not olid_norm.endswith("M"): + olid_norm = f"{olid_norm}M" + except Exception: + olid_norm = olid_text + + new_tags.append(f"openlibrary:{olid_norm}") + + olid_clean = olid_text.replace("OL", "").replace("M", "") + if not olid_clean.isdigit(): + olid_clean = olid_text + + if not olid_text.startswith("OL"): + url = f"https://openlibrary.org/books/OL{olid_clean}M.json" + else: + url = f"https://openlibrary.org/books/{olid_text}.json" + + try: + with HTTPClient() as client: + response = client.get(url) + response.raise_for_status() + data = json.loads(response.content.decode("utf-8")) + except Exception as exc: + log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr) + return [] + + if not isinstance(data, dict) or not data: + log(f"No OpenLibrary metadata found for: {olid_text}") + return [] + + if "title" in data: + new_tags.append(f"title:{data['title']}") + + authors = data.get("authors") + if isinstance(authors, list): + for author in authors[:3]: + if isinstance(author, dict) and author.get("name"): + new_tags.append(f"author:{author['name']}") + continue + + author_key = None + if isinstance(author, dict): + if isinstance(author.get("author"), dict): + author_key = author.get("author", {}).get("key") + if not author_key: + author_key = author.get("key") + + if isinstance(author_key, str) and author_key.startswith("/"): + try: + author_url = f"https://openlibrary.org{author_key}.json" + with HTTPClient(timeout=10) as client: + author_resp = client.get(author_url) + author_resp.raise_for_status() + author_data = json.loads(author_resp.content.decode("utf-8")) + if isinstance(author_data, dict) and author_data.get("name"): + new_tags.append(f"author:{author_data['name']}") + continue + except Exception: + pass + + if isinstance(author, str) and author: + new_tags.append(f"author:{author}") + + if data.get("publish_date"): + new_tags.append(f"publish_date:{data['publish_date']}") + + publishers = data.get("publishers") + if isinstance(publishers, list) and publishers: + pub = publishers[0] + if isinstance(pub, dict) and pub.get("name"): + new_tags.append(f"publisher:{pub['name']}") + elif isinstance(pub, str) and pub: + new_tags.append(f"publisher:{pub}") + + if "description" in data: + desc = data.get("description") + if isinstance(desc, dict) and "value" in desc: + desc = desc.get("value") + if desc: + desc_str = str(desc).strip() + if desc_str: + new_tags.append(f"description:{desc_str[:200]}") + + page_count = data.get("number_of_pages") + if isinstance(page_count, int) and page_count > 0: + new_tags.append(f"pages:{page_count}") + + subjects = data.get("subjects") + if isinstance(subjects, list): + for subject in subjects[:10]: + if isinstance(subject, str): + subject_clean = subject.strip() + if subject_clean and subject_clean not in new_tags: + new_tags.append(subject_clean) + + identifiers = data.get("identifiers") + if isinstance(identifiers, dict): + + def _first(value: Any) -> Any: + if isinstance(value, list) and value: + return value[0] + return value + + for key, ns in ( + ("isbn_10", "isbn_10"), + ("isbn_13", "isbn_13"), + ("lccn", "lccn"), + ("oclc_numbers", "oclc"), + ("goodreads", "goodreads"), + ("internet_archive", "internet_archive"), + ): + val = _first(identifiers.get(key)) + if val: + new_tags.append(f"{ns}:{val}") + + ocaid = data.get("ocaid") + if isinstance(ocaid, str) and ocaid.strip(): + new_tags.append(f"internet_archive:{ocaid.strip()}") + + 
debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") + return new_tags + + +SAMPLE_ITEMS: List[Dict[str, Any]] = [ + { + "title": "Sample OpenLibrary book", + "path": "https://openlibrary.org/books/OL123M", + "openlibrary_id": "OL123M", + "archive_id": "samplearchive123", + "availability": "borrow", + "availability_reason": "sample", + "direct_url": "https://archive.org/download/sample.pdf", + "author_name": ["OpenLibrary Demo"], + "first_publish_year": 2023, + "ia": ["samplearchive123"], + }, +] + + +try: + from typing import Iterable + + from SYS.result_table_api import ColumnSpec, ResultModel, metadata_column, title_column + from SYS.result_table_adapters import register_provider + + def _ensure_search_result(item: Any) -> SearchResult: + if isinstance(item, SearchResult): + return item + if isinstance(item, dict): + data = dict(item) + title = str(data.get("title") or data.get("name") or "OpenLibrary") + path = str(data.get("path") or data.get("url") or "") + detail = str(data.get("detail") or "") + annotations = list(data.get("annotations") or []) + media_kind = str(data.get("media_kind") or "book") + return SearchResult( + table="openlibrary", + title=title, + path=path, + detail=detail, + annotations=annotations, + media_kind=media_kind, + columns=data.get("columns") or [], + full_metadata={**data, "raw": dict(item)}, + ) + return SearchResult( + table="openlibrary", + title=str(item or "OpenLibrary"), + path="", + detail="", + annotations=[], + media_kind="book", + full_metadata={"raw": {}}, + ) + + def _adapter(items: Iterable[Any]) -> Iterable[ResultModel]: + for item in items: + sr = _ensure_search_result(item) + metadata = dict(getattr(sr, "full_metadata", {}) or {}) + raw = metadata.get("raw") + if isinstance(raw, dict): + normalized = normalize_record(raw) + for key, val in normalized.items(): + metadata.setdefault(key, val) + + def _make_url() -> str: + candidate = ( + metadata.get("selection_url") or + metadata.get("direct_url") or + metadata.get("url") or + metadata.get("path") or + sr.path or + "" + ) + return str(candidate or "").strip() + + selection_url = _make_url() + if selection_url: + metadata["selection_url"] = selection_url + authors_value = metadata.get("authors_display") or metadata.get("authors") or metadata.get("author_name") or "" + if isinstance(authors_value, list): + authors_value = ", ".join(str(v) for v in authors_value if v) + authors_text = str(authors_value or "").strip() + if authors_text: + metadata["authors_display"] = authors_text + year_value = metadata.get("year") or metadata.get("first_publish_year") + if year_value and not isinstance(year_value, str): + year_value = str(year_value) + if year_value: + metadata["year"] = str(year_value) + metadata.setdefault("openlibrary_id", metadata.get("openlibrary_id") or metadata.get("olid")) + metadata.setdefault("source", metadata.get("source") or "openlibrary") + yield ResultModel( + title=str(sr.title or metadata.get("title") or selection_url or "OpenLibrary"), + path=selection_url or None, + metadata=metadata, + source="openlibrary", + ) + + def _columns_factory(rows: List[ResultModel]) -> List[ColumnSpec]: + cols: List[ColumnSpec] = [title_column()] + def _has(key: str) -> bool: + return any((row.metadata or {}).get(key) for row in rows) + + if _has("authors_display"): + cols.append( + ColumnSpec( + "authors_display", + "Author", + lambda r: (r.metadata or {}).get("authors_display") or "", + ) + ) + if _has("year"): + cols.append(metadata_column("year", "Year")) + if _has("availability"): + 
cols.append(metadata_column("availability", "Avail")) + if _has("archive_id"): + cols.append(metadata_column("archive_id", "Archive ID")) + if _has("openlibrary_id"): + cols.append(metadata_column("openlibrary_id", "OLID")) + return cols + + def _selection_fn(row: ResultModel) -> List[str]: + metadata = row.metadata or {} + url = str(metadata.get("selection_url") or row.path or "").strip() + if url: + return ["-url", url] + return ["-title", row.title or ""] + + register_provider( + "openlibrary", + _adapter, + columns=_columns_factory, + selection_fn=_selection_fn, + metadata={"description": "OpenLibrary search provider (JSON result table template)"}, + ) +except Exception: + pass + + # Registry --------------------------------------------------------------- _METADATA_PROVIDERS: Dict[str, diff --git a/Provider/openlibrary.py b/Provider/openlibrary.py index d298eed..27bf76b 100644 --- a/Provider/openlibrary.py +++ b/Provider/openlibrary.py @@ -11,18 +11,29 @@ import sys import tempfile import time from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import requests -from API.HTTP import HTTPClient +from API.HTTP import HTTPClient, get_requests_verify_value from ProviderCore.base import Provider, SearchResult from SYS.utils import sanitize_filename from SYS.cli_syntax import get_field, get_free_text, parse_query from SYS.logger import debug, log +from Provider.metadata_provider import ( + archive_item_metadata_to_tags, + fetch_archive_item_metadata, +) from SYS.utils import unique_path +_ARCHIVE_VERIFY_VALUE = get_requests_verify_value() + +def _create_archive_session() -> requests.Session: + session = requests.Session() + session.verify = _ARCHIVE_VERIFY_VALUE + return session + try: from Crypto.Cipher import AES # type: ignore from Crypto.Util import Counter # type: ignore @@ -262,182 +273,6 @@ def title_hint_from_url_slug(u: str) -> str: return slug or "OpenLibrary" -def _coerce_archive_field_list(value: Any) -> List[str]: - """Coerce an Archive.org metadata field to a list of strings.""" - if value is None: - return [] - if isinstance(value, list): - out: List[str] = [] - for v in value: - try: - s = str(v).strip() - except Exception: - continue - if s: - out.append(s) - return out - if isinstance(value, (tuple, set)): - out = [] - for v in value: - try: - s = str(v).strip() - except Exception: - continue - if s: - out.append(s) - return out - try: - s = str(value).strip() - except Exception: - return [] - return [s] if s else [] - - -def _archive_item_metadata_to_tags(archive_id: str, - item_metadata: Dict[str, - Any]) -> List[str]: - """Map Archive.org metadata JSON (the `metadata` object) to tag strings. - - This is intentionally best-effort and conservative: it focuses on stable, - useful bibliographic fields (title/author/publisher/ISBN/identifier/topics). 
- """ - archive_id_clean = str(archive_id or "").strip() - meta = item_metadata if isinstance(item_metadata, - dict) else {} - - tags: List[str] = [] - seen: set[str] = set() - - def _add(tag: str) -> None: - try: - t = str(tag).strip() - except Exception: - return - if not t: - return - if t.lower() in seen: - return - seen.add(t.lower()) - tags.append(t) - - if archive_id_clean: - _add(f"internet_archive:{archive_id_clean}") - - # Title - for title in _coerce_archive_field_list(meta.get("title"))[:1]: - _add(f"title:{title}") - - # Authors/creators - creators: List[str] = [] - creators.extend(_coerce_archive_field_list(meta.get("creator"))) - creators.extend(_coerce_archive_field_list(meta.get("author"))) - for creator in creators[:3]: - _add(f"author:{creator}") - - # Publisher - for publisher in _coerce_archive_field_list(meta.get("publisher"))[:3]: - _add(f"publisher:{publisher}") - - # Publish date/year - for date_val in _coerce_archive_field_list(meta.get("date"))[:1]: - _add(f"publish_date:{date_val}") - for year_val in _coerce_archive_field_list(meta.get("year"))[:1]: - _add(f"publish_date:{year_val}") - - # Language - for lang in _coerce_archive_field_list(meta.get("language"))[:3]: - _add(f"language:{lang}") - - # Topics/subjects: follow existing OpenLibrary behavior (un-namespaced tags) - for subj in _coerce_archive_field_list(meta.get("subject"))[:15]: - if len(subj) > 200: - subj = subj[:200] - _add(subj) - - # ISBNs and identifiers - def _clean_isbn(raw: str) -> str: - return str(raw or "").replace("-", "").strip() - - for isbn in _coerce_archive_field_list(meta.get("isbn"))[:10]: - isbn_clean = _clean_isbn(isbn) - if isbn_clean: - _add(f"isbn:{isbn_clean}") - - identifiers: List[str] = [] - identifiers.extend(_coerce_archive_field_list(meta.get("identifier"))) - identifiers.extend(_coerce_archive_field_list(meta.get("external-identifier"))) - added_other = 0 - for ident in identifiers: - ident_s = str(ident or "").strip() - if not ident_s: - continue - low = ident_s.lower() - - if low.startswith("urn:isbn:"): - val = _clean_isbn(ident_s.split(":", 2)[-1]) - if val: - _add(f"isbn:{val}") - continue - if low.startswith("isbn:"): - val = _clean_isbn(ident_s.split(":", 1)[-1]) - if val: - _add(f"isbn:{val}") - continue - if low.startswith("urn:oclc:"): - val = ident_s.split(":", 2)[-1].strip() - if val: - _add(f"oclc:{val}") - continue - if low.startswith("oclc:"): - val = ident_s.split(":", 1)[-1].strip() - if val: - _add(f"oclc:{val}") - continue - if low.startswith("urn:lccn:"): - val = ident_s.split(":", 2)[-1].strip() - if val: - _add(f"lccn:{val}") - continue - if low.startswith("lccn:"): - val = ident_s.split(":", 1)[-1].strip() - if val: - _add(f"lccn:{val}") - continue - if low.startswith("doi:"): - val = ident_s.split(":", 1)[-1].strip() - if val: - _add(f"doi:{val}") - continue - - if archive_id_clean and low == archive_id_clean.lower(): - continue - if added_other >= 5: - continue - if len(ident_s) > 200: - ident_s = ident_s[:200] - _add(f"identifier:{ident_s}") - added_other += 1 - - return tags - - -def _fetch_archive_item_metadata(archive_id: str, - *, - timeout: int = 8) -> Dict[str, - Any]: - ident = str(archive_id or "").strip() - if not ident: - return {} - resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=int(timeout)) - resp.raise_for_status() - data = resp.json() if resp is not None else {} - if not isinstance(data, dict): - return {} - meta = data.get("metadata") - return meta if isinstance(meta, - dict) else {} - - class 
OpenLibrary(Provider): TABLE_AUTO_STAGES = { @@ -466,7 +301,7 @@ class OpenLibrary(Provider): def __init__(self, config: Optional[Dict[str, Any]] = None): super().__init__(config) - self._session = requests.Session() + self._session = _create_archive_session() class BookNotAvailableError(Exception): """Raised when a book is not available for borrowing (waitlisted/in use).""" @@ -612,7 +447,7 @@ class OpenLibrary(Provider): @classmethod def _archive_login(cls, email: str, password: str) -> requests.Session: """Login to archive.org using the token-based services endpoint (matches test-login.py).""" - session = requests.Session() + session = _create_archive_session() token_resp = session.get( "https://archive.org/services/account/login/", @@ -766,7 +601,11 @@ class OpenLibrary(Provider): if not ident: return False, "no-archive-id" try: - resp = requests.get(f"https://archive.org/metadata/{ident}", timeout=8) + resp = requests.get( + f"https://archive.org/metadata/{ident}", + timeout=8, + verify=_ARCHIVE_VERIFY_VALUE, + ) resp.raise_for_status() data = resp.json() if resp is not None else {} meta = data.get("metadata", @@ -976,7 +815,11 @@ class OpenLibrary(Provider): """Check for a directly downloadable original PDF in Archive.org metadata.""" try: metadata_url = f"https://archive.org/metadata/{book_id}" - response = requests.get(metadata_url, timeout=6) + response = requests.get( + metadata_url, + timeout=6, + verify=_ARCHIVE_VERIFY_VALUE, + ) response.raise_for_status() metadata = response.json() files = metadata.get("files") if isinstance(metadata, dict) else None @@ -993,7 +836,8 @@ class OpenLibrary(Provider): check_response = requests.head( pdf_url, timeout=4, - allow_redirects=True + allow_redirects=True, + verify=_ARCHIVE_VERIFY_VALUE, ) if check_response.status_code == 200: return True, pdf_url @@ -1001,235 +845,6 @@ class OpenLibrary(Provider): except Exception: return False, "" - @staticmethod - def scrape_isbn_metadata(isbn: str) -> List[str]: - """Scrape tags for an ISBN using Open Library API. 
- - Returns tags such as: - - title:<...>, author:<...>, publish_date:<...>, publisher:<...>, description:<...>, pages:<...> - - identifiers: openlibrary:<...>, lccn:<...>, oclc:<...>, goodreads:<...>, librarything:<...>, doi:<...>, internet_archive:<...> - """ - new_tags: List[str] = [] - - isbn_clean = str(isbn or "").replace("isbn:", "").replace("-", "").strip() - if not isbn_clean: - return [] - - url = f"https://openlibrary.org/api/books?bibkeys=ISBN:{isbn_clean}&jscmd=data&format=json" - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode("utf-8")) - except Exception as exc: - log(f"Failed to fetch ISBN metadata: {exc}", file=sys.stderr) - return [] - - if not data: - log(f"No ISBN metadata found for: {isbn}") - return [] - - book_data = next(iter(data.values()), None) - if not isinstance(book_data, dict): - return [] - - if "title" in book_data: - new_tags.append(f"title:{book_data['title']}") - - authors = book_data.get("authors") - if isinstance(authors, list): - for author in authors[:3]: - if isinstance(author, dict) and author.get("name"): - new_tags.append(f"author:{author['name']}") - - if book_data.get("publish_date"): - new_tags.append(f"publish_date:{book_data['publish_date']}") - - publishers = book_data.get("publishers") - if isinstance(publishers, list) and publishers: - pub = publishers[0] - if isinstance(pub, dict) and pub.get("name"): - new_tags.append(f"publisher:{pub['name']}") - - if "description" in book_data: - desc = book_data.get("description") - if isinstance(desc, dict) and "value" in desc: - desc = desc.get("value") - if desc: - desc_str = str(desc).strip() - if desc_str: - new_tags.append(f"description:{desc_str[:200]}") - - page_count = book_data.get("number_of_pages") - if isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - identifiers = book_data.get("identifiers") - if isinstance(identifiers, dict): - - def _first(value: Any) -> Any: - if isinstance(value, list) and value: - return value[0] - return value - - for key, ns in ( - ("openlibrary", "openlibrary"), - ("lccn", "lccn"), - ("oclc", "oclc"), - ("goodreads", "goodreads"), - ("librarything", "librarything"), - ("doi", "doi"), - ("internet_archive", "internet_archive"), - ): - val = _first(identifiers.get(key)) - if val: - new_tags.append(f"{ns}:{val}") - - debug(f"Found {len(new_tags)} tag(s) from ISBN lookup") - return new_tags - - @staticmethod - def scrape_openlibrary_metadata(olid: str) -> List[str]: - """Scrape tags for an OpenLibrary ID using the .json API endpoint.""" - new_tags: List[str] = [] - - olid_text = str(olid or "").strip() - if not olid_text: - return [] - - # Normalize OLID to the common "OLM" form when possible. - olid_norm = olid_text - try: - if not olid_norm.startswith("OL"): - olid_norm = f"OL{olid_norm}" - if not olid_norm.endswith("M"): - olid_norm = f"{olid_norm}M" - except Exception: - olid_norm = olid_text - - # Ensure we always include a scrapeable identifier tag. - new_tags.append(f"openlibrary:{olid_norm}") - - # Accept OL9674499M, 9674499M, or just digits. 
- olid_clean = olid_text.replace("OL", "").replace("M", "") - if not olid_clean.isdigit(): - olid_clean = olid_text - - if not olid_text.startswith("OL"): - url = f"https://openlibrary.org/books/OL{olid_clean}M.json" - else: - url = f"https://openlibrary.org/books/{olid_text}.json" - - try: - with HTTPClient() as client: - response = client.get(url) - response.raise_for_status() - data = json_module.loads(response.content.decode("utf-8")) - except Exception as exc: - log(f"Failed to fetch OpenLibrary metadata: {exc}", file=sys.stderr) - return [] - - if not isinstance(data, dict) or not data: - log(f"No OpenLibrary metadata found for: {olid_text}") - return [] - - if "title" in data: - new_tags.append(f"title:{data['title']}") - - authors = data.get("authors") - if isinstance(authors, list): - for author in authors[:3]: - if isinstance(author, dict) and author.get("name"): - new_tags.append(f"author:{author['name']}") - continue - - # Common OL shape: {"key": "/authors/OL...A"} or {"author": {"key": ...}} - author_key = None - if isinstance(author, dict): - if isinstance(author.get("author"), dict): - author_key = author.get("author", - {}).get("key") - if not author_key: - author_key = author.get("key") - - if isinstance(author_key, str) and author_key.startswith("/"): - try: - author_url = f"https://openlibrary.org{author_key}.json" - with HTTPClient(timeout=10) as client: - author_resp = client.get(author_url) - author_resp.raise_for_status() - author_data = json_module.loads( - author_resp.content.decode("utf-8") - ) - if isinstance(author_data, dict) and author_data.get("name"): - new_tags.append(f"author:{author_data['name']}") - continue - except Exception: - pass - - if isinstance(author, str) and author: - new_tags.append(f"author:{author}") - - if data.get("publish_date"): - new_tags.append(f"publish_date:{data['publish_date']}") - - publishers = data.get("publishers") - if isinstance(publishers, list) and publishers: - pub = publishers[0] - if isinstance(pub, dict) and pub.get("name"): - new_tags.append(f"publisher:{pub['name']}") - elif isinstance(pub, str) and pub: - new_tags.append(f"publisher:{pub}") - - if "description" in data: - desc = data.get("description") - if isinstance(desc, dict) and "value" in desc: - desc = desc.get("value") - if desc: - desc_str = str(desc).strip() - if desc_str: - new_tags.append(f"description:{desc_str[:200]}") - - page_count = data.get("number_of_pages") - if isinstance(page_count, int) and page_count > 0: - new_tags.append(f"pages:{page_count}") - - subjects = data.get("subjects") - if isinstance(subjects, list): - for subject in subjects[:10]: - if isinstance(subject, str): - subject_clean = subject.strip() - if subject_clean and subject_clean not in new_tags: - new_tags.append(subject_clean) - - identifiers = data.get("identifiers") - if isinstance(identifiers, dict): - - def _first(value: Any) -> Any: - if isinstance(value, list) and value: - return value[0] - return value - - for key, ns in ( - ("isbn_10", "isbn_10"), - ("isbn_13", "isbn_13"), - ("lccn", "lccn"), - ("oclc_numbers", "oclc"), - ("goodreads", "goodreads"), - ("internet_archive", "internet_archive"), - ): - val = _first(identifiers.get(key)) - if val: - new_tags.append(f"{ns}:{val}") - - # Some editions expose a direct Archive.org identifier as "ocaid". 
- ocaid = data.get("ocaid") - if isinstance(ocaid, str) and ocaid.strip(): - new_tags.append(f"internet_archive:{ocaid.strip()}") - - debug(f"Found {len(new_tags)} tag(s) from OpenLibrary lookup") - return new_tags - def search( self, query: str, @@ -1293,7 +908,7 @@ class OpenLibrary(Provider): ia_val_local = [] ia_ids_local = [str(x) for x in ia_val_local if x] - session_local = requests.Session() + session_local = _create_archive_session() try: archive_id_local = _resolve_archive_id( @@ -1423,19 +1038,38 @@ class OpenLibrary(Provider): "borrow"}: annotations.append(availability) + book_path = ( + f"https://openlibrary.org/books/{edition_id}" if edition_id else + ( + f"https://openlibrary.org{work_key}" + if isinstance(work_key, str) and work_key.startswith("/") else + "https://openlibrary.org" + ) + ) + metadata = { + "openlibrary_id": edition_id, + "openlibrary_key": work_key, + "authors": authors_list, + "year": year, + "isbn_10": isbn_10, + "isbn_13": isbn_13, + "ia": ia_ids, + "availability": availability, + "availability_reason": availability_reason, + "archive_id": archive_id, + "direct_url": direct_url, + "raw": doc, + } + if book_path: + metadata["selection_url"] = book_path + metadata["_selection_args"] = ["-url", book_path] + metadata["_selection_action"] = ["download-file", "-url", book_path] + results.append( SearchResult( table="openlibrary", title=book_title, - path=( - f"https://openlibrary.org/books/{edition_id}" if edition_id else - ( - f"https://openlibrary.org{work_key}" - if isinstance(work_key, - str) and work_key.startswith("/") else - "https://openlibrary.org" - ) - ), + path=book_path, detail=( (f"By: {', '.join(authors_list)}" if authors_list else "") + (f" ({year})" if year else "") @@ -1443,20 +1077,7 @@ class OpenLibrary(Provider): annotations=annotations, media_kind="book", columns=columns, - full_metadata={ - "openlibrary_id": edition_id, - "openlibrary_key": work_key, - "authors": authors_list, - "year": year, - "isbn_10": isbn_10, - "isbn_13": isbn_13, - "ia": ia_ids, - "availability": availability, - "availability_reason": availability_reason, - "archive_id": archive_id, - "direct_url": direct_url, - "raw": doc, - }, + full_metadata=metadata, ) ) @@ -1507,8 +1128,8 @@ class OpenLibrary(Provider): # Best-effort metadata scrape to attach bibliographic tags for downstream cmdlets. try: - archive_meta = _fetch_archive_item_metadata(archive_id) - tags = _archive_item_metadata_to_tags(archive_id, archive_meta) + archive_meta = fetch_archive_item_metadata(archive_id) + tags = archive_item_metadata_to_tags(archive_id, archive_meta) if tags: try: result.tag.update(tags) diff --git a/SYS/json_table.py b/SYS/json_table.py new file mode 100644 index 0000000..c45a2e4 --- /dev/null +++ b/SYS/json_table.py @@ -0,0 +1,110 @@ +"""Helper utilities for normalizing JSON result tables. + +This mirrors the intent of the existing `SYS.html_table` helper but operates on +JSON payloads (API responses, JSON APIs, etc.). It exposes: + +- `extract_records` for locating and normalizing the first list of record dicts + from a JSON document. +- `normalize_record` for coercing arbitrary values into printable strings. + +These helpers make it easy for providers that consume JSON to populate +`ResultModel` metadata without hand-writing ad-hoc sanitizers. +""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, Tuple + +_DEFAULT_LIST_KEYS: Tuple[str, ...] 
= ("results", "items", "docs", "records") + + +def _coerce_value(value: Any) -> str: + """Convert a JSON value into a compact string representation.""" + if value is None: + return "" + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (list, tuple, set)): + parts = [_coerce_value(v) for v in value] + cleaned = [part for part in parts if part] + return ", ".join(cleaned) + if isinstance(value, dict): + parts: List[str] = [] + for subkey, subvalue in value.items(): + part = _coerce_value(subvalue) + if part: + parts.append(f"{subkey}:{part}") + return ", ".join(parts) + try: + return str(value).strip() + except Exception: + return "" + + +def normalize_record(record: Dict[str, Any]) -> Dict[str, str]: + """Return a copy of ``record`` with keys lowered and values coerced to strings.""" + out: Dict[str, str] = {} + if not isinstance(record, dict): + return out + for key, value in record.items(): + normalized_key = str(key or "").strip().lower() + if not normalized_key: + continue + normalized_value = _coerce_value(value) + if normalized_value: + out[normalized_key] = normalized_value + return out + + +def _traverse(data: Any, path: Sequence[str]) -> Optional[Any]: + current = data + for key in path: + if not isinstance(current, dict): + return None + current = current.get(key) + return current + + +def extract_records( + data: Any, + *, + path: Optional[Sequence[str]] = None, + list_keys: Optional[Sequence[str]] = None, +) -> Tuple[List[Dict[str, str]], Optional[str]]: + """Extract normalized record dicts from ``data``. + + Args: + data: JSON document (dict/list) that may contain tabular records. + path: optional key path to traverse before looking for a list. + list_keys: candidate keys to inspect when ``path`` is not provided. + + Returns: + (records, chosen_path) where ``records`` is the list of normalized dicts + and ``chosen_path`` is either the traversed path or the key that matched. 
+ """ + list_keys = list_keys or _DEFAULT_LIST_KEYS + chosen_path: Optional[str] = None + candidates: List[Any] = [] + + if path: + found = _traverse(data, path) + if isinstance(found, list): + candidates = found + chosen_path = ".".join(path) + + if not candidates and isinstance(data, dict): + for key in list_keys: + found = data.get(key) + if isinstance(found, list): + candidates = found + chosen_path = key + break + + if not candidates and isinstance(data, list): + candidates = data + chosen_path = "" + + records: List[Dict[str, str]] = [] + for entry in candidates: + if isinstance(entry, dict): + records.append(normalize_record(entry)) + return records, chosen_path diff --git a/SYS/result_table.py b/SYS/result_table.py index 64bbe76..843f1e3 100644 --- a/SYS/result_table.py +++ b/SYS/result_table.py @@ -783,56 +783,56 @@ class ResultTable: def _add_search_result(self, row: ResultRow, result: Any) -> None: """Extract and add SearchResult fields to row.""" - # If provider supplied explicit columns, render those and skip legacy defaults cols = getattr(result, "columns", None) + used_explicit_columns = False if cols: + used_explicit_columns = True for name, value in cols: row.add_column(name, value) - return + else: + # Core fields (legacy fallback) + title = getattr(result, "title", "") + table = str(getattr(result, "table", "") or "").lower() - # Core fields (legacy fallback) - title = getattr(result, "title", "") - table = str(getattr(result, "table", "") or "").lower() + # Handle extension separation for local files + extension = "" + if title and table == "local": + path_obj = Path(title) + if path_obj.suffix: + extension = path_obj.suffix.lstrip(".") + title = path_obj.stem - # Handle extension separation for local files - extension = "" - if title and table == "local": - path_obj = Path(title) - if path_obj.suffix: - extension = path_obj.suffix.lstrip(".") - title = path_obj.stem + if title: + row.add_column("Title", title) - if title: - row.add_column("Title", title) + # Extension column + row.add_column("Ext", extension) - # Extension column - row.add_column("Ext", extension) + if hasattr(result, "table") and getattr(result, "table", None): + row.add_column("Source", str(getattr(result, "table"))) - if hasattr(result, "table") and getattr(result, "table", None): - row.add_column("Source", str(getattr(result, "table"))) + if hasattr(result, "detail") and result.detail: + row.add_column("Detail", result.detail) - if hasattr(result, "detail") and result.detail: - row.add_column("Detail", result.detail) + if hasattr(result, "media_kind") and result.media_kind: + row.add_column("Type", result.media_kind) - if hasattr(result, "media_kind") and result.media_kind: - row.add_column("Type", result.media_kind) + # Tag summary + if hasattr(result, "tag_summary") and result.tag_summary: + row.add_column("Tag", str(result.tag_summary)) - # Tag summary - if hasattr(result, "tag_summary") and result.tag_summary: - row.add_column("Tag", str(result.tag_summary)) + # Duration (for media) + if hasattr(result, "duration_seconds") and result.duration_seconds: + dur = _format_duration_hms(result.duration_seconds) + row.add_column("Duration", dur or str(result.duration_seconds)) - # Duration (for media) - if hasattr(result, "duration_seconds") and result.duration_seconds: - dur = _format_duration_hms(result.duration_seconds) - row.add_column("Duration", dur or str(result.duration_seconds)) + # Size (for files) + if hasattr(result, "size_bytes") and result.size_bytes: + row.add_column("Size", 
_format_size(result.size_bytes, integer_only=False)) - # Size (for files) - if hasattr(result, "size_bytes") and result.size_bytes: - row.add_column("Size", _format_size(result.size_bytes, integer_only=False)) - - # Annotations - if hasattr(result, "annotations") and result.annotations: - row.add_column("Annotations", ", ".join(str(a) for a in result.annotations)) + # Annotations + if hasattr(result, "annotations") and result.annotations: + row.add_column("Annotations", ", ".join(str(a) for a in result.annotations)) try: md = getattr(result, "full_metadata", None) diff --git a/SYS/result_table_adapters.py b/SYS/result_table_adapters.py index 935e9f1..ea23df7 100644 --- a/SYS/result_table_adapters.py +++ b/SYS/result_table_adapters.py @@ -61,15 +61,22 @@ class Provider: def serialize_row(self, row: ResultModel) -> Dict[str, Any]: r = ensure_result_model(row) - return { + metadata = r.metadata or {} + out: Dict[str, Any] = { "title": r.title, "path": r.path, "ext": r.ext, "size_bytes": r.size_bytes, - "metadata": r.metadata or {}, + "metadata": metadata, "source": r.source or self.name, "_selection_args": self.selection_args(r), } + selection_action = metadata.get("_selection_action") or metadata.get("selection_action") + if selection_action: + out["_selection_action"] = [ + str(x) for x in selection_action if x is not None + ] + return out def serialize_rows(self, rows: Iterable[ResultModel]) -> List[Dict[str, Any]]: return [self.serialize_row(r) for r in rows] diff --git a/cmdlet/download_file.py b/cmdlet/download_file.py index d210139..0705833 100644 --- a/cmdlet/download_file.py +++ b/cmdlet/download_file.py @@ -30,6 +30,7 @@ from SYS.result_table import ResultTable from SYS.rich_display import stderr_console as get_stderr_console from SYS import pipeline as pipeline_context from SYS.utils import sha256_file +from SYS.metadata import normalize_urls as normalize_url_list from rich.prompt import Confirm from tool.ytdlp import ( @@ -125,243 +126,6 @@ class Download_File(Cmdlet): debug(f"[download-file] run invoked with args: {list(args)}") return self._run_impl(result, args, config) - @staticmethod - def _normalize_urls(parsed: Dict[str, Any]) -> List[str]: - raw_url = parsed.get("url", []) - if isinstance(raw_url, str): - raw_url = [raw_url] - - expanded_urls: List[str] = [] - for u in raw_url or []: - if u is None: - continue - s = str(u).strip() - if not s: - continue - if "," in s: - parts = [p.strip() for p in s.split(",")] - expanded_urls.extend([p for p in parts if p]) - else: - expanded_urls.append(s) - - return expanded_urls - - @staticmethod - def _rewrite_archive_org_urls(raw_urls: Sequence[str]) -> List[str]: - """Rewrite Archive.org URLs using metadata JSON to pick the right flow. - - - /metadata/: - - if lendable (collection contains inlibrary/printdisabled/lendinglibrary) -> /borrow/ - - else -> /details/ - - /details/: - - if lendable -> /borrow/ - - This makes `download-file` do the right thing for borrow-only items. 
- """ - - out: List[str] = [] - for u in list(raw_urls or []): - s = str(u or "").strip() - if not s: - continue - - try: - p = urlparse(s) - host = (p.hostname or "").strip().lower() - path = (p.path or "").strip() - except Exception: - out.append(s) - continue - - if not host or (host != "archive.org" and not host.endswith(".archive.org")): - out.append(s) - continue - - low_path = path.lower().strip() - if not (low_path.startswith("/metadata/") or low_path.startswith("/details/")): - out.append(s) - continue - - parts = [x for x in path.split("/") if x] - if len(parts) < 2: - out.append(s) - continue - head = str(parts[0] or "").strip().lower() - archive_id = str(parts[1] or "").strip() - if head not in {"metadata", "details"} or not archive_id: - out.append(s) - continue - - lendable = False - try: - meta_url = f"https://archive.org/metadata/{archive_id}" - resp = requests.get(meta_url, timeout=8) - resp.raise_for_status() - data = resp.json() if resp is not None else {} - meta = data.get("metadata", {}) if isinstance(data, dict) else {} - collection = meta.get("collection") if isinstance(meta, dict) else None - - values: List[str] = [] - if isinstance(collection, list): - values = [str(x).strip().lower() for x in collection if str(x).strip()] - elif isinstance(collection, str): - values = [collection.strip().lower()] if collection.strip() else [] - - lendable = any(v in {"inlibrary", "lendinglibrary"} for v in values) - except Exception: - lendable = False - - if lendable: - debug(f"[download-file] archive.org item '{archive_id}' looks lendable; using borrow flow") - out.append(f"https://archive.org/borrow/{archive_id}") - continue - - # Non-lendable: turn metadata URLs into details URLs so IA picker can show files. - if head == "metadata": - out.append(f"https://archive.org/details/{archive_id}") - continue - - out.append(s) - - return out - - @staticmethod - def _collect_piped_items_if_no_urls(result: Any, - raw_urls: Sequence[str]) -> List[Any]: - if raw_urls: - return [] - if isinstance(result, list): - return list(result) - if result: - return [result] - return [] - - @staticmethod - def _safe_total_items(raw_urls: Sequence[str], piped_items: Sequence[Any]) -> int: - try: - return int(len(raw_urls or []) + len(piped_items or [])) - except Exception: - return 1 - - @staticmethod - def _build_preview( - raw_urls: Sequence[str], - piped_items: Sequence[Any], - total_items: int - ) -> List[Any]: - try: - preview: List[Any] = [] - preview.extend(list(raw_urls or [])[:max(0, total_items)]) - if len(preview) < total_items: - preview.extend( - list(piped_items or [])[:max(0, - total_items - len(preview))] - ) - return preview - except Exception: - return [] - - @staticmethod - def _load_provider_registry() -> Dict[str, Any]: - try: - from ProviderCore.registry import ( - get_search_provider as _get_search_provider, - get_provider as _get_provider, - match_provider_name_for_url as _match_provider_name_for_url, - SearchResult as _SearchResult, - ) - - return { - "get_search_provider": _get_search_provider, - "get_provider": _get_provider, - "match_provider_name_for_url": _match_provider_name_for_url, - "SearchResult": _SearchResult, - } - - except Exception: - return { - "get_search_provider": None, - "get_provider": None, - "match_provider_name_for_url": None, - "SearchResult": None, - } - - @staticmethod - def _path_from_download_result(result_obj: Any) -> Path: - file_path = None - if hasattr(result_obj, "path"): - file_path = getattr(result_obj, "path") - elif isinstance(result_obj, 
dict): - file_path = result_obj.get("path") - if not file_path: - file_path = str(result_obj) - return Path(str(file_path)) - - def _emit_local_file( - self, - *, - downloaded_path: Path, - source: Optional[str], - title_hint: Optional[str], - tags_hint: Optional[List[str]], - media_kind_hint: Optional[str], - full_metadata: Optional[Dict[str, - Any]], - progress: PipelineProgress, - config: Dict[str, - Any], - provider_hint: Optional[str] = None, - ) -> None: - title_val = (title_hint or downloaded_path.stem - or "Unknown").strip() or downloaded_path.stem - hash_value = self._compute_file_hash(downloaded_path) - notes: Optional[Dict[str, str]] = None - try: - if isinstance(full_metadata, dict): - subtitles = full_metadata.get("_tidal_lyrics_subtitles") - if isinstance(subtitles, str) and subtitles.strip(): - notes = {"lyric": subtitles} - except Exception: - notes = None - tag: List[str] = [] - if tags_hint: - tag.extend([str(t) for t in tags_hint if t]) - if not any(str(t).lower().startswith("title:") for t in tag): - tag.insert(0, f"title:{title_val}") - - payload: Dict[str, - Any] = { - "path": str(downloaded_path), - "hash": hash_value, - "title": title_val, - "action": "cmdlet:download-file", - "download_mode": "file", - "store": "local", - "media_kind": media_kind_hint or "file", - "tag": tag, - } - if provider_hint: - payload["provider"] = str(provider_hint) - if full_metadata: - payload["full_metadata"] = full_metadata - if notes: - payload["notes"] = notes - if source and str(source).startswith("http"): - payload["url"] = source - elif source: - payload["source_url"] = source - - pipeline_context.emit(payload) - - # When running with a local progress UI (standalone cmdlet), ensure - # the pipe advances on emit. - progress.on_emit(payload) - - # Automatically register url with local library - if payload.get("url"): - pipe_obj = coerce_to_pipe_object(payload) - register_url_with_local_library(pipe_obj, config) - def _process_explicit_urls( self, *, @@ -373,6 +137,7 @@ class Download_File(Cmdlet): registry: Dict[str, Any], progress: PipelineProgress, + context_items: Sequence[Any] = (), ) -> tuple[int, Optional[int]]: downloaded_count = 0 @@ -381,6 +146,12 @@ class Download_File(Cmdlet): get_provider = registry.get("get_provider") match_provider_name_for_url = registry.get("match_provider_name_for_url") + context_items_list: List[Any] + try: + context_items_list = list(context_items) if context_items else [] + except Exception: + context_items_list = [] + for url in raw_urls: try: debug(f"Processing URL: {url}") @@ -521,14 +292,15 @@ class Download_File(Cmdlet): if provider_name and get_provider is not None and SearchResult is not None: # OpenLibrary URLs should be handled by the OpenLibrary provider. if provider_name == "openlibrary": + url_str = str(url).strip() provider = get_provider("openlibrary", config) if provider is None: raise DownloadError( "OpenLibrary provider not configured or not available" ) - edition_id = ol_provider.edition_id_from_url(str(url)) - title_hint = ol_provider.title_hint_from_url_slug(str(url)) + edition_id = ol_provider.edition_id_from_url(url_str) + title_hint = ol_provider.title_hint_from_url_slug(url_str) download_payload: Optional[Dict[str, Any]] = None try: @@ -596,9 +368,95 @@ class Download_File(Cmdlet): progress_cb = _progress - if hasattr(provider, "download_url"): + # Prefer piped OpenLibrary context (selection row) when present so we keep + # resolved metadata like archive_id and availability. 
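+ # A context row matches when its OpenLibrary edition id equals the id parsed
+ # from the URL, when its path/URL equals the requested URL, or (if no edition
+ # id could be parsed) when its archive_id appears in the URL string.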
+ ctx_item = None + ctx_md: Dict[str, Any] = {} + ctx_title: Optional[str] = None + ctx_tags: Optional[List[str]] = None + ctx_media_kind: Optional[str] = None + for candidate in context_items_list: + try: + table_val = get_field(candidate, "table") + except Exception: + table_val = None + if str(table_val or "").lower() != "openlibrary": + continue + + md_val = get_field(candidate, "full_metadata") + md_dict = md_val if isinstance(md_val, dict) else {} + cand_olid = str(md_dict.get("openlibrary_id") or md_dict.get("olid") or "").strip() + cand_archive = str(md_dict.get("archive_id") or "").strip() + cand_url = str( + get_field(candidate, "path") + or get_field(candidate, "url") + or md_dict.get("selection_url") + or "" + ).strip() + + matched = False + if edition_id and cand_olid and cand_olid == edition_id: + matched = True + elif cand_url and url_str and cand_url == url_str: + matched = True + elif (not edition_id) and cand_archive and cand_archive in url_str: + matched = True + + if matched: + ctx_item = candidate + ctx_md = md_dict + ctx_title = get_field(candidate, "title") + ctx_media_kind = get_field(candidate, "media_kind") + tags_val = get_field(candidate, "tag") + if isinstance(tags_val, list): + ctx_tags = [str(t) for t in tags_val if t] + break + + if ctx_item is not None and SearchResult is not None: + sr_meta = dict(ctx_md) if isinstance(ctx_md, dict) else {} + if edition_id and not sr_meta.get("openlibrary_id"): + sr_meta["openlibrary_id"] = edition_id + + sr_title = str(ctx_title or title_hint or "").strip() or title_hint + sr_media_kind = str(ctx_media_kind or "book") + + sr_obj = ( + ctx_item + if isinstance(ctx_item, SearchResult) + else SearchResult( + table="openlibrary", + title=sr_title, + path=url_str, + media_kind=sr_media_kind, + full_metadata=sr_meta, + ) + ) + + try: + sr_obj.path = url_str # type: ignore[attr-defined] + except Exception: + pass + try: + if ctx_tags: + sr_obj.tag = set(ctx_tags) # type: ignore[attr-defined] + except Exception: + pass + + downloaded_path = provider.download( + sr_obj, + final_output_dir, + progress_callback=progress_cb + ) # type: ignore[call-arg] + + if downloaded_path: + download_payload = { + "path": Path(downloaded_path), + "search_result": sr_obj, + } + + if download_payload is None and hasattr(provider, "download_url"): download_payload = provider.download_url( # type: ignore[attr-defined] - str(url), + url_str, final_output_dir, progress_cb, ) @@ -606,12 +464,12 @@ class Download_File(Cmdlet): if download_payload is None: sr = None if hasattr(provider, "search_result_from_url"): - sr = provider.search_result_from_url(str(url)) # type: ignore[attr-defined] + sr = provider.search_result_from_url(url_str) # type: ignore[attr-defined] if sr is None: sr = SearchResult( table="openlibrary", title=title_hint, - path=str(url), + path=url_str, media_kind="book", full_metadata={ "openlibrary_id": edition_id, @@ -811,6 +669,97 @@ class Download_File(Cmdlet): downloaded_count += 1 continue + if provider_name and get_provider is not None and SearchResult is not None: + provider = get_provider(provider_name, config) + + if provider is not None and hasattr(provider, "download_url"): + try: + downloaded_path = provider.download_url( + str(url), + final_output_dir + ) # type: ignore[attr-defined] + except Exception as exc: + raise DownloadError(str(exc)) + + if downloaded_path: + self._emit_local_file( + downloaded_path=Path(downloaded_path), + source=str(url), + title_hint=Path(str(downloaded_path)).stem, + tags_hint=None, + 
media_kind_hint="file", + full_metadata=None, + provider_hint=str(provider_name), + progress=progress, + config=config, + ) + downloaded_count += 1 + continue + + if provider is not None: + sr_obj = None + try: + sr_obj = SearchResult( + table=str(provider_name), + title=str(url), + path=str(url), + full_metadata={}, + ) + downloaded_path = provider.download( + sr_obj, + final_output_dir + ) # type: ignore[call-arg] + except Exception: + downloaded_path = None + + if (not downloaded_path + ) and str(provider_name).lower() == "libgen": + raise DownloadError( + "LibGen URL did not resolve to a downloadable file" + ) + + if downloaded_path: + emit_tags: Optional[List[str]] = None + full_md: Optional[Dict[str, Any]] = None + title_hint = Path(str(downloaded_path)).stem + media_kind_hint = "file" + + if str(provider_name + ).lower() == "libgen" and sr_obj is not None: + media_kind_hint = "book" + try: + sr_tags = getattr(sr_obj, "tag", None) + if isinstance(sr_tags, set) and sr_tags: + emit_tags = sorted( + [str(t) for t in sr_tags if t] + ) + except Exception: + emit_tags = None + + try: + sr_full_md = getattr(sr_obj, "full_metadata", None) + if isinstance(sr_full_md, dict): + full_md = sr_full_md + t = str(sr_full_md.get("title") or "").strip() + if t: + title_hint = t + except Exception: + full_md = None + + self._emit_local_file( + downloaded_path=Path(downloaded_path), + source=str(url), + title_hint=title_hint, + tags_hint=emit_tags, + media_kind_hint=media_kind_hint, + full_metadata=full_md, + provider_hint=str(provider_name), + progress=progress, + config=config, + ) + downloaded_count += 1 + continue + result_obj = _download_direct_file( str(url), final_output_dir, @@ -1237,6 +1186,170 @@ class Download_File(Cmdlet): return downloaded_count, queued_magnet_submissions + @staticmethod + def _path_from_download_result(result_obj: Any) -> Path: + file_path = None + if hasattr(result_obj, "path"): + file_path = getattr(result_obj, "path") + elif isinstance(result_obj, dict): + file_path = result_obj.get("path") + if not file_path: + file_path = str(result_obj) + return Path(str(file_path)) + + def _emit_local_file( + self, + *, + downloaded_path: Path, + source: Optional[str], + title_hint: Optional[str], + tags_hint: Optional[List[str]], + media_kind_hint: Optional[str], + full_metadata: Optional[Dict[str, Any]], + progress: PipelineProgress, + config: Dict[str, Any], + provider_hint: Optional[str] = None, + ) -> None: + title_val = (title_hint or downloaded_path.stem or "Unknown").strip() or downloaded_path.stem + hash_value = self._compute_file_hash(downloaded_path) + notes: Optional[Dict[str, str]] = None + try: + if isinstance(full_metadata, dict): + subtitles = full_metadata.get("_tidal_lyrics_subtitles") + if isinstance(subtitles, str) and subtitles.strip(): + notes = {"lyric": subtitles} + except Exception: + notes = None + tag: List[str] = [] + if tags_hint: + tag.extend([str(t) for t in tags_hint if t]) + if not any(str(t).lower().startswith("title:") for t in tag): + tag.insert(0, f"title:{title_val}") + + payload: Dict[str, Any] = { + "path": str(downloaded_path), + "hash": hash_value, + "title": title_val, + "action": "cmdlet:download-file", + "download_mode": "file", + "store": "local", + "media_kind": media_kind_hint or "file", + "tag": tag, + } + if provider_hint: + payload["provider"] = str(provider_hint) + if full_metadata: + payload["full_metadata"] = full_metadata + if notes: + payload["notes"] = notes + if source and str(source).startswith("http"): + payload["url"] = 
source + elif source: + payload["source_url"] = source + + pipeline_context.emit(payload) + + @staticmethod + def _normalize_urls(parsed: Dict[str, Any]) -> List[str]: + urls: List[str] = [] + url_value: Any = None + if isinstance(parsed, dict): + url_value = parsed.get("url") + + try: + urls = normalize_url_list(url_value) + except Exception: + urls = [] + + if not urls and isinstance(parsed, dict): + query_val = parsed.get("query") + try: + if isinstance(query_val, str) and query_val.strip().lower().startswith("url:"): + urls = normalize_url_list(query_val) + except Exception: + pass + + return urls + + @staticmethod + def _collect_piped_items_if_no_urls(result: Any, raw_url: Sequence[str]) -> List[Any]: + if raw_url: + return [] + if result is None: + return [] + if isinstance(result, list): + return list(result) + return [result] + + @staticmethod + def _load_provider_registry() -> Dict[str, Any]: + """Lightweight accessor for provider helpers without hard dependencies.""" + try: + from ProviderCore import registry as provider_registry # type: ignore + from ProviderCore.base import SearchResult # type: ignore + + return { + "get_provider": getattr(provider_registry, "get_provider", None), + "get_search_provider": getattr(provider_registry, "get_search_provider", None), + "match_provider_name_for_url": getattr(provider_registry, "match_provider_name_for_url", None), + "SearchResult": SearchResult, + } + except Exception: + return { + "get_provider": None, + "get_search_provider": None, + "match_provider_name_for_url": None, + "SearchResult": None, + } + + @staticmethod + def _safe_total_items(raw_url: Sequence[str], piped_items: Sequence[Any]) -> int: + """Return a sane item count for progress display.""" + try: + url_count = len(raw_url or []) + except Exception: + url_count = 0 + try: + piped_count = len(piped_items or []) + except Exception: + piped_count = 0 + total = url_count + piped_count + return total if total > 0 else 1 + + @staticmethod + def _build_preview(raw_url: Sequence[str], piped_items: Sequence[Any], total_items: int) -> List[str]: + """Construct a short preview list for the local progress UI.""" + preview: List[str] = [] + + try: + for url in raw_url or []: + if len(preview) >= 5: + break + preview.append(str(url)) + except Exception: + pass + + if len(preview) < 5: + try: + items = piped_items if isinstance(piped_items, list) else list(piped_items or []) + except Exception: + items = [] + for item in items: + if len(preview) >= 5: + break + try: + label = get_field(item, "title") or get_field(item, "path") or get_field(item, "url") + except Exception: + label = None + if label: + preview.append(str(label)) + + # If we still have nothing, supply a generic placeholder to avoid empty previews. 
+ if not preview and total_items: + preview.append(f"{total_items} item(s)") + + return preview + # === Streaming helpers (yt-dlp) === @staticmethod @@ -3531,7 +3644,6 @@ class Download_File(Cmdlet): parsed = parse_cmdlet_args(args, self) raw_url = self._normalize_urls(parsed) - raw_url = self._rewrite_archive_org_urls(raw_url) piped_items = self._collect_piped_items_if_no_urls(result, raw_url) had_piped_input = False @@ -3660,6 +3772,7 @@ class Download_File(Cmdlet): quiet_mode=quiet_mode, registry=registry, progress=progress, + context_items=(result if isinstance(result, list) else ([result] if result else [])), ) downloaded_count += int(urls_downloaded) if early_exit is not None: diff --git a/cmdlet/get_tag.py b/cmdlet/get_tag.py index b881a9c..451b810 100644 --- a/cmdlet/get_tag.py +++ b/cmdlet/get_tag.py @@ -14,15 +14,12 @@ import sys from SYS.logger import log, debug -try: - from Provider.openlibrary import OpenLibrary - - _ol_scrape_isbn_metadata = OpenLibrary.scrape_isbn_metadata - _ol_scrape_openlibrary_metadata = OpenLibrary.scrape_openlibrary_metadata -except Exception: - _ol_scrape_isbn_metadata = None # type: ignore[assignment] - _ol_scrape_openlibrary_metadata = None # type: ignore[assignment] -from Provider.metadata_provider import get_metadata_provider, list_metadata_providers +from Provider.metadata_provider import ( + get_metadata_provider, + list_metadata_providers, + scrape_isbn_metadata, + scrape_openlibrary_metadata, +) import subprocess from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -270,9 +267,6 @@ def _pick_supported_ytdlp_url(urls: List[str]) -> Optional[str]: return candidates[0] if candidates else None -_scrape_isbn_metadata = _ol_scrape_isbn_metadata # type: ignore[assignment] -_scrape_openlibrary_metadata = _ol_scrape_openlibrary_metadata # type: ignore[assignment] - # Tag item for ResultTable display and piping from dataclasses import dataclass @@ -1039,22 +1033,16 @@ def _extract_url_formats(formats: list) -> List[Tuple[str, str]]: def _scrape_isbn_metadata(isbn: str) -> List[str]: - if _ol_scrape_isbn_metadata is None: - log("OpenLibrary scraper unavailable", file=sys.stderr) - return [] try: - return list(_ol_scrape_isbn_metadata(isbn)) + return list(scrape_isbn_metadata(isbn)) except Exception as e: log(f"ISBN scraping error: {e}", file=sys.stderr) return [] def _scrape_openlibrary_metadata(olid: str) -> List[str]: - if _ol_scrape_openlibrary_metadata is None: - log("OpenLibrary scraper unavailable", file=sys.stderr) - return [] try: - return list(_ol_scrape_openlibrary_metadata(olid)) + return list(scrape_openlibrary_metadata(olid)) except Exception as e: log(f"OpenLibrary scraping error: {e}", file=sys.stderr) return []
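
A minimal usage sketch of the new SYS/json_table helpers introduced by this patch; the payload, keys, and values below are hypothetical, and only the public functions added here (extract_records, normalize_record) are assumed.

from SYS.json_table import extract_records

# Hypothetical API response shaped like a typical JSON search payload.
payload = {
    "results": [
        {"Title": "Example Book", "Year": 2001, "Authors": ["A. Author", None]},
        {"Title": "", "ISBN": {"10": "0123456789"}},
    ]
}

# With no explicit path, the first matching default key ("results", "items",
# "docs", "records") is used; keys are lowercased and values coerced to strings.
records, chosen = extract_records(payload)
assert chosen == "results"
assert records[0] == {"title": "Example Book", "year": "2001", "authors": "A. Author"}
assert records[1] == {"isbn": "10:0123456789"}   # empty strings and None values are dropped

# A key path can be supplied when the record list is nested.
nested = {"data": {"docs": [{"id": 1}]}}
rows, chosen = extract_records(nested, path=("data", "docs"))
assert chosen == "data.docs" and rows == [{"id": "1"}]

On the selection side, the `_selection_action` list that the OpenLibrary provider now stores in `full_metadata` (e.g. ["download-file", "-url", book_path]) is picked up by `Provider.serialize_row` when it appears in the row's metadata and is copied, stringified, into the serialized row, so a picked result can re-enter the pipeline as a `download-file -url ...` invocation.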