# test-login.py — archive.org book downloader
# (header residue from a git web diff view removed; original file was 336 lines)
import requests
|
||||
import random, string
|
||||
from concurrent import futures
|
||||
from tqdm import tqdm
|
||||
import time
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import json
|
||||
import re
|
||||
import base64
|
||||
import hashlib
|
||||
from Crypto.Cipher import AES
|
||||
from Crypto.Util import Counter
|
||||
|
||||
def display_error(response, message):
    """Print an error message plus the offending HTTP response, then abort.

    Args:
        response: the requests.Response that triggered the error.
        message: human-readable description, printed first.
    """
    # Diagnostics go to stderr so they are not mixed into piped stdout.
    print(message, file=sys.stderr)
    print(response, file=sys.stderr)
    print(response.text, file=sys.stderr)
    sys.exit(1)  # non-zero exit code signals failure to the shell
|
||||
|
||||
def get_book_infos(session, url):
    """Scrape an archive.org details page for the book's title, page links
    and metadata.

    Args:
        session: authenticated requests.Session.
        url: the https://archive.org/details/... page of the book.

    Returns:
        (title, links, metadata): filesystem-safe title, list of page-image
        URIs, and the raw metadata dict from the BookReader API.

    Exits the process if no page links can be extracted.
    """
    r = session.get(url).text
    # The details page embeds the BookReader info URL in an escaped JSON blob.
    infos_url = "https:" + r.split('"url":"')[1].split('"')[0].replace("\\u0026", "&")
    response = session.get(infos_url)
    data = response.json()['data']
    title = data['brOptions']['bookTitle'].strip().replace(" ", "_")
    title = ''.join(c for c in title if c not in '<>:"/\\|?*')  # Filter forbidden chars in directory names (Windows & Linux)
    title = title[:150]  # Trim the title to avoid long file names
    metadata = data['metadata']
    links = []
    for item in data['brOptions']['data']:
        for page in item:
            links.append(page['uri'])

    # NOTE(review): the original tested "> 1", which wrongly rejected
    # single-page books; any non-empty list of pages is valid.
    if len(links) > 0:
        print(f"[+] Found {len(links)} pages")
        return title, links, metadata
    print("[-] Error while getting image links")
    sys.exit(1)
|
||||
|
||||
def login(email, password):
    """Log into archive.org and return the authenticated session.

    Args:
        email: archive.org account email.
        password: archive.org account password.

    Returns:
        requests.Session carrying the login cookies.

    Exits the process on bad credentials or unexpected server responses.
    """
    session = requests.Session()
    # First request fetches a one-time login token.
    response = session.get("https://archive.org/services/account/login/")
    login_data = response.json()
    if not login_data['success']:
        display_error(response, "[-] Error while getting login token:")

    login_token = login_data["value"]["token"]

    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {"username": email, "password": password, "t": login_token}

    response = session.post("https://archive.org/services/account/login/", headers=headers, data=json.dumps(data))
    try:
        response_json = response.json()
    except ValueError:  # body was not JSON — narrowed from a bare except
        display_error(response, "[-] Error while login:")

    # Truthiness test instead of "== False" comparison.
    if not response_json["success"]:
        if response_json["value"] == "bad_login":
            print("[-] Invalid credentials!")
            sys.exit(1)
        display_error(response, "[-] Error while login:")
    else:
        print("[+] Successful login")
        return session
|
||||
|
||||
def loan(session, book_id, verbose=True):
    """Borrow (loan) a lendable book for this session.

    Args:
        session: authenticated requests.Session.
        book_id: archive.org item identifier.
        verbose: print a success message when the loan is granted.

    Returns:
        The same session, now holding a loan token (or needing none).

    Exits the process if the book cannot be borrowed.
    """
    data = {
        "action": "grant_access",
        "identifier": book_id
    }
    response = session.post("https://archive.org/services/loans/loan/searchInside.php", data=data)
    data['action'] = "browse_book"
    response = session.post("https://archive.org/services/loans/loan/", data=data)

    if response.status_code == 400:
        try:
            if response.json()["error"] == "This book is not available to borrow at this time. Please try again later.":
                print("This book doesn't need to be borrowed")
                return session
            display_error(response, "Something went wrong when trying to borrow the book.")
        # NOTE(review): was a bare "except:", which also swallowed the
        # SystemExit raised by display_error above and printed a second,
        # misleading error. Exception covers the intended failure
        # (non-JSON body / missing "error" key) without that.
        except Exception:
            display_error(response, "The book cannot be borrowed")

    data['action'] = "create_token"
    response = session.post("https://archive.org/services/loans/loan/", data=data)

    if "token" in response.text:
        if verbose:
            print("[+] Successful loan")
        return session
    display_error(response, "Something went wrong when trying to borrow the book, maybe you can't borrow this book.")
|
||||
|
||||
def return_loan(session, book_id):
    """Return a borrowed book so other patrons can loan it."""
    payload = {"action": "return_loan", "identifier": book_id}
    response = session.post("https://archive.org/services/loans/loan/", data=payload)
    # Short-circuit keeps .json() from running on non-200 responses.
    returned = response.status_code == 200 and response.json()["success"]
    if returned:
        print("[+] Book returned")
    else:
        display_error(response, "Something went wrong when trying to return the book")
|
||||
|
||||
def image_name(pages, page, directory):
    """Build the zero-padded JPEG path for page *page* of *pages* total."""
    width = len(str(pages))  # pad so file names sort in page order
    return f"{directory}/{str(page).zfill(width)}.jpg"
|
||||
|
||||
def deobfuscate_image(image_data, link, obf_header):
    """
    @Author: https://github.com/justimm

    Decrypt the first 1024 bytes of *image_data* with AES-CTR and return the
    full image.

    The X-Obfuscate header has the form "1|<base64 counter>" where the
    decoded counter is exactly 16 bytes. The AES key is the first 16 bytes
    of the SHA-1 digest of the image URL with its protocol/host replaced by
    "/". The 16-byte counter block is split into an 8-byte fixed prefix and
    a big-endian 64-bit initial counter value.

    Raises:
        ValueError: malformed header, unsupported version, or bad counter size.
    """
    try:
        version, counter_b64 = obf_header.split('|')
    except Exception as exc:
        raise ValueError("Invalid X-Obfuscate header format") from exc

    if version != '1':
        raise ValueError("Unsupported obfuscation version: " + version)

    # Key = first 16 bytes of SHA-1 over the path portion of the URL.
    path_only = re.sub(r"^https?:\/\/.*?\/", "/", link)
    key = hashlib.sha1(path_only.encode('utf-8')).digest()[:16]

    counter_raw = base64.b64decode(counter_b64)
    if len(counter_raw) != 16:
        raise ValueError(f"Expected counter to be 16 bytes, got {len(counter_raw)}")

    # 8-byte fixed prefix + 64-bit big-endian initial value.
    ctr = Counter.new(
        64,
        prefix=counter_raw[:8],
        initial_value=int.from_bytes(counter_raw[8:], byteorder='big'),
        little_endian=False,
    )
    cipher = AES.new(key, AES.MODE_CTR, counter=ctr)

    # Only the first KiB is obfuscated; the remainder passes through as-is.
    return cipher.decrypt(image_data[:1024]) + image_data[1024:]
|
||||
|
||||
def download_one_image(session, link, i, directory, book_id, pages):
    """Download page *i* of the book into *directory*, re-borrowing the book
    on 403 and deobfuscating the payload when archive.org flags it.

    Args:
        session: authenticated requests.Session.
        link: full image URL (already carrying rotate/scale parameters).
        i: zero-based page index, used for the file name.
        directory: destination directory for the JPEG.
        book_id: archive.org identifier, needed to renew the loan on 403.
        pages: total page count, used to zero-pad the file name.
    """
    headers = {
        "Referer": "https://archive.org/",
        "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Dest": "image",
    }
    retry = True
    response = None
    while retry:
        try:
            response = session.get(link, headers=headers)
            if response.status_code == 403:
                # Loan expired mid-download: borrow again and retry this page.
                session = loan(session, book_id, verbose=False)
                raise Exception("Borrow again")
            elif response.status_code == 200:
                retry = False
        # NOTE(review): narrowed from a bare "except:", which also caught
        # KeyboardInterrupt/SystemExit and made the retry loop unbreakable.
        except Exception:
            time.sleep(1)  # Wait 1 second before retrying

    image = image_name(pages, i, directory)

    obf_header = response.headers.get("X-Obfuscate")
    image_content = None
    if obf_header:
        try:
            image_content = deobfuscate_image(response.content, link, obf_header)
        except Exception as e:
            print(f"[ERROR] Deobfuscation failed: {e}")
            return
    else:
        image_content = response.content

    with open(image, "wb") as f:
        f.write(image_content)
|
||||
|
||||
def download(session, n_threads, directory, links, scale, book_id):
    """Download every page image concurrently.

    Args:
        session: authenticated requests.Session.
        n_threads: maximum worker threads.
        directory: destination directory for the JPEGs.
        links: raw page-image URIs (scale/rotate params appended here).
        scale: resolution divisor (0 = highest).
        book_id: archive.org identifier, forwarded for loan renewal.

    Returns:
        List of the downloaded image file paths, in page order.
    """
    print("Downloading pages...")
    links = [f"{link}&rotate=0&scale={scale}" for link in links]
    pages = len(links)

    tasks = []
    with futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        # NOTE(review): enumerate() instead of links.index(link) — the
        # original was O(n^2) and mapped duplicate URLs to the same index,
        # overwriting one file and leaving another page missing.
        for i, link in enumerate(links):
            tasks.append(executor.submit(download_one_image, session=session, link=link, i=i, directory=directory, book_id=book_id, pages=pages))
        # tqdm only tracks completion; results are written to disk by workers.
        for _ in tqdm(futures.as_completed(tasks), total=len(tasks)):
            pass

    return [image_name(pages, i, directory) for i in range(len(links))]
|
||||
|
||||
def make_pdf(pdf, title, directory):
    """Write *pdf* bytes to "<title>.pdf" inside *directory*, appending a
    "(n)" suffix instead of overwriting an existing file of the same name."""
    candidate = title + ".pdf"
    suffix = 1
    # Probe for a free name: title.pdf, title(1).pdf, title(2).pdf, ...
    while os.path.isfile(os.path.join(directory, candidate)):
        candidate = f"{title}({suffix}).pdf"
        suffix += 1

    with open(os.path.join(directory, candidate), "wb") as out:
        out.write(pdf)
    print(f"[+] PDF saved as \"{candidate}\"")
|
||||
|
||||
if __name__ == "__main__":

    my_parser = argparse.ArgumentParser()
    my_parser.add_argument('-e', '--email', help='Your archive.org email', type=str, required=True)
    my_parser.add_argument('-p', '--password', help='Your archive.org password', type=str, required=True)
    my_parser.add_argument('-u', '--url', help='Link to the book (https://archive.org/details/XXXX). You can use this argument several times to download multiple books', action='append', type=str)
    my_parser.add_argument('-d', '--dir', help='Output directory', type=str)
    my_parser.add_argument('-f', '--file', help='File where are stored the URLs of the books to download', type=str)
    my_parser.add_argument('-r', '--resolution', help='Image resolution (10 to 0, 0 is the highest), [default 3]', type=int, default=3)
    my_parser.add_argument('-t', '--threads', help="Maximum number of threads, [default 50]", type=int, default=50)
    my_parser.add_argument('-j', '--jpg', help="Output to individual JPG's rather than a PDF", action='store_true')
    my_parser.add_argument('-m', '--meta', help="Output the metadata of the book to a json file (-j option required)", action='store_true')

    # With no arguments at all, show full usage instead of argparse's terse error.
    if len(sys.argv) == 1:
        my_parser.print_help(sys.stderr)
        sys.exit(1)
    args = my_parser.parse_args()

    if args.url is None and args.file is None:
        my_parser.error("At least one of --url and --file required")

    email = args.email
    password = args.password
    scale = args.resolution
    n_threads = args.threads
    d = args.dir

    if d is None:  # identity check instead of "== None"
        d = os.getcwd()
    elif not os.path.isdir(d):
        print("Output directory does not exist!")
        sys.exit(1)

    # Collect book URLs either from the CLI (-u, repeatable) or from a
    # file (-f, one URL per line).
    if args.url is not None:
        urls = args.url
    else:
        if os.path.exists(args.file):
            with open(args.file) as f:
                urls = f.read().strip().split("\n")
        else:
            print(f"{args.file} does not exist!")
            sys.exit(1)

    # Check the urls format
    for url in urls:
        if not url.startswith("https://archive.org/details/"):
            print(f"{url} --> Invalid url. URL must starts with \"https://archive.org/details/\"")
            sys.exit(1)

    print(f"{len(urls)} Book(s) to download")
    session = login(email, password)

    for url in urls:
        # The identifier is the 4th non-empty path segment of the details URL.
        book_id = list(filter(None, url.split("/")))[3]
        print("="*40)
        print(f"Current book: https://archive.org/details/{book_id}")
        session = loan(session, book_id)
        title, links, metadata = get_book_infos(session, url)

        directory = os.path.join(d, title)
        # Handle the case where multiple books with the same name are downloaded
        i = 1
        _directory = directory
        while os.path.isdir(directory):
            directory = f"{_directory}({i})"
            i += 1
        os.makedirs(directory)

        if args.meta:
            print("Writing metadata.json...")
            with open(f"{directory}/metadata.json", 'w') as f:
                json.dump(metadata, f)

        images = download(session, n_threads, directory, links, scale, book_id)

        if not args.jpg:  # Create pdf with images and remove the images folder
            import img2pdf

            # Prepare PDF metadata; archive metadata is sometimes missing.
            pdfmeta = {}
            # img2pdf needs str values, so join list-valued fields.
            for key in ["title", "creator", "associated-names"]:
                if key in metadata:
                    if isinstance(metadata[key], str):
                        pass
                    elif isinstance(metadata[key], list):
                        metadata[key] = "; ".join(metadata[key])
                    else:
                        raise Exception("unsupported metadata type")
            # title
            if 'title' in metadata:
                pdfmeta['title'] = metadata['title']
            # author
            if 'creator' in metadata and 'associated-names' in metadata:
                pdfmeta['author'] = metadata['creator'] + "; " + metadata['associated-names']
            elif 'creator' in metadata:
                pdfmeta['author'] = metadata['creator']
            elif 'associated-names' in metadata:
                pdfmeta['author'] = metadata['associated-names']
            # date
            if 'date' in metadata:
                try:
                    pdfmeta['creationdate'] = datetime.strptime(metadata['date'][0:4], '%Y')
                except ValueError:  # malformed date: just omit it from the PDF
                    pass
            # keywords
            pdfmeta['keywords'] = [f"https://archive.org/details/{book_id}"]

            pdf = img2pdf.convert(images, **pdfmeta)
            make_pdf(pdf, title, args.dir if args.dir is not None else "")
            # Best-effort cleanup of the per-page JPEGs once the PDF exists.
            try:
                shutil.rmtree(directory)
            except OSError as e:
                print(f"Error: {e.filename} - {e.strerror}.")

        return_loan(session, book_id)
|
||||
# (end of extracted file; trailing git web-view chrome removed)