71 lines
1.9 KiB
Python

#
# books.py
# Gera um arquivo .sql com INSERTs de livros obtidos do Project Gutenberg
# https://gutendex.com/
#
import http.client
import json
import time
from urllib.parse import urlparse
# Base endpoint of the book listing; pages are requested as "<API_URL>?page=N".
API_URL = "https://gutendex.com/books/"
# Number of books to collect before stopping. Whole pages are written, so the
# final total may exceed this value.
BOOKS_COUNT = 1000
# Pause between consecutive page requests, in seconds (passed to time.sleep).
FETCH_DELAY = 1  # s
# Path of the generated SQL file.
OUTPUT = "./books.sql"
def fetch(resource):
    """Perform an HTTP(S) GET on *resource* and return the decoded JSON body.

    Args:
        resource: Absolute URL. An ``https`` scheme selects an HTTPS
            connection; anything else uses plain HTTP.

    Returns:
        The parsed JSON payload, or an empty dict when the server replies
        with a status outside the 2xx range.

    Raises:
        OSError: If the connection fails or times out.
        json.JSONDecodeError: If a 2xx response body is not valid JSON.
    """
    url = urlparse(resource)
    client = (
        http.client.HTTPSConnection
        if url.scheme == "https"
        else http.client.HTTPConnection
    )

    # An empty path is not a valid request target, so fall back to the root;
    # only append "?" when there actually is a query string.
    target = url.path or "/"
    if url.query:
        target = f"{target}?{url.query}"

    # A timeout keeps a stalled server from blocking the script forever.
    conn = client(url.netloc, timeout=30)
    try:
        conn.request("GET", target)
        res = conn.getresponse()
        if not 200 <= res.status <= 299:
            return {}
        return json.loads(res.read())
    finally:
        # Always release the socket, including on errors and non-2xx replies.
        conn.close()
def write_inserts(file, page):
    """Fetch one page of books and append a multi-row INSERT to *file*.

    Args:
        file: Writable text file object that receives the SQL statement.
        page: Page number sent as the ``page`` query parameter of API_URL.

    Returns:
        The number of rows written. When the page has no results (or the
        response carries no ``results`` key) nothing is written and 0 is
        returned.
    """
    data = fetch(f"{API_URL}?page={page}")
    # fetch() yields {} on a non-2xx reply, so "results" may be missing.
    books = data.get("results") or []

    def esc(text):
        # Escape backslashes first; otherwise the backslash added for each
        # quote would itself get doubled and the literal would break.
        return text.replace("\\", "\\\\").replace('"', '\\"')

    lines = []
    for book in books:
        summaries = book["summaries"]
        imgs = [
            link
            for mime, link in book["formats"].items()
            if mime.startswith("image/")
        ]

        isbn = ""  # the fetched data is not used for ISBN; column left empty
        titulo = book["title"]
        autor = ",".join(a["name"] for a in book["authors"])
        genero = ",".join(book["subjects"])
        descricao = summaries[0] if summaries else ""
        foto = imgs[0] if imgs else ""
        keywords = ",".join(book["bookshelves"])

        fields = (isbn, titulo, autor, genero, descricao, foto, keywords)
        quoted = ", ".join(f'"{esc(value)}"' for value in fields)
        lines.append(f"({quoted}, true, NOW(), NOW())")

    if not lines:
        # "INSERT ... VALUES ;" is not valid SQL, so emit nothing at all.
        return 0

    values = ",\n".join(lines)
    insert = f"INSERT INTO Livro (Isbn, Titulo, Autor, Genero, Descricao, Foto, Keywords, Ativo, CriadoEm, AtualizadoEm) VALUES {values};\n"
    file.write(insert)
    return len(lines)
def collect(n, file, _page=1):
    """Write INSERT statements page by page until at least *n* books exist.

    Args:
        n: Number of books still wanted; nothing happens when ``n <= 0``.
        file: Writable text file object handed to write_inserts().
        _page: Page number to start from (internal; defaults to the first).

    Returns:
        None. Stops early when a page produces no rows.
    """
    remaining = n
    page = _page
    while remaining > 0:
        written = write_inserts(file, page)
        if written <= 0:
            # No rows came back (e.g. past the last page). Stop here instead
            # of requesting further pages without ever making progress.
            break
        remaining -= written
        page += 1
        if remaining > 0:
            # Pause only between requests, not after the final one.
            time.sleep(FETCH_DELAY)
def run():
    """Create (or truncate) OUTPUT and fill it with INSERTs for BOOKS_COUNT books."""
    # Book metadata may contain non-ASCII characters; pin UTF-8 so the write
    # does not depend on the platform's default encoding.
    with open(OUTPUT, "w", encoding="utf-8") as file:
        collect(BOOKS_COUNT, file)
# Generate the SQL file only when executed as a script, so that importing
# this module does not trigger network requests or file writes.
if __name__ == "__main__":
    run()