
Commit 983db16

Initial import
0 parents  commit 983db16

3 files changed: +226, -0

README.md (+5)

# Unsong book fetcher

Grabs the text of the book UNSONG by Scott Alexander from [unsongbook.com](http://unsongbook.com) and makes an epub out of it.

Requires Python 3, PIL, BeautifulSoup 4 with lxml, and the libmagic Python bindings. Creates a folder `cache` which caches the downloaded text and images, so the script can be re-run later and will be much quicker. Requires Calibre's `ebook-convert` to do the actual conversion to epub.
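The cache filenames are simply the URL with every non-alphanumeric character replaced by an underscore (the `slugify()` scheme from `get_unsong.py`); a quick illustration:

```python
import re

def cache_path(url):
    # Mirror of slugify() in get_unsong.py: URL -> cache filename
    return "cache/" + re.sub(r"[^A-Za-z0-9]", "_", url)

print(cache_path("http://unsongbook.com/prologue-2/"))
# -> cache/http___unsongbook_com_prologue_2_
```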

create_unsong_book.sh (+14)

#!/bin/bash

echo Fetching book
python3 get_unsong.py

echo Making ebook
# The TOC is built from the h1/h2 headings that get_unsong.py emits;
# '--chapter /' effectively disables calibre's automatic chapter detection.
ebook-convert Unsong.html Unsong.epub \
    --level1-toc="//h:h1" \
    --level2-toc="//h:h2[re:test(., 'chapter|interlude', 'i')]" \
    --no-chapters-in-toc --no-default-epub-cover \
    --authors "Scott Alexander" --language en \
    --chapter /

echo Done in Unsong.epub
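The `re:test(...)` predicate in the `--level2-toc` expression is an EXSLT regular-expression test with the `'i'` (case-insensitive) flag, so only chapter and interlude headings land in the second TOC level. The equivalent check, sketched in Python with illustrative titles:

```python
import re

# Same pattern and flag as the re:test() predicate above
level2 = re.compile(r"chapter|interlude", re.I)

for title in ["Chapter 5", "Interlude Gimel", "Author's Note 3"]:  # illustrative titles
    print(title, "->", bool(level2.search(title)))
# Chapter 5 -> True, Interlude Gimel -> True, Author's Note 3 -> False
```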

get_unsong.py (+207)

import urllib.request
from bs4 import BeautifulSoup
import re
import os
import json
import datetime
import base64
import magic  # libmagic bindings shipped with file(1), not the pypi "python-magic" API
import io
from PIL import Image, ImageDraw, ImageFont

CHAPTERS = []
AUTHOR_NOTES = []

# Make sure the download cache exists before anything tries to write to it
os.makedirs("cache", exist_ok=True)

header = """<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Unsong</title>
</head>
<body>
"""
footer = """<hr><article>
<p>Complete up to the date of creation of this ebook, which was %s.</p>
<p>Made from <a href="http://unsongbook.com/">the Unsong book website</a> by
the <a href="">Unsong fetcher script</a>
by <a href="https://kryogenix.org">Stuart Langridge</a>.</p>
</article>
</body></html>""" % (datetime.datetime.now(),)

def create_book():
    # Create the cover: the title image from the site, pasted over a vertical
    # gradient, with the author's name drawn underneath
    title_img_data = fetch_or_get("http://i.imgur.com/d9LvKMc.png", binary=True)
    bio = io.BytesIO(title_img_data)
    title_img = Image.open(bio)
    tw, th = title_img.size
    cw = int(tw * 1.5)
    ch = int(cw * 1.6)
    cover_img = Image.new("RGBA", (cw, ch))
    draw = ImageDraw.Draw(cover_img)
    gradient = ((180, 119, 14), (210, 181, 100))
    height = cover_img.size[1]
    rs, gs, bs = gradient[0]
    rend, gend, bend = gradient[1]  # renamed from re/ge/be so the re module isn't shadowed
    rr = rend - rs; gr = gend - gs; br = bend - bs
    for i in range(height):
        r = rs + int(rr * i / height)
        g = gs + int(gr * i / height)
        b = bs + int(br * i / height)
        draw.line([(0, i), (cw, i)], fill=(r, g, b))

    tlx = int((cw - tw) / 2)
    tly = int((ch - th) / 2)
    cover_img.paste(title_img, (tlx, tly), title_img)

    try:
        font = ImageFont.truetype("/usr/share/texlive/texmf-dist/fonts/truetype/public/opensans/OpenSans-Light.ttf", size=24)
    except OSError:
        font = None  # fall back to PIL's default font if Open Sans isn't installed

    txt = "Scott Alexander"
    # draw.textsize was removed in Pillow 10; measure with textbbox instead
    left, top, right, bottom = draw.textbbox((0, 0), txt, font=font)
    txtw = right - left
    draw.text((int((cw - txtw) / 2), ch - 100), txt, fill=(0, 0, 0), font=font)

    bio = io.BytesIO()
    cover_img.save(bio, "PNG")
    # base64.encodestring was removed in Python 3.9; encodebytes is the same thing
    cover_src = "data:image/png;base64,%s" % (base64.encodebytes(bio.getvalue()).decode("utf-8"))

    # Special handling for chapter 18, which should be in Book II, but Alexander
    # has done the navigation links wrongly, so we manually insert it before chapter 19
    nchapters = []
    c18 = None
    for c in CHAPTERS:
        if "Chapter 18:" in c:
            c18 = c
            continue
        elif "Chapter 19" in c:
            nchapters.append(c18)
        nchapters.append(c)

    fp = open("Unsong.html", encoding="utf-8", mode="w")
    fp.write(header)
    fp.write("<header><img src='%s' alt=''><h1>Unsong</h1><h2>Scott Alexander</h2></header>" % cover_src)
    fp.write("<main>")
    fp.write("\n\n\n".join(nchapters))
    fp.write("</main>")
    fp.write("<section>")
    fp.write("<h1>Appendix: Author Notes</h1>")
    fp.write("\n\n\n".join(AUTHOR_NOTES))
    fp.write("</section>")
    fp.write(footer)
    fp.close()

def slugify(url):
    # Turn a URL into a filesystem-safe cache filename
    return re.sub(r"[^A-Za-z0-9]", "_", url)

def fetch_or_get(url, binary=False):
    # Return the body of url, from the on-disk cache if we have it,
    # otherwise from the web (storing it in the cache for next time)
    slug = slugify(url)
    slug = "cache/%s" % slug
    if os.path.exists(slug):
        if binary:
            fp = open(slug, mode="rb")
        else:
            fp = open(slug, encoding="utf-8")
        data = fp.read()
        fp.close()
        #print("Got", url, "from cache")
    else:
        print("Fetching", url, "from web")
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
            }
        )
        fp = urllib.request.urlopen(req)
        data = fp.read()
        fp.close()
        if binary:
            fp = open(slug, mode="wb")
            fp.write(data)
            fp.close()
        else:
            fp = open(slug, encoding="utf-8", mode="w")
            fp.write(data.decode("utf-8"))
            fp.close()
    return data

def get_cached_parsed(url):
    # Read back the parsed (prev, html, details, next) tuple for a page, if cached
    slug = "CACHED_PARSED_%s" % (slugify(url),)
    slug = "cache/%s" % slug
    if not os.path.exists(slug): return
    fp = open(slug, encoding="utf-8")
    data = json.load(fp)
    fp.close()
    return data

def put_cached_parsed(url, data):
    # Store the parsed (prev, html, details, next) tuple for a page as JSON
    slug = "CACHED_PARSED_%s" % (slugify(url),)
    slug = "cache/%s" % slug
    fp = open(slug, encoding="utf-8", mode="w")
    json.dump(data, fp)
    fp.close()

def get_url(url):
    # Fetch one post: classify it as book heading, chapter, or author note,
    # inline its images as data: URIs, and return (prev, html, details, next)
    data = fetch_or_get(url, binary=False)
    cached_parsed = get_cached_parsed(url)
    if cached_parsed:
        return cached_parsed
    details = {}
    soup = BeautifulSoup(data, "lxml")
    post = soup.find_all("div", "post")
    nav = soup.find_all("div", "pjgm-navigation")
    heading = post[0].find_all("h1", "pjgm-posttitle")[0]
    if heading.text.lower().startswith("book"):
        details["type"] = "book"
    elif heading.text.lower().startswith("author"):
        details["type"] = "author note"
    else:
        details["type"] = "chapter"
    # Book headings stay <h1>; chapters and author notes are demoted to <h2>
    # so the TOC XPaths in create_unsong_book.sh pick them up at the right level
    if details["type"] == "book":
        heading.name = "h1"
    else:
        heading.name = "h2"
    content = post[0].find_all("div", "pjgm-postcontent")[0]
    prev = None
    next = None
    prevs = nav[0].find_all("a", {"rel": "prev"})
    if prevs: prev = prevs[0].attrs["href"]
    nexts = nav[0].find_all("a", {"rel": "next"})
    if nexts: next = nexts[0].attrs["href"]
    # Strip the WordPress social-sharing widgets
    share = soup.find_all("div", "sharedaddy")
    [s.extract() for s in share]

    # Cache images and inline them as data: URIs, sniffing the MIME type with libmagic
    for img in content.find_all("img"):
        img_url = img["src"]
        img_data = fetch_or_get(img_url, binary=True)
        magic_identifier = magic.open(magic.MIME)
        magic_identifier.load()
        img_type = magic_identifier.buffer(img_data)
        magic_identifier.close()
        img_type = img_type.split(";")[0]

        img["src"] = "data:%s;base64,%s" % (img_type, base64.encodebytes(img_data).decode("utf-8"))

    html = '<article class="%s">\n%s\n%s\n</article>\n' % (details["type"], heading, content)
    output = (prev, html, details, next)
    put_cached_parsed(url, output)

    return output

def get_next(next):
    # Walk the rel="next" chain from the prologue onwards; when the chain
    # runs out we have everything, so build the book
    previous, html, details, next = get_url(next)
    if details["type"] == "author note":
        AUTHOR_NOTES.append(html)
    else:
        CHAPTERS.append(html)
    if next:
        get_next(next)
    else:
        create_book()

if __name__ == "__main__":
    get_next("http://unsongbook.com/prologue-2/")
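A hypothetical interactive sketch of the parsing API (the module is importable as-is; this assumes unsongbook.com is reachable or the cache is already warm):

```python
import get_unsong

# get_url returns a (prev, html, details, next) tuple for one post
prev, html, details, next_url = get_unsong.get_url("http://unsongbook.com/prologue-2/")

print(details["type"])   # "book", "chapter", or "author note"
print(prev, next_url)    # neighbouring post URLs (None at either end of the chain)
print(html[:40])         # start of the '<article class="...">' fragment
```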
