import urllib.request
import sys
from bs4 import BeautifulSoup
import re
import os
import json
import datetime
import base64
import magic
import io
from PIL import Image, ImageDraw, ImageFont

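# Fetch Scott Alexander's serial novel Unsong from unsongbook.com, following
# the site's "next" navigation links onwards from the prologue, and assemble
# the chapters and author notes into a single self-contained Unsong.html.
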
CHAPTERS = []
AUTHOR_NOTES = []

header = """<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>Unsong</title>
</head>
<body>
"""

footer = """<hr><article>
<p>Complete up to the date of creation of this ebook, which was %s.</p>
<p>Made from <a href="http://unsongbook.com/">the Unsong book website</a> by
the <a href="">Unsong fetcher script</a>
by <a href="https://kryogenix.org">Stuart Langridge</a>.</p>
</article>
</body></html>""" % (datetime.datetime.now(),)

def create_book():
    # Build the cover: the title art from the site pasted over a vertical
    # orange gradient, with the author's name drawn beneath it.
    title_img_data = fetch_or_get("http://i.imgur.com/d9LvKMc.png", binary=True)
    bio = io.BytesIO(title_img_data)
    title_img = Image.open(bio)
    tw, th = title_img.size
    cw = int(tw * 1.5)
    ch = int(cw * 1.6)
    cover_img = Image.new("RGBA", (cw, ch))
    draw = ImageDraw.Draw(cover_img)
    gradient = ((180, 119, 14), (210, 181, 100))
    height = cover_img.size[1]
    rs, gs, bs = gradient[0]
    re_, ge_, be_ = gradient[1]  # trailing underscores so we don't shadow the re module
    rr = re_ - rs; gr = ge_ - gs; br = be_ - bs
    for i in range(height):
        r = rs + int(rr * i / height)
        g = gs + int(gr * i / height)
        b = bs + int(br * i / height)
        draw.line([(0, i), (cw, i)], fill=(r, g, b))

    # Centre the title image on the cover, using its own alpha as the mask.
    tlx = int((cw - tw) / 2)
    tly = int((ch - th) / 2)
    cover_img.paste(title_img, (tlx, tly), title_img)

    try:
        font = ImageFont.truetype("/usr/share/texlive/texmf-dist/fonts/truetype/public/opensans/OpenSans-Light.ttf", size=24)
    except OSError:
        font = None  # font not installed; PIL falls back to its default

    txt = "Scott Alexander"
    # textsize() was removed in Pillow 10; textbbox() is its replacement
    left, _, right, _ = draw.textbbox((0, 0), txt, font=font)
    txtw = right - left
    draw.text((int((cw - txtw) / 2), ch - 100), txt, fill=(0, 0, 0), font=font)

    bio = io.BytesIO()
    cover_img.save(bio, "PNG")
    # encodestring() was removed in Python 3.9; encodebytes() behaves identically
    cover_src = "data:image/png;base64,%s" % (base64.encodebytes(bio.getvalue()).decode("utf-8"))

    # Special handling for chapter 18, which should be in Book II but has
    # wrong navigation links on the site, so we manually insert it before
    # chapter 19.
    nchapters = []
    c18 = None
    for c in CHAPTERS:
        if "Chapter 18:" in c:
            c18 = c
            continue
        elif "Chapter 19" in c:
            nchapters.append(c18)
        nchapters.append(c)

    fp = open("Unsong.html", encoding="utf-8", mode="w")
    fp.write(header)
    fp.write("<header><img src='%s' alt=''><h1>Unsong</h1><h2>Scott Alexander</h2></header>" % cover_src)
    fp.write("<main>")
    fp.write("\n\n\n".join(nchapters))
    fp.write("</main>")
    fp.write("<section>")
    fp.write("<h1>Appendix: Author Notes</h1>")
    fp.write("\n\n\n".join(AUTHOR_NOTES))
    fp.write("</section>")
    fp.write(footer)
    fp.close()

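# Caching: raw page and image bytes are stored on disk in cache/, keyed by a
# slug of their URL; fully parsed posts are cached separately as JSON, so
# re-runs only fetch and parse whatever is new since last time.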
def slugify(url):
    # Turn a URL into a filesystem-safe cache key.
    return re.sub(r"[^A-Za-z0-9]", "_", url)

def fetch_or_get(url, binary=False):
    # Return the body of url, reading it from the cache/ directory if we
    # already have it and fetching (then caching) it from the web if not.
    slug = slugify(url)
    slug = "cache/%s" % slug
    if os.path.exists(slug):
        if binary:
            fp = open(slug, mode="rb")
        else:
            fp = open(slug, encoding="utf-8")
        data = fp.read()
        fp.close()
        #print("Got", url, "from cache")
    else:
        print("Fetching", url, "from web")
        req = urllib.request.Request(
            url,
            data=None,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
            }
        )
        fp = urllib.request.urlopen(req)
        data = fp.read()
        fp.close()
        if binary:
            fp = open(slug, mode="wb")
            fp.write(data)
            fp.close()
        else:
            fp = open(slug, encoding="utf-8", mode="w")
            fp.write(data.decode("utf-8"))
            fp.close()
    return data

def get_cached_parsed(url):
    # Return the parsed (prev, html, details, next) record for url from the
    # cache, or None if this page has not been parsed before.
    slug = "CACHED_PARSED_%s" % (slugify(url),)
    slug = "cache/%s" % slug
    if not os.path.exists(slug): return
    fp = open(slug, encoding="utf-8")
    data = json.load(fp)
    fp.close()
    return data

def put_cached_parsed(url, data):
    slug = "CACHED_PARSED_%s" % (slugify(url),)
    slug = "cache/%s" % slug
    fp = open(slug, encoding="utf-8", mode="w")
    json.dump(data, fp)
    fp.close()

def get_url(url):
    # Parse one post: classify it, clean it up, inline its images, and
    # return (prev_url, article_html, details, next_url).
    data = fetch_or_get(url, binary=False)
    cached_parsed = get_cached_parsed(url)
    if cached_parsed:
        return cached_parsed
    details = {}
    soup = BeautifulSoup(data, "lxml")
    post = soup.find_all("div", "post")
    nav = soup.find_all("div", "pjgm-navigation")
    heading = post[0].find_all("h1", "pjgm-posttitle")[0]
    if heading.text.lower().startswith("book"):
        details["type"] = "book"
    elif heading.text.lower().startswith("author"):
        details["type"] = "author note"
    else:
        details["type"] = "chapter"
    # Book titles stay as <h1>; chapter and author-note titles become <h2>
    if details["type"] == "book":
        heading.name = "h1"
    else:
        heading.name = "h2"
    content = post[0].find_all("div", "pjgm-postcontent")[0]
    prev_url = None
    next_url = None
    prevs = nav[0].find_all("a", {"rel": "prev"})
    if prevs: prev_url = prevs[0].attrs["href"]
    nexts = nav[0].find_all("a", {"rel": "next"})
    if nexts: next_url = nexts[0].attrs["href"]
    # Strip the social-sharing widgets from the post body
    for s in soup.find_all("div", "sharedaddy"):
        s.extract()

    # Inline every image as a data: URI so the ebook is self-contained,
    # sniffing the MIME type with libmagic since the URL may not reveal it
    for img in content.find_all("img"):
        img_url = img["src"]
        img_data = fetch_or_get(img_url, binary=True)
        magic_identifier = magic.open(magic.MIME)
        magic_identifier.load()
        img_type = magic_identifier.buffer(img_data)
        magic_identifier.close()
        img_type = img_type.split(";")[0]
        img["src"] = "data:%s;base64,%s" % (img_type, base64.encodebytes(img_data).decode("utf-8"))

    html = '<article class="%s">\n%s\n%s\n</article>\n' % (details["type"], heading, content)
    output = (prev_url, html, details, next_url)
    put_cached_parsed(url, output)

    return output

def get_next(next_url):
    # Fetch one post, file it as a chapter or an author note, and recurse
    # along the site's "next" links; when they run out, write the book.
    # (next_url rather than next, so we don't shadow the builtin.)
    prev_url, html, details, next_url = get_url(next_url)
    if details["type"] == "author note":
        AUTHOR_NOTES.append(html)
    else:
        CHAPTERS.append(html)
    if next_url:
        get_next(next_url)
    else:
        create_book()

if __name__ == "__main__":
    os.makedirs("cache", exist_ok=True)  # both caches write into cache/
    get_next("http://unsongbook.com/prologue-2/")
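
# Usage (assuming this file is saved as, e.g., unsong_fetcher.py):
#   python3 unsong_fetcher.py
# This writes Unsong.html next to the script; delete the cache/ directory
# to force a full re-fetch of every page and image.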