#!/usr/bin/env python
'''Generate a static version of oldnyc.org consisting entirely of JSON.'''
import chardet
from collections import defaultdict, OrderedDict
import csv
import json
import record
import re
from distutils.dir_util import copy_tree
from shutil import copyfile
import subprocess
import sys
import time
import os

from dates import extract_years
from ocr import cleaner
import title_cleaner

# Make sure the oldnyc.github.io repo is in a clean state.
git_status = subprocess.check_output('git -C ../oldnyc.github.io status --porcelain'.split(' '))
if git_status.strip():
    sys.stderr.write('Make sure the ../oldnyc.github.io repo exists and is clean.\n')
    sys.exit(1)

# strip leading 'var popular_photos = ' and trailing ';'
popular_photos = json.loads(open('viewer/static/js/popular-photos.js', 'rb').read()[20:-2])
pop_ids = {x['id'] for x in popular_photos}
# strip leading 'var lat_lons = ' and trailing ';'
lat_lon_to_ids = json.loads(open('viewer/static/js/nyc-lat-lons-ny.js', 'rb').read()[15:-1])
rs = record.AllRecords('nyc/photos.pickle')
id_to_record = {r.photo_id(): r for r in rs}
id_to_dims = {}
for photo_id, width, height in csv.reader(open('nyc-image-sizes.txt')):
    id_to_dims[photo_id] = (int(width), int(height))

# rotated images based on user feedback
user_rotations = json.load(open('analysis/rotations/rotations.json'))
id_to_rotation = user_rotations['fixes']
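
# Derive the id of the back of a photo card from its front id: a trailing
# 'f' (with an optional '-a'-style suffix) becomes 'b', e.g. '123f-a' -> '123b'.
# (The example id is illustrative.)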
def get_back_id(photo_id):
    return re.sub(r'f?(?:-[a-z])?$', 'b', photo_id)

# Load the previous iteration of OCR. Corrections are applied on top of
# this.
old_data = json.load(open('../oldnyc.github.io/data.json', 'rb'))
old_photo_id_to_text = {r['photo_id']: r['text'] for r in old_data['photos'] if r['text']}
manual_ocr_fixes = json.load(open('ocr/feedback/fixes.json', 'rb'))
back_id_to_correction = manual_ocr_fixes['fixes']
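
# Start from the previous run's OCR text, then let manual corrections
# (keyed by back id) override it.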
id_to_text = {}
for photo_id in id_to_record.iterkeys():
    back_id = get_back_id(photo_id)
    if photo_id in old_photo_id_to_text:
        id_to_text[photo_id] = old_photo_id_to_text[photo_id]
    if back_id in back_id_to_correction:
        id_to_text[photo_id] = back_id_to_correction[back_id]['text']

# (This was only helpful on the initial run, when data came straight from
# Ocropus.)
# for k, txt in id_to_text.iteritems():
#     id_to_text[k] = cleaner.clean(txt)

back_id_to_text = None  # clear
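
# Build the URL for a photo's image asset, preferring the rotated variant
# hosted on oldnyc.org when a user-supplied rotation exists.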
def image_url(photo_id, is_thumb):
    degrees = id_to_rotation.get(photo_id)
    if not degrees:
        return 'http://oldnyc-assets.nypl.org/%s/%s.jpg' % (
            'thumb' if is_thumb else '600px', photo_id)
    else:
        return 'http://www.oldnyc.org/rotated-assets/%s/%s.%s.jpg' % (
            'thumb' if is_thumb else '600px', photo_id, degrees)
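
# Decode bytes as UTF-8, falling back to chardet's best guess for the
# occasional record in another encoding.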
def decode(b):
    try:
        return b.decode('utf8')
    except UnicodeDecodeError:
        return b.decode(chardet.detect(b)['encoding'])
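
# Assemble the JSON records for a list of photo ids, sorted by earliest
# year (undated photos sort last).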
def make_response(photo_ids):
    response = OrderedDict()
    for photo_id in photo_ids:
        r = id_to_record[photo_id]
        w, h = id_to_dims[photo_id]
        ocr_text = id_to_text.get(photo_id)
        # See also viewer/app.py
        title = decode(r.title())
        original_title = None
        if title_cleaner.is_pure_location(title):
            original_title = title
            title = ''
        assert r.description() == ''
        assert r.note() == ''
        rotation = id_to_rotation.get(photo_id)
        if rotation and (rotation % 180 == 90):
            w, h = h, w
        date = re.sub(r'\s+', ' ', r.date())
        response[photo_id] = {
            'title': title,
            'date': date,
            'years': extract_years(date),
            'folder': decode(r.location()),
            'width': w,
            'height': h,
            'text': ocr_text,
            'image_url': image_url(photo_id, is_thumb=False),
            'thumb_url': image_url(photo_id, is_thumb=True)
        }
        if original_title:
            response[photo_id]['original_title'] = original_title
        if rotation:
            response[photo_id]['rotation'] = rotation

    # Sort by earliest date; undated photos go to the back.
    ids = sorted(photo_ids, key=lambda id_: min(response[id_]['years']) or 'z')
    return OrderedDict((id_, response[id_]) for id_ in ids)

def merge(*args):
    '''Merge dictionaries.'''
    o = {}
    for x in args:
        o.update(x)
    return o
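
# Histogram of photo counts per year for one location's response dict.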
def group_by_year(response):
    counts = defaultdict(int)
    for rec in response.values():
        for year in extract_years(rec['date']):
            counts[year] += 1
    return OrderedDict((y, counts[y]) for y in sorted(counts.keys()))
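
# Write one JSON file per geocoded location and accumulate a flat list of
# all photo records for the complete data dump below.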
all_photos = []
latlon_to_count = {}
id4_to_latlon = defaultdict(lambda: {})  # first 4 of id -> id -> latlon
textless_photo_ids = []
for latlon, photo_ids in lat_lon_to_ids.iteritems():
    outfile = '../oldnyc.github.io/by-location/%s.json' % latlon.replace(',', '')
    response = make_response(photo_ids)
    latlon_to_count[latlon] = group_by_year(response)  # len(response)
    json.dump(response, open(outfile, 'wb'), indent=2)
    for id_ in photo_ids:
        id4_to_latlon[id_[:4]][id_] = latlon
    lat, lon = [float(x) for x in latlon.split(',')]
    for photo_id, photo in response.iteritems():
        if not photo['text'] and 'f' in photo_id:
            textless_photo_ids.append(photo_id)
        photo['photo_id'] = photo_id
        photo['location'] = {
            'lat': lat,
            'lon': lon
        }
        photo['width'] = int(photo['width'])
        photo['height'] = int(photo['height'])
        all_photos.append(photo)
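
# Popular photos, in the same record format as the per-location files.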
json.dump(make_response(pop_ids),
          open('../oldnyc.github.io/popular.json', 'wb'), indent=2)
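
# Year-by-year counts per map marker, written as a JS global (var lat_lons).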
with open('../oldnyc.github.io/lat-lon-counts.js', 'wb') as f:
    f.write('var lat_lons = %s;' % json.dumps(latlon_to_count, indent=2))

for id4, id_to_latlon in id4_to_latlon.iteritems():
    json.dump(id_to_latlon,
              open('../oldnyc.github.io/id4-to-location/%s.json' % id4, 'wb'),
              indent=2)

# List of photo IDs without backing text
json.dump({
    'photo_ids': textless_photo_ids
}, open('../oldnyc.github.io/notext.json', 'wb'))

# Complete data dump
all_photos.sort(key=lambda photo: photo['photo_id'])
timestamps = {
    'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    'rotation_time': user_rotations['last_date'],
    'ocr_time': manual_ocr_fixes['last_date'],
    'ocr_ms': manual_ocr_fixes['last_timestamp']
}
json.dump(merge({'photos': all_photos}, timestamps),
          open('../oldnyc.github.io/data.json', 'wb'),
          indent=2)
json.dump(timestamps,
          open('../oldnyc.github.io/timestamps.json', 'wb'),
          indent=2)