#!/usr/bin/python3
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import os
import time
import re
import pyexiv2
import glob
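# third-party dependencies: requests, beautifulsoup4 and pyexiv2
# (most likely installable with: pip install requests beautifulsoup4 pyexiv2)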
# output_directory is where the image files will be saved
output_directory = ""
# log file is where any failed downloads, corrupt images, etc. will be written
logfile = open("", "a")
# how long to wait between each request in seconds
wait_time = 10
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
# ukiyo-e.org isn't perfectly stable so we need to use retries so it won't crash midway
sess = requests.Session()
retries = Retry(total=10, backoff_factor=10)
sess.mount("https://", HTTPAdapter(max_retries=retries))
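# with backoff_factor=10, urllib3 waits roughly backoff_factor * 2**(retry - 1)
# seconds between attempts (capped by urllib3's internal maximum), so transient
# outages are ridden out instead of crashing the run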
top_url = "https://ukiyo-e.org"
top_page = sess.get(top_url, timeout=10)
top_soup = BeautifulSoup(top_page.text, "html.parser")
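# collect every <a class="artist"> link on the landing page: the title attribute
# holds the artist name and the href points at that artist's listing page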
artist_pages = [
    {"artist_name": x["title"], "artist_url": x["href"]}
    for x in top_soup.find_all("a", class_="artist")
]
for artist in artist_pages:
    # create a folder for each individual artist in the dataset
    artist_path = os.path.join(output_directory, artist["artist_name"])
    if not os.path.exists(artist_path):
        os.makedirs(artist_path)
    print(
        f"throttling for {wait_time} seconds before grabbing first artist page for {artist['artist_name']}"
    )
    time.sleep(wait_time)
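    # soups collects the parsed listing page(s) for this artist, prints collects
    # one metadata dict per individual work found across those pages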
    prints = []
    soups = []
    try:
        artist_page = sess.get(artist["artist_url"], timeout=10)
        artist_soup = BeautifulSoup(artist_page.text, "html.parser")
        soups.append(artist_soup)
    except Exception as e:
        logfile.write(
            f"got {type(e)} error when trying to download an artist page {artist['artist_url']}\n"
        )
        continue
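    # the total number of works is read from the first <strong> element on the
    # listing page; it tells us how many paginated pages need to be fetched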
    # Handle pagination because only 100 prints are displayed at a time
    try:
        print_count = int(
            "".join([_ for _ in artist_soup.find("strong").text if _ in "0123456789"])
        )
    except Exception:
        logfile.write(f"failed to get print count for {artist['artist_url']}\n")
        print_count = 0
    if len(os.listdir(artist_path)) >= print_count:
        # already finished downloading this artist
        print(f"skipping artist {artist['artist_name']}")
        continue
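    # listing pages show at most 100 works; extra pages are requested with a
    # ?start=<offset> query parameter (100, 200, ...)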
    if print_count > 100:
        for i in range(1, print_count // 100 + 1):
            print(
                f"throttling for {wait_time} seconds before grabbing extra artist pages"
            )
            time.sleep(wait_time)
            start = i * 100
            try:
                artist_page = sess.get(
                    artist["artist_url"] + f"?start={start}", timeout=10
                )
                artist_soup = BeautifulSoup(artist_page.text, "html.parser")
                soups.append(artist_soup)
            except Exception as e:
                logfile.write(
                    f"got {type(e)} error when trying to download an artist page {artist['artist_url']}\n"
                )
    # iterate over each artist page to get a list of individual works
    for soup in soups:
        for div in soup.find_all("div", class_="img col-xs-6 col-sm-4 col-md-3"):
            try:
                print_url = div.find("a")["href"]
                print_metadata = artist.copy()
                print_metadata["print_url"] = print_url
                print_metadata["artist_path"] = artist_path + "/"
                prints.append(print_metadata)
            except Exception:
                logfile.write(f"{div} failed while extracting div from {artist}\n")
    # each print has its own page with metadata and a link to the full res image
    for work in prints:
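        # build the local filename from the last two path segments of the print URL;
        # the extension is only known once the image link is found, so the glob
        # below matches any extension to detect an earlier download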
        filename = "_".join(work["print_url"].split("/")[-2:])
        filepath = work["artist_path"] + filename
        if glob.glob(filepath + ".*"):
            # file already downloaded
            print(f"{filepath} already downloaded")
            continue
        print(f"throttling for {wait_time} seconds before grabbing print page\n")
        time.sleep(wait_time)
        try:
            print_page = sess.get(work["print_url"], timeout=10)
            print_soup = BeautifulSoup(print_page.text, "html.parser")
        except Exception as e:
            logfile.write(
                f"got {type(e)} error when trying to download a print {work['print_url']}\n"
            )
            continue
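        # all print metadata (title, date, description, download link) lives in a
        # div with class "details"; flatten its text so the regexes below can scan it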
        metadata = print_soup.find("div", class_="details")
        if metadata:
            text_metadata = re.sub(
                r"\t+", " ", re.sub(r"\s+", " ", metadata.text)
            ).strip()
        else:
            # no metadata so no download link
            continue
        image_search = metadata.find("a", class_="btn", href=True)
        if image_search:
            image_url = image_search["href"]
            try:
                image_extension = image_url.split(".")[-1]
            except Exception:
                image_extension = ""
        else:
            # no image just skip
            continue
        # description is the main field, with the title as a fallback
        description_search = re.search(
            r"Description\s*:([\s\S]+?)(Download Image|$)", text_metadata, re.IGNORECASE
        )
        title_search = re.search(
            r"Title\s*:([\s\S]+?)(?:[\S]+?:)", text_metadata, re.IGNORECASE
        )
        description = ""
        if description_search:
            description = description_search.group(1).strip()
        elif title_search:
            description = title_search.group(1).strip()
        # dates are in an extremely inconsistent format so we just grab it all
        date_search = re.search(
            r"Date\s*:([\s\S]+?)(?:[\S]+?:)", text_metadata, re.IGNORECASE
        )
        date = ""
        if date_search:
            date = date_search.group(1).strip()
        description = f"{work['artist_name']}, {description}, {date}".strip(", ")
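        # the tag text ends up as "<artist>, <description or title>, <date>",
        # with empty leading/trailing parts trimmed off by strip(", ")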
        # download image and save with metadata added to the exif tags
        filepath = filepath + "." + image_extension
        print(f"throttling for {wait_time} seconds before grabbing image: {filepath}")
        time.sleep(wait_time)
        try:
            image_response = sess.get(image_url, timeout=10)
        except Exception as e:
            logfile.write(
                f"got {type(e)} error when trying to download an image {image_url}\n"
            )
            continue
        if image_response:
            try:
                image = pyexiv2.ImageData(image_response.content)
            except Exception:
                # no image content
                logfile.write(f"{image_url} contains no content\n")
                continue
        else:
            logfile.write(f"{image_url} did not resolve\n")
            continue
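        # pyexiv2.ImageData works on the raw bytes in memory, so the exif tags can
        # be edited before anything is written to disk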
        # if something isn't jpeg or png then writing the exif data will fail
        try:
            image.clear_exif()
            image.modify_exif(
                {
                    "Exif.Image.ImageDescription": description,
                    "Exif.Image.Artist": work["artist_name"],
                }
            )
        except Exception:
            logfile.write(f"{filename} has no exif capability\n")
        # write file to disk
        with open(filepath, "wb") as outfile:
            outfile.write(image.get_bytes())
        image.close()
logfile.close()
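# usage sketch (assuming output_directory and the logfile path above have been
# filled in): run `python3 ukiyo-e_scraper.py`; reruns skip artists and prints
# that are already on disk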