-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_deepdreams.py
362 lines (279 loc) · 14.2 KB
/
get_deepdreams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# Standard library
import math
import os
import secrets  # project-local secrets.py (gitignored), NOT the stdlib secrets module
import urllib
import urllib.request  # needed for urllib.request.urlretrieve; bare "import urllib" is not enough

# Third-party
import requests
from bs4 import BeautifulSoup
"""
HOW TO USE THIS SCRIPT:
1. Add a "secrets.py" file with your DDG login credentials (see the comment above the definition
of the username/email/password variables)
2. Run this script (in IDLE, hit F5)
3. Type a number between 1 and 4 into the console. The options are straightforward; they're just
how you'd like the dreams to be sorted. Options are 'latest', 'best', and 'all'. 'All' includes
non-public dreams too; it is basically the 'latest' sorting but including non-public dreams.
Option '4' is all of the above run one by one (could parallelize this but it might annoy DDG and I think
they only like one auth at a time, so best not to do this without asking them nicely)
4. Sit back and watch your dreams come true. Dreams get downloaded into directories wherever this script
lives. Directories include 'latest_dreams' (option 1), 'best_dreams' (option 2) and
'all_dreams' (option 3). Option 4 will populate all of these directories for you. :)
NOTEs:
'DDG' => DeepDreamGenerator
TODOs:
- Make a command line option for username, email, and password
- More error checking
- Rework the script so it isn't a giant 300 line file, I made get_deep_dreams() as a hacky work
around to add option 4 (which runs all sorting options)
- Probably should ask DDG if this is really okay, but so far they haven't yelled at me or banned me
Also now I have my dreams so I don't really care if they ban me but I would be really sad if they did
(DDG if you're reading this, I tried to follow your rules, plz don't ban me ok?)
"""
#
# ¡¡¡ NOTE: !!!
# DO NOT SCRAPE MY DREAMS, that goes against DeepDreamGenerator's guidelines:
# https://deepdreamgenerator.com/info
#
# However, I feel like I SHOULD be allowed to download my own dreams, hence this script.
#
# Thus, you MUST use your own user credentials to run this script. I no longer include
# mine and my secrets.py file is .gitignore'd so make sure to make a DDG account to
# take full advantage of this script
#
"""
********************************************************************************
* Note: using another .py file called "secrets.py" that is hidden from Git/GitHub
* If you wish to use this script with the "/best" or "all" filters, you'll
* need to provide credentials for your Deep Dream Generator account in the
* "secrets.yaml" file. You'll need to create it yourself in the format:
*
* email: your@email.com
* password: your_password
* username: your_ddg_username
*
* Based on: https://stackoverflow.com/a/25501861
*
********************************************************************************
"""
username = secrets.username
email = secrets.email
password = secrets.password
dream_url = "https://deepdreamgenerator.com/u/" + username
login_url = "https://deepdreamgenerator.com/login"
"""
If using the "/best" sorting option, you MUST log into Deep Dream Generator
Following this site's recommendation on how to log in using Python + Requests
so we can still download dreams. :-)
https://towardsdatascience.com/scraping-data-behind-site-logins-with-python-ee0676f523ee
Some other very helpful articles/posts:
https://kazuar.github.io/scraping-tutorial/
https://stackoverflow.com/a/25284165
"""
session = requests.Session()
# Get the login page so we can get the token from the page
login_page = session.get(login_url)
# Parse out the token - it's under an input with the name '_token'
login_soup = BeautifulSoup(login_page.content, 'html.parser')
token = login_soup.find('input', {'name': '_token'}).get('value')
# Obviously NOT publishing these details to Github nor do I recommend ANYONE do such a thing on purpose...
payload = {'email': email,
'password': password,
'_token': token}
# Try to post the payload to deep dream generator so we can log in
s = session.post(login_url, data=payload, headers = dict(referer=login_url))
# Get the largest dream page number - used to estimate how many dreams we
# might be downloading
def get_largest_page_number(pagination_ul_list):
max_page = 0
for pagination_element in pagination_ul_list:
if pagination_element is not None:
try:
if int(pagination_element.text) > max_page:
max_page = int(pagination_element.text)
except:
# A few of the pagination elements aren't valid integers
# Examples: < ... >
# So just ignore them.
#print("Invalid page number found for %s" % pagination_element)
continue
return max_page
# Get all the URLs to download images off of
def get_img_urls(number_of_pages):
img_urls = []
# Counter to make sure we got the right amount of dream urls
real_number_of_dreams = 0
# Now we can loop over all the deep dream pages.
# We have the number of pages from parsing the first page.
# Number of pages is currently 9, so this loop should go pages 1 to 9
# +1 because range doesn't include the end value
for page_num in range (1, int(number_of_pages) + 1):
# I deleted the copy/pasta code, so now this loop handles all dreams
# Thus, must handle the edge case of first page not having a ?page=
# in the dream page URL
cur_dream_url = ''
if page_num == 1:
cur_dream_url = dream_url
else:
cur_dream_url = dream_url + "?page=" + str(page_num)
print("cur_dream_url is: %s" % cur_dream_url)
# Step 1: Get the dream page
# Step 2: Use BeautifulSoup 2 parse the dream page
# Step 3: Get list of dream
# NOTE: probably fine using requests here, but if that breaks, switch to 'session' instead
dream_page = session.get(cur_dream_url)
dream_soup = BeautifulSoup(dream_page.content, 'html.parser')
dreams = dream_soup.find_all('div', class_='item')
# Counter to monitor number of dreams per page
# Testing shows max of 24 dreams per page, but could be less due to incomplete pages
dream_count = 0
# Step 4: Get list of dream img URLs
for dream in dreams:
# Find the img's HTML
img_html = dream.find('img', class_='light-gallery-item')
# Ok, that worked, we have the imgs! Now just save off the data-src
# for each one - that's our img url :-)
img_url = img_html['data-src']
img_urls.append(img_url)
dream_count += 1
real_number_of_dreams += 1
# At this point, we're done for the current page.
# Let's check how many dreams we got for the page though
print("Found %d dreams for page %d!" % (dream_count, page_num))
# Debugging - add a break here if things break, so you don't go through
# all the pages just to find out something broke with the img downloading code.
#break;
# Some debugging checks
# Should have gotten the same number of dreams as seen previously
# So number_of_dreams should equal real_number_of_dreams
print("\nFound %d dreams over %d pages" % (real_number_of_dreams, int(number_of_pages)))
return img_urls
# Given a sorting type and an list of image URLs, this function downloads all of the
# images which haven't been previously downloaded before
def download_all_dreams(downloads_dir, sorting_type, img_urls):
    """Download every image URL in img_urls into downloads_dir.

    Images already present on disk are skipped, so re-running after publishing
    new dreams only fetches the new ones. Download failures are reported and
    counted but do not stop the run.

    Args:
        downloads_dir: directory the images are saved into (must already exist).
        sorting_type: "2" renames files by rank ("dream_num1.jpg" is the top
            dream); any other value keeps DDG's original file name.
        img_urls: list of direct image URLs to fetch.
    """
    dream_downloaded = 0  # fixed: started at 1 before, overstating the count
    dream_errors = 0
    # enumerate from 1 so "best" ranks and the old 1-based counter line up
    for dream_count, img_url in enumerate(img_urls, 1):
        name = os.path.basename(img_url)
        # If sorting by best, then filename should be "dream_num###.jpg"
        # instead of whatever randomly generated filename DeepDreamGenerator uses
        if sorting_type == "2":
            name = "dream_num" + str(dream_count) + ".jpg"
        filename = os.path.join(downloads_dir, name)
        if os.path.isfile(filename):
            print("%s already downloaded!" % filename)
            continue
        print("Downloading: %s to %s" % (name, filename))
        try:
            urllib.request.urlretrieve(img_url, filename)
            dream_downloaded += 1
        except Exception as exception:
            # Best-effort: report and keep going so one bad URL doesn't
            # abort the whole run; totals are summarized below.
            print(exception)
            print("Encountered unknown error when trying to download %s. Continuing." % filename)
            dream_errors += 1
    # At this point, we should have all the dreams nicely downloaded.
    # Since we skip dreams already downloaded, this shouldn't take long to run
    # after publishing new dreams.
    # Fixed: report len(img_urls) instead of a counter that ended at len+1.
    print("\nFound %d dreams. Downloaded %d of them this run.\n" % (len(img_urls), dream_downloaded))
    print("NOTE: found %d errors when trying to download dreams.\n" % dream_errors)
""""
Basically, this will get all the dreams at a given deepdreamgenerator URL.
You can do:
/ ==> latest dreams by default
/best ==> best dreams, sorted by the number of likes on DDG
/account ==> latest dreams but also those that aren't public
"""
def get_deep_dreams():
    """Scrape the current dream_url and download every dream image found.

    Relies on module globals: `session` (logged-in requests session),
    `dream_url` (profile URL, possibly with a "/best" or "/account" suffix)
    and `sorting_type` ("1"/"2"/"3", set by the prompt at the bottom of the
    file). Raises SystemExit when the page looks like a failed login.
    """
    print("Dream url is: %s" % dream_url)
    # Step 1: Get the first dream page, determine how many dreams I have
    dream_page = session.get(dream_url)
    # Debug what we got - should say <Response [200]> if successful
    print(str(dream_page))
    # Step 2: Use BeautifulSoup to parse the page
    dream_soup = BeautifulSoup(dream_page.content, 'html.parser')
    # Debug what soup got
    # This is YUGE so uncomment if you dare.
    #print(dream_soup)
    # Step 3: Get number of dreams I have
    # It's contained in a span with the class 'counter-box'
    counter_box = dream_soup.find('span', class_='counter-box')
    print(counter_box)
    # Check for no counter_box; this is a tell-tale sign we're stuck at the DDG login page...
    # If so, the safest option is to warn the user & exit, instead of causing an exception on the next line.
    if counter_box is None:
        print("ERROR: log in to Deep Dream Generator Failed! Check your creds or maybe this script borked. :(")
        # Ye this is apparently not recommended but it doesn't quit the Idle shell on my Windows box so...
        raise SystemExit
    number_of_dreams = int(counter_box.text)
    # Divide number of dreams by 24 (dreams per page), AND ALWAYS round up [ hence math.ceil() ]
    # Since we want 9.1 to turn into Page 10, not Page 9
    number_of_pages = math.ceil(number_of_dreams / 24)
    print("We should parse %d pages I thinks...\n" % number_of_pages)
    # For sorting_type "3", we'll have to grab the number of pages from the
    # "pagination" ul class. Note that these can be spans or hrefs! so don't
    # be too specific to beautifulsoup.
    pagination_ul_list = dream_soup.find_all(class_='page-link')
    max_page = get_largest_page_number(pagination_ul_list)
    # Debugging
    print("Looks like the number of pages should be %d" % max_page)
    # Only use max_page if we're downloading ALL of my dreams;
    # otherwise we stick to the tried & true counter_box count.
    if sorting_type == "3":
        number_of_pages = max_page
    # List of all the dream img urls for mass downloading later on
    img_urls = get_img_urls(number_of_pages)
    # Debugging: does our list contain the same number of URLs?
    print("Number of dreams we thought we might find: %d" % number_of_dreams)
    print("img_urls contains %d img urls\n" % len(img_urls))
    # Now that we have a list of img URLs, download them into a directory
    # whose name depends on the chosen sorting option.
    downloads_dir = "latest_dreams"
    # Using a separate dir for best dream sorting download
    if sorting_type == "2":
        downloads_dir = "best_dreams"
    elif sorting_type == "3":
        downloads_dir = "all_dreams"
    does_dir_exist = os.path.isdir(downloads_dir)
    if does_dir_exist is False:
        print("Making the %s directory..." % downloads_dir)
        os.mkdir(downloads_dir)
    else:
        print("%s directory already exists :)" % downloads_dir)
    # Finally, download all of my dreams!
    download_all_dreams(downloads_dir, sorting_type, img_urls)
    print("Hopefully all your dreams have come true!")
# Step 0: Do we want to download dreams by latest or best sorting? Or all the dreams, even non-public ones?
# If we pick latest, we name each dream based on the DDG file name.
# If we pick best, we name the dreams like "dream_num1.jpg", "dream_num2",
# etc. where 1 == #1 dream, 2 == #2 dream, etc.
# If we pick all, then we also name dreams based on the DDG file name.
print("¡¡¡¡ WARNING: only download dreams you personally created !!!!\n\n")
print("** Deep Dream Generator Downloader V1.1 **")
print("Enter 1 for latest dream sorting, 2 for best dream sorting, 3 for all dreams and 4 for all of these: ")
# Prompt until one of the four valid options is entered.
sorting_type = ""
while sorting_type != "1" and sorting_type != "2" and sorting_type != "3" and sorting_type != "4":
    sorting_type = input()
    if sorting_type != "1" and sorting_type != "2" and sorting_type != "3" and sorting_type != "4" :
        print("\nError: invalid sorting type. ")
        print("Enter 1 for latest dream sorting, 2 for best dream sorting, 3 for all dreams \
and 4 for all of these: ")
# Append the sorting suffix to the profile URL for options 2 and 3.
if sorting_type == "2":
    dream_url = dream_url + "/best"
elif sorting_type == "3":
    dream_url = dream_url + "/account"
elif sorting_type == "4":
    # Option 4 runs options 1-3 back to back, rebinding the module-level
    # sorting_type / dream_url globals that get_deep_dreams() reads.
    # Method 1 first, "latest" sorting.
    # Make sure to set sorting_type so the rest of the script works; noticed directories weren't being
    # created when I ran this for the first time on a different machine. Oops!
    sorting_type = "1"
    dream_url = "https://deepdreamgenerator.com/u/" + username
    get_deep_dreams()
    # Method 2 second, "best" sorting
    sorting_type = "2"
    dream_url = "https://deepdreamgenerator.com/u/" + username + "/best"
    get_deep_dreams()
    # Finally Method 3, "all dreams even though not public"
    sorting_type = "3"
    dream_url = "https://deepdreamgenerator.com/u/" + username + "/account"
    get_deep_dreams()
    quit() # Fin
# If we end up here, then options 1 - 3 were selected. So grab the requested dreams!
get_deep_dreams()