#!/usr/bin/env python
"""
crawler.py
For any given site implementation, use the getOtherRecipeLinks()
results to crawl for other recipes automatically.
"""
from queue import Queue
from threading import Thread
from random import seed, randint
from time import sleep
from settings import PAUSE_CRAWLER, PAUSE_TIME_RANGE

import sys
sys.path.append('sites')

# sites implemented so far
from allrecipes import AllRecipes
from epicurious import Epicurious
from foodnetwork import FoodNetwork
from saveur import Saveur
from sirogohan import SiroGohan
from wsonoma import WilliamsSonoma

AVAILABLE = {
    'AllRecipes'     : AllRecipes,
    'Epicurious'     : Epicurious,
    'FoodNetwork'    : FoodNetwork,
    'SiroGohan'      : SiroGohan,
    'Saveur'         : Saveur,
    'WilliamsSonoma' : WilliamsSonoma,
}
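
# To add a site, drop its module into sites/ and register it in AVAILABLE;
# fetch() below assumes each site class takes a recipe URL and provides
# save(), store(db, collection) and getOtherRecipeLinks().

# Shared work queues: pending holds URLs waiting to be crawled, and fetched
# records URLs already processed so they are not crawled twice.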
pending = Queue()
fetched = Queue()

def site (label):
    """Return the site module corresponding to this label,
    defaulting to None if not available"""
    return AVAILABLE.get(label, None)

def fetch (src, save, db, collection, p, f):
    """This is the worker function to get the next recipe from
    the pending queue, save it, and put all the related urls
    on the pending queue for other workers to process"""
    while True:
        url = p.get()
        if url in f.queue:
            p.task_done()
        else:
            try:
                recipe = src(url)
                if save:
                    recipe.save()
                if db is not None and collection is not None:
                    recipe.store(db, collection)
                f.put(url)
                # queue up every related link that is not the url just fetched
                for link in recipe.getOtherRecipeLinks():
                    if link != url:
                        p.put(link)
            except ValueError:
                print('[warning] could not fetch:', url)
            p.task_done()
            if PAUSE_CRAWLER:
                # pause a random interval, between PAUSE_TIME_RANGE[0] and
                # PAUSE_TIME_RANGE[1] seconds, before continuing
                sleep(randint(PAUSE_TIME_RANGE[0], PAUSE_TIME_RANGE[1]))
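
# Worker threads run as daemons, so the process can exit once pending.join()
# returns in the main block below, even though each worker loops forever.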
if __name__ == "__main__":
"""Create a command-line main() entry point"""
if len(sys.argv) < 4:
# Define the usage
possibleSources = sorted(AVAILABLE.keys())
print(sys.argv[0], \
'[site: (' + '|'.join(possibleSources) +')]', \
'[file of seed urls]', \
'[threads]', \
'[save() (defaults to True)]', \
'[store() database (defaults to None)]', \
'[store() collection (defaults to None)]')
else:
# Do the deed
if PAUSE_CRAWLER:
seed()
module = site(sys.argv[1])
if module is None:
print('Sorry, that site is not yet available')
else:
threads = 1
try:
threads = int(sys.argv[3])
except ValueError:
pass
save = True
try:
saveReq = sys.argv[4].lower()
save = saveReq.startswith('t') or saveReq == 'true'
except IndexError:
pass
db = None
collection = None
try:
db = sys.argv[5]
collection = sys.argv[6]
except IndexError:
pass
for i in range(threads):
worker = Thread(target=fetch, args=(module, save, db, collection, pending, fetched,))
worker.setDaemon(True)
worker.start()
# load the file of initial urls and seed the pending queue
with open(sys.argv[2], 'r') as f:
links = f.read()
list(map(lambda link: pending.put(link), links.splitlines()))
pending.join()
# show the summary
print('Fetched and parsed:')
for i, link in enumerate(set(fetched.queue)):
print("{:,}".format(1+i), '\t', link)