
Commit 0a267bd

Author: Sornalingam

Merge branch 'dev' into uitest

# Conflicts:
#   docs/topics/changelog.rst
#   requirements.txt
#   rest/tests/test_rest_service.py

2 parents efb40bd + d90d4e9, commit 0a267bd


42 files changed: +487 / -345 lines

ansible/roles/kafka/templates/consumer.properties.j2

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 # Zookeeper connection string
 # comma separated host:port pairs, each corresponding to a zk
 # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002"
-zookeeper.connect=127.0.0.1:2181
+zookeeper.connect={% for host in zookeeper_host_list %}{{ host }}:{{ zookeeper_client_port|default(2181) }}{% if not loop.last %},{% endif %}{% endfor %}
 
 # timeout in ms for connecting to zookeeper
 zookeeper.connection.timeout.ms=6000
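
For context, here is a minimal sketch of how the new template line renders. The variable names come from the diff; the host values are hypothetical, and in the real deployment they are supplied by the Ansible inventory rather than hard-coded:

from jinja2 import Template

# Hypothetical inventory values; Ansible normally provides these.
tmpl = Template(
    "zookeeper.connect="
    "{% for host in zookeeper_host_list %}"
    "{{ host }}:{{ zookeeper_client_port|default(2181) }}"
    "{% if not loop.last %},{% endif %}"
    "{% endfor %}"
)
print(tmpl.render(zookeeper_host_list=["zk1", "zk2", "zk3"]))
# -> zookeeper.connect=zk1:2181,zk2:2181,zk3:2181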

crawler/config/example.yml

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 domains:
-  dmoz.org:
+  dmoztools.net:
     window: 60
     hits: 60
     scale: 1.0
   wikipedia.org:
     window: 60
     hits: 30
-    scale: 0.5
+    scale: 0.5
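
A quick way to sanity-check the renamed config, assuming (as elsewhere in Scrapy Cluster) that window is a rolling period in seconds, hits the allowed requests per window, and scale a throttle multiplier; the file path is the one shown above:

import yaml

# Print the per-domain throttle settings from the example config.
with open("crawler/config/example.yml") as f:
    config = yaml.safe_load(f)

for domain, limits in config["domains"].items():
    print(domain, limits["window"], limits["hits"], limits.get("scale", 1.0))
# dmoztools.net 60 60 1.0
# wikipedia.org 60 30 0.5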

crawler/crawling/distributed_scheduler.py

Lines changed: 35 additions & 60 deletions
@@ -7,6 +7,7 @@
 from scrapy.http import Request
 from scrapy.conf import settings
 from scrapy.utils.python import to_unicode
+from scrapy.utils.reqser import request_to_dict, request_from_dict
 
 import redis
 import random
@@ -81,7 +82,7 @@ def __init__(self, server, persist, update_int, timeout, retries, logger,
         self.ip_update_interval = ip_refresh
         self.add_type = add_type
         self.add_ip = add_ip
-        self.item_retires = retries
+        self.item_retries = retries
         self.logger = logger
         self.ip_regex = re.compile(ip_regex)
         self.backlog_blacklist = backlog_blacklist
@@ -391,7 +392,7 @@ def enqueue_request(self, request):
         if not request.dont_filter and self.dupefilter.request_seen(request):
             self.logger.debug("Request not added back to redis")
             return
-        req_dict = self.request_to_dict(request)
+        req_dict = request_to_dict(request, self.spider)
 
         if not self.is_blacklisted(req_dict['meta']['appid'],
                                    req_dict['meta']['crawlid']):
@@ -436,28 +437,6 @@ def enqueue_request(self, request):
                           .format(appid=req_dict['meta']['appid'],
                                   id=req_dict['meta']['crawlid']))
 
-    def request_to_dict(self, request):
-        '''
-        Convert Request object to a dict.
-        modified from scrapy.utils.reqser
-        '''
-        req_dict = {
-            # urls should be safe (safe_string_url)
-            'url': to_unicode(request.url),
-            'method': request.method,
-            'headers': dict(request.headers),
-            'body': request.body,
-            'cookies': request.cookies,
-            'meta': request.meta,
-            '_encoding': request._encoding,
-            'priority': request.priority,
-            'dont_filter': request.dont_filter,
-            # callback/errback are assumed to be a bound instance of the spider
-            'callback': None if request.callback is None else request.callback.__name__,
-            'errback': None if request.errback is None else request.errback.__name__,
-        }
-        return req_dict
-
     def find_item(self):
         '''
         Finds an item from the throttled queues
@@ -504,50 +483,46 @@ def next_request(self):
         if item:
             self.logger.debug(u"Found url to crawl {url}" \
                     .format(url=item['url']))
-            try:
-                req = Request(item['url'])
-            except ValueError:
-                # need absolute url
-                # need better url validation here
-                req = Request('http://' + item['url'])
-
-            try:
-                if 'callback' in item and item['callback'] is not None:
-                    req.callback = getattr(self.spider, item['callback'])
-            except AttributeError:
-                self.logger.warn("Unable to find callback method")
-
-            try:
-                if 'errback' in item and item['errback'] is not None:
-                    req.errback = getattr(self.spider, item['errback'])
-            except AttributeError:
-                self.logger.warn("Unable to find errback method")
-
             if 'meta' in item:
-                item = item['meta']
-
-            # defaults not in schema
-            if 'curdepth' not in item:
-                item['curdepth'] = 0
-            if "retry_times" not in item:
-                item['retry_times'] = 0
-
-            for key in list(item.keys()):
-                req.meta[key] = item[key]
+                # item is a serialized request
+                req = request_from_dict(item, self.spider)
+            else:
+                # item is a feed from outside, parse it manually
+                req = self.request_from_feed(item)
 
             # extra check to add items to request
-            if 'useragent' in item and item['useragent'] is not None:
-                req.headers['User-Agent'] = item['useragent']
-            if 'cookie' in item and item['cookie'] is not None:
-                if isinstance(item['cookie'], dict):
-                    req.cookies = item['cookie']
-                elif isinstance(item['cookie'], basestring):
-                    req.cookies = self.parse_cookie(item['cookie'])
+            if 'useragent' in req.meta and req.meta['useragent'] is not None:
+                req.headers['User-Agent'] = req.meta['useragent']
 
             return req
 
         return None
 
+    def request_from_feed(self, item):
+        try:
+            req = Request(item['url'])
+        except ValueError:
+            # need absolute url
+            # need better url validation here
+            req = Request('http://' + item['url'])
+
+        # defaults not in schema
+        if 'curdepth' not in item:
+            item['curdepth'] = 0
+        if "retry_times" not in item:
+            item['retry_times'] = 0
+
+        for key in list(item.keys()):
+            req.meta[key] = item[key]
+
+        # extra check to add items to request
+        if 'cookie' in item and item['cookie'] is not None:
+            if isinstance(item['cookie'], dict):
+                req.cookies = item['cookie']
+            elif isinstance(item['cookie'], basestring):
+                req.cookies = self.parse_cookie(item['cookie'])
+        return req
+
     def parse_cookie(self, string):
         '''
         Parses a cookie string like returned in a Set-Cookie header
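
The hand-rolled request_to_dict helper is removed; serialization now goes through scrapy.utils.reqser, which next_request reverses with request_from_dict. A minimal round-trip sketch under that assumption (the URL and meta values are made up):

from scrapy.http import Request
from scrapy.utils.reqser import request_to_dict, request_from_dict

# Serialize a request to a plain, JSON-friendly dict, then rebuild it.
# Passing the spider lets string callback/errback names resolve back to
# bound spider methods; with no callbacks set, None works.
req = Request("http://example.com", meta={"crawlid": "abc123"})
item = request_to_dict(req, spider=None)
restored = request_from_dict(item, spider=None)
assert restored.url == req.url
assert restored.meta["crawlid"] == "abc123"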

crawler/crawling/items.py

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ class RawResponseItem(Item):
     attrs = Field()
     success = Field()
     exception = Field()
+    encoding = Field()

crawler/crawling/pipelines.py

Lines changed: 12 additions & 1 deletion
@@ -181,8 +181,19 @@ def process_item(self, item, spider):
             prefix = self.topic_prefix
 
             try:
+                # Get the encoding. If it's not a key of datum, return utf-8
+                encoding = datum.get('encoding', 'utf-8')
+
                 if self.use_base64:
-                    datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8'))
+                    # When running in Python 2 datum['body'] is a string
+                    if isinstance(datum['body'], str):
+                        datum['body'] = bytes(datum['body'], encoding)
+                    # In Python 3 datum['body'] is already in byte form
+                    datum['body'] = base64.b64encode(datum['body'])
+
+                elif 'utf-8' != encoding:
+                    datum['body'] = datum['body'].decode(datum['encoding'])
+
                 message = ujson.dumps(datum, sort_keys=True)
             except:
                 message = 'json failed to parse'
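
In effect, the pipeline now reads the item's reported encoding (defaulting to utf-8), makes sure the body is bytes before base64-encoding it, and otherwise decodes non-utf-8 bodies so ujson can serialize them. A standalone Python 3 sketch of that logic, with a hypothetical helper name rather than the pipeline class itself:

import base64

def encode_body(datum, use_base64):
    """Sketch of the new body-encoding logic (hypothetical helper, Python 3)."""
    encoding = datum.get('encoding', 'utf-8')   # default when the spider sent none
    if use_base64:
        body = datum['body']
        if isinstance(body, str):               # text must become bytes first
            body = body.encode(encoding)
        datum['body'] = base64.b64encode(body)
    elif encoding != 'utf-8':
        # decode non-utf-8 byte bodies so they can be JSON-serialized
        datum['body'] = datum['body'].decode(encoding)
    return datum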

crawler/crawling/spiders/link_spider.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def parse(self, response):
         item["response_headers"] = self.reconstruct_headers(response)
         item["request_headers"] = response.request.headers
         item["body"] = response.body
+        item["encoding"] = response.encoding
         item["links"] = []
 
         # determine whether to continue spidering

crawler/crawling/spiders/wandering_spider.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ def parse(self, response):
         item["response_headers"] = self.reconstruct_headers(response)
         item["request_headers"] = response.request.headers
         item["body"] = response.body
+        item["encoding"] = response.encoding
         item["links"] = []
         # we want to know how far our spider gets
         if item['attrs'] is None:

crawler/requirements.txt

Lines changed: 26 additions & 26 deletions
@@ -1,40 +1,40 @@
-attrs==16.3.0 # Updated from 16.1.0
-cffi==1.9.1 # Updated from 1.7.0
+attrs==17.2.0 # Updated from 16.3.0
+cffi==1.10.0 # Updated from 1.9.1
 ConcurrentLogHandler==0.9.1
-cryptography==1.8.1 # Updated from 1.5
-cssselect==1.0.1 # Updated from 0.9.2
+cryptography==2.0.3 # Updated from 1.8.1
+cssselect==1.0.1
 enum34==1.1.6
 funcsigs==1.0.2
-future==0.16.0 # Updated from 0.15.2
-idna==2.5 # Updated from 2.1
-ipaddress==1.0.18 # Updated from 1.0.16
-kafka-python==1.3.3 # Updated from 1.3.2
-kazoo==2.2.1
-lxml==3.7.3 # Updated from 3.6.4
+future==0.16.0
+idna==2.6 # Updated from 2.5
+ipaddress==1.0.18
+kafka-python==1.3.4 # Updated from 1.3.3
+kazoo==2.4.0 # Updated from 2.2.1
+lxml==3.8.0 # Updated from 3.7.3
 mock==2.0.0
 nose==1.3.7
-parsel==1.1.0 # Updated from 1.0.3
-pbr==2.0.0 # Updated from 1.10.0
-pyasn1==0.2.3 # Updated from 0.1.9
-pyasn1-modules==0.0.8
-pycparser==2.17 # Updated from 2.14
+parsel==1.2.0 # Updated from 1.1.0
+pbr==3.1.1 # Updated from 2.0.0
+pyasn1==0.3.2 # Updated from 0.2.3
+pyasn1-modules==0.0.11 # Updated from 0.0.8
+pycparser==2.18 # Updated from 2.17
 PyDispatcher==2.0.5
-pyOpenSSL==16.2.0 # Updated from 16.1.0
-python-json-logger==0.1.7 # Updated from 0.1.5
+pyOpenSSL==17.2.0 # Updated from 16.2.0
+python-json-logger==0.1.8 # Updated from 0.1.7
 PyYAML==3.12
 queuelib==1.4.2
 redis==2.10.5
-requests==2.13.0 # Updated from 2.11.1
-requests-file==1.4.1 # Updated from 1.4
+requests==2.18.3 # Updated from 2.13.0
+requests-file==1.4.2 # Updated from 1.4.1
 retrying==1.3.3
-Scrapy==1.3.3
+Scrapy==1.4.0 # Updated from 1.3.3
 ../utils # scutils==1.3.0dev0
-service-identity==16.0.0
+service-identity==17.0.0 # Updated from 16.0.0
 six==1.10.0
-testfixtures==4.13.5 # Updated from 4.10.0
-tldextract==2.0.2 # Updated from 2.0.1
-Twisted==17.1.0 # Updated from 16.4.0
+testfixtures==5.1.1 # Updated from 4.13.5
+tldextract==2.1.0 # Updated from 2.0.2
+Twisted==17.5.0 # Updated from 17.1.0
 ujson==1.35
-w3lib==1.17.0 # Updated from 1.16.0
-zope.interface==4.3.3 # Updated from 4.2.0
+w3lib==1.18.0 # Updated from 1.17.0
+zope.interface==4.4.2 # Updated from 4.3.3
 # Generated with piprot 0.9.7

crawler/tests/online.py

Lines changed: 4 additions & 4 deletions
@@ -35,10 +35,10 @@ class CustomSpider(LinkSpider):
 class TestLinkSpider(TestCase):
 
     example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\
-        "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\
+        "crawlid\":\"abc12345\",\"url\":\"http://dmoztools.net/\",\"expires\":0,\""\
         "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\
        "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\
-        "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
+        "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
 
     def setUp(self):
         self.settings = get_project_settings()
@@ -77,7 +77,7 @@ def test_crawler_process(self):
         d = runner.crawl(CustomSpider)
         d.addBoth(lambda _: reactor.stop())
         # add crawl to redis
-        key = "test-spider:istresearch.com:queue"
+        key = "test-spider:dmoztools.net:queue"
         self.redis_conn.zadd(key, self.example_feed, -99)
 
         # run the spider, give 20 seconds to see the url, crawl it,
@@ -101,7 +101,7 @@ def thread_func():
                     and the_dict['crawlid'] == 'abc12345':
                 message_count += 1
 
-        self.assertEquals(message_count, 1)
+        self.assertEqual(message_count, 1)
 
     def tearDown(self):
         keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')

crawler/tests/test_distributed_scheduler.py

Lines changed: 40 additions & 10 deletions
@@ -7,6 +7,7 @@
 from mock import MagicMock
 from crawling.distributed_scheduler import DistributedScheduler
 from scrapy.http import Request
+from scrapy.utils.reqser import request_to_dict
 from scutils.redis_throttled_queue import RedisThrottledQueue
 
 
@@ -59,7 +60,7 @@ def test_enqueue_request(self, t):
 
         # test request already seen
         self.scheduler.dupefilter.request_seen = MagicMock(return_value=True)
-        self.assertEquals(self.scheduler.enqueue_request(self.req), None)
+        self.assertEqual(self.scheduler.enqueue_request(self.req), None)
 
         # test request not expiring and queue seen
         self.scheduler.queue_keys = ['link:ex.com:queue']
@@ -139,6 +140,21 @@ def test_find_item(self):
         self.assertEqual(self.scheduler.find_item(), None) # should also not raise exception
 
 
+class TestDistributedSchedulerRequestFromFeed(ThrottleMixin, TestCase):
+    def test_request_from_feed(self):
+        self.req = self.get_request()
+        feed = {
+            "url": "http://ex.com",
+            "crawlid": "abc123",
+            "appid": "myapp",
+            "spiderid": "link",
+        }
+        out = self.scheduler.request_from_feed(feed)
+        self.assertEqual(out.url, 'http://ex.com')
+        for key in out.meta:
+            self.assertEqual(out.meta[key], self.req.meta[key])
+
+
 class TestDistributedSchedulerNextRequest(ThrottleMixin, TestCase):
 
     @mock.patch('time.time', return_value=5)
@@ -169,28 +185,42 @@ def test_next_request(self, t):
         except Exception as e:
             self.assertEqual(str(e), "ip")
 
-        # test got item
-        self.scheduler.find_item = MagicMock(
-            return_value={"url": "http://ex.com",
-                          "crawlid": "abc123",
-                          "appid": "myapp",
-                          "spiderid": "link"})
+        # test request from feed
+        feed = {
+            "url": "http://ex.com",
+            "crawlid": "abc123",
+            "appid": "myapp",
+            "spiderid": "link",
+        }
+        self.scheduler.find_item = MagicMock(return_value=feed)
+        out = self.scheduler.next_request()
+        self.assertEqual(out.url, 'http://ex.com')
+        for key in out.meta:
+            self.assertEqual(out.meta[key], self.req.meta[key])
+
+        # test request from serialized request
+        exist_req = Request('http://ex.com')
+        exist_item = request_to_dict(exist_req)
+        exist_item["meta"]["crawlid"] = "abc123"
+        exist_item["meta"]["appid"] = "myapp"
+        exist_item["meta"]["spiderid"] = "link"
+        self.scheduler.find_item = MagicMock(return_value=exist_item)
         out = self.scheduler.next_request()
-        self.assertEquals(out.url, 'http://ex.com')
+        self.assertEqual(out.url, 'http://ex.com')
         for key in out.meta:
             self.assertEqual(out.meta[key], self.req.meta[key])
 
         # test didn't get item
         self.scheduler.find_item = MagicMock(return_value=None)
-        self.assertEquals(self.scheduler.next_request(), None)
+        self.assertEqual(self.scheduler.next_request(), None)
 
 
 class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase):
 
     def test_change_config(self):
         good_string = ""\
            "domains:\n"\
-            "  dmoz.org:\n"\
+            "  dmoztools.net:\n"\
            "    window: 60\n"\
            "    hits: 60\n"\
            "    scale: 1.0\n"\
