
Commit 0a267bd

Author: Sornalingam

Merge branch 'dev' into uitest

# Conflicts:
#   docs/topics/changelog.rst
#   requirements.txt
#   rest/tests/test_rest_service.py

2 parents efb40bd + d90d4e9, commit 0a267bd


42 files changed: +487 / -345 lines

ansible/roles/kafka/templates/consumer.properties.j2

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 # Zookeeper connection string
 # comma separated host:port pairs, each corresponding to a zk
 # server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002"
-zookeeper.connect=127.0.0.1:2181
+zookeeper.connect={% for host in zookeeper_host_list %}{{ host }}:{{ zookeeper_client_port|default(2181) }}{% if not loop.last %},{% endif %}{% endfor %}
 
 # timeout in ms for connecting to zookeeper
 zookeeper.connection.timeout.ms=6000
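
For context, here is a minimal sketch of how the new template line renders. The variable names come from the diff; the host values are hypothetical, and in the real deployment they are supplied by the Ansible inventory rather than hard-coded:

from jinja2 import Template

# Hypothetical inventory values; Ansible normally provides these.
tmpl = Template(
    "zookeeper.connect="
    "{% for host in zookeeper_host_list %}"
    "{{ host }}:{{ zookeeper_client_port|default(2181) }}"
    "{% if not loop.last %},{% endif %}"
    "{% endfor %}"
)
print(tmpl.render(zookeeper_host_list=["zk1", "zk2", "zk3"]))
# -> zookeeper.connect=zk1:2181,zk2:2181,zk3:2181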

crawler/config/example.yml

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 domains:
-  dmoz.org:
+  dmoztools.net:
     window: 60
     hits: 60
     scale: 1.0
   wikipedia.org:
     window: 60
     hits: 30
-    scale: 0.5
+    scale: 0.5
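
A quick way to sanity-check the renamed config, assuming (as elsewhere in Scrapy Cluster) that window is a rolling period in seconds, hits the allowed requests per window, and scale a throttle multiplier; the file path is the one shown above:

import yaml

# Print the per-domain throttle settings from the example config.
with open("crawler/config/example.yml") as f:
    config = yaml.safe_load(f)

for domain, limits in config["domains"].items():
    print(domain, limits["window"], limits["hits"], limits.get("scale", 1.0))
# dmoztools.net 60 60 1.0
# wikipedia.org 60 30 0.5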

crawler/crawling/distributed_scheduler.py

Lines changed: 35 additions & 60 deletions
@@ -7,6 +7,7 @@
 from scrapy.http import Request
 from scrapy.conf import settings
 from scrapy.utils.python import to_unicode
+from scrapy.utils.reqser import request_to_dict, request_from_dict
 
 import redis
 import random
@@ -81,7 +82,7 @@ def __init__(self, server, persist, update_int, timeout, retries, logger,
         self.ip_update_interval = ip_refresh
         self.add_type = add_type
         self.add_ip = add_ip
-        self.item_retires = retries
+        self.item_retries = retries
         self.logger = logger
         self.ip_regex = re.compile(ip_regex)
         self.backlog_blacklist = backlog_blacklist
@@ -391,7 +392,7 @@ def enqueue_request(self, request):
         if not request.dont_filter and self.dupefilter.request_seen(request):
             self.logger.debug("Request not added back to redis")
             return
-        req_dict = self.request_to_dict(request)
+        req_dict = request_to_dict(request, self.spider)
 
         if not self.is_blacklisted(req_dict['meta']['appid'],
                                    req_dict['meta']['crawlid']):
@@ -436,28 +437,6 @@ def enqueue_request(self, request):
                           .format(appid=req_dict['meta']['appid'],
                                   id=req_dict['meta']['crawlid']))
 
-    def request_to_dict(self, request):
-        '''
-        Convert Request object to a dict.
-        modified from scrapy.utils.reqser
-        '''
-        req_dict = {
-            # urls should be safe (safe_string_url)
-            'url': to_unicode(request.url),
-            'method': request.method,
-            'headers': dict(request.headers),
-            'body': request.body,
-            'cookies': request.cookies,
-            'meta': request.meta,
-            '_encoding': request._encoding,
-            'priority': request.priority,
-            'dont_filter': request.dont_filter,
-            # callback/errback are assumed to be a bound instance of the spider
-            'callback': None if request.callback is None else request.callback.__name__,
-            'errback': None if request.errback is None else request.errback.__name__,
-        }
-        return req_dict
-
     def find_item(self):
         '''
         Finds an item from the throttled queues
@@ -504,50 +483,46 @@ def next_request(self):
         if item:
             self.logger.debug(u"Found url to crawl {url}" \
                     .format(url=item['url']))
-            try:
-                req = Request(item['url'])
-            except ValueError:
-                # need absolute url
-                # need better url validation here
-                req = Request('http://' + item['url'])
-
-            try:
-                if 'callback' in item and item['callback'] is not None:
-                    req.callback = getattr(self.spider, item['callback'])
-            except AttributeError:
-                self.logger.warn("Unable to find callback method")
-
-            try:
-                if 'errback' in item and item['errback'] is not None:
-                    req.errback = getattr(self.spider, item['errback'])
-            except AttributeError:
-                self.logger.warn("Unable to find errback method")
-
             if 'meta' in item:
-                item = item['meta']
-
-            # defaults not in schema
-            if 'curdepth' not in item:
-                item['curdepth'] = 0
-            if "retry_times" not in item:
-                item['retry_times'] = 0
-
-            for key in list(item.keys()):
-                req.meta[key] = item[key]
+                # item is a serialized request
+                req = request_from_dict(item, self.spider)
+            else:
+                # item is a feed from outside, parse it manually
+                req = self.request_from_feed(item)
 
             # extra check to add items to request
-            if 'useragent' in item and item['useragent'] is not None:
-                req.headers['User-Agent'] = item['useragent']
-            if 'cookie' in item and item['cookie'] is not None:
-                if isinstance(item['cookie'], dict):
-                    req.cookies = item['cookie']
-                elif isinstance(item['cookie'], basestring):
-                    req.cookies = self.parse_cookie(item['cookie'])
+            if 'useragent' in req.meta and req.meta['useragent'] is not None:
+                req.headers['User-Agent'] = req.meta['useragent']
 
             return req
 
         return None
 
+    def request_from_feed(self, item):
+        try:
+            req = Request(item['url'])
+        except ValueError:
+            # need absolute url
+            # need better url validation here
+            req = Request('http://' + item['url'])
+
+        # defaults not in schema
+        if 'curdepth' not in item:
+            item['curdepth'] = 0
+        if "retry_times" not in item:
+            item['retry_times'] = 0
+
+        for key in list(item.keys()):
+            req.meta[key] = item[key]
+
+        # extra check to add items to request
+        if 'cookie' in item and item['cookie'] is not None:
+            if isinstance(item['cookie'], dict):
+                req.cookies = item['cookie']
+            elif isinstance(item['cookie'], basestring):
+                req.cookies = self.parse_cookie(item['cookie'])
+        return req
+
     def parse_cookie(self, string):
         '''
         Parses a cookie string like returned in a Set-Cookie header
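
The hand-rolled request_to_dict helper is removed; serialization now goes through scrapy.utils.reqser, which next_request reverses with request_from_dict. A minimal round-trip sketch under that assumption (the URL and meta values are made up):

from scrapy.http import Request
from scrapy.utils.reqser import request_to_dict, request_from_dict

# Serialize a request to a plain, JSON-friendly dict, then rebuild it.
# Passing the spider lets string callback/errback names resolve back to
# bound spider methods; with no callbacks set, None works.
req = Request("http://example.com", meta={"crawlid": "abc123"})
item = request_to_dict(req, spider=None)
restored = request_from_dict(item, spider=None)
assert restored.url == req.url
assert restored.meta["crawlid"] == "abc123"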

crawler/crawling/items.py

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ class RawResponseItem(Item):
     attrs = Field()
     success = Field()
     exception = Field()
+    encoding = Field()

crawler/crawling/pipelines.py

Lines changed: 12 additions & 1 deletion
@@ -181,8 +181,19 @@ def process_item(self, item, spider):
             prefix = self.topic_prefix
 
             try:
+                # Get the encoding. If it's not a key of datum, return utf-8
+                encoding = datum.get('encoding', 'utf-8')
+
                 if self.use_base64:
-                    datum['body'] = base64.b64encode(bytes(datum['body'], 'utf-8'))
+                    # When running in Python 2 datum['body'] is a string
+                    if isinstance(datum['body'], str):
+                        datum['body'] = bytes(datum['body'], encoding)
+                    # In Python 3 datum['body'] is already in byte form
+                    datum['body'] = base64.b64encode(datum['body'])
+
+                elif 'utf-8' != encoding:
+                    datum['body'] = datum['body'].decode(datum['encoding'])
+
                 message = ujson.dumps(datum, sort_keys=True)
             except:
                 message = 'json failed to parse'
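
In effect, the pipeline now reads the item's reported encoding (defaulting to utf-8), makes sure the body is bytes before base64-encoding it, and otherwise decodes non-utf-8 bodies so ujson can serialize them. A standalone Python 3 sketch of that logic, with a hypothetical helper name rather than the pipeline class itself:

import base64

def encode_body(datum, use_base64):
    """Sketch of the new body-encoding logic (hypothetical helper, Python 3)."""
    encoding = datum.get('encoding', 'utf-8')   # default when the spider sent none
    if use_base64:
        body = datum['body']
        if isinstance(body, str):               # text must become bytes first
            body = body.encode(encoding)
        datum['body'] = base64.b64encode(body)
    elif encoding != 'utf-8':
        # decode non-utf-8 byte bodies so they can be JSON-serialized
        datum['body'] = datum['body'].decode(encoding)
    return datum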

crawler/crawling/spiders/link_spider.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ def parse(self, response):
         item["response_headers"] = self.reconstruct_headers(response)
         item["request_headers"] = response.request.headers
         item["body"] = response.body
+        item["encoding"] = response.encoding
         item["links"] = []
 
         # determine whether to continue spidering

crawler/crawling/spiders/wandering_spider.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ def parse(self, response):
         item["response_headers"] = self.reconstruct_headers(response)
         item["request_headers"] = response.request.headers
         item["body"] = response.body
+        item["encoding"] = response.encoding
         item["links"] = []
         # we want to know how far our spider gets
         if item['attrs'] is None:

crawler/requirements.txt

Lines changed: 26 additions & 26 deletions
@@ -1,40 +1,40 @@
-attrs==16.3.0 # Updated from 16.1.0
-cffi==1.9.1 # Updated from 1.7.0
+attrs==17.2.0 # Updated from 16.3.0
+cffi==1.10.0 # Updated from 1.9.1
 ConcurrentLogHandler==0.9.1
-cryptography==1.8.1 # Updated from 1.5
-cssselect==1.0.1 # Updated from 0.9.2
+cryptography==2.0.3 # Updated from 1.8.1
+cssselect==1.0.1
 enum34==1.1.6
 funcsigs==1.0.2
-future==0.16.0 # Updated from 0.15.2
-idna==2.5 # Updated from 2.1
-ipaddress==1.0.18 # Updated from 1.0.16
-kafka-python==1.3.3 # Updated from 1.3.2
-kazoo==2.2.1
-lxml==3.7.3 # Updated from 3.6.4
+future==0.16.0
+idna==2.6 # Updated from 2.5
+ipaddress==1.0.18
+kafka-python==1.3.4 # Updated from 1.3.3
+kazoo==2.4.0 # Updated from 2.2.1
+lxml==3.8.0 # Updated from 3.7.3
 mock==2.0.0
 nose==1.3.7
-parsel==1.1.0 # Updated from 1.0.3
-pbr==2.0.0 # Updated from 1.10.0
-pyasn1==0.2.3 # Updated from 0.1.9
-pyasn1-modules==0.0.8
-pycparser==2.17 # Updated from 2.14
+parsel==1.2.0 # Updated from 1.1.0
+pbr==3.1.1 # Updated from 2.0.0
+pyasn1==0.3.2 # Updated from 0.2.3
+pyasn1-modules==0.0.11 # Updated from 0.0.8
+pycparser==2.18 # Updated from 2.17
 PyDispatcher==2.0.5
-pyOpenSSL==16.2.0 # Updated from 16.1.0
-python-json-logger==0.1.7 # Updated from 0.1.5
+pyOpenSSL==17.2.0 # Updated from 16.2.0
+python-json-logger==0.1.8 # Updated from 0.1.7
 PyYAML==3.12
 queuelib==1.4.2
 redis==2.10.5
-requests==2.13.0 # Updated from 2.11.1
-requests-file==1.4.1 # Updated from 1.4
+requests==2.18.3 # Updated from 2.13.0
+requests-file==1.4.2 # Updated from 1.4.1
 retrying==1.3.3
-Scrapy==1.3.3
+Scrapy==1.4.0 # Updated from 1.3.3
 ../utils # scutils==1.3.0dev0
-service-identity==16.0.0
+service-identity==17.0.0 # Updated from 16.0.0
 six==1.10.0
-testfixtures==4.13.5 # Updated from 4.10.0
-tldextract==2.0.2 # Updated from 2.0.1
-Twisted==17.1.0 # Updated from 16.4.0
+testfixtures==5.1.1 # Updated from 4.13.5
+tldextract==2.1.0 # Updated from 2.0.2
+Twisted==17.5.0 # Updated from 17.1.0
 ujson==1.35
-w3lib==1.17.0 # Updated from 1.16.0
-zope.interface==4.3.3 # Updated from 4.2.0
+w3lib==1.18.0 # Updated from 1.17.0
+zope.interface==4.4.2 # Updated from 4.3.3
 # Generated with piprot 0.9.7

crawler/tests/online.py

Lines changed: 4 additions & 4 deletions
@@ -35,10 +35,10 @@ class CustomSpider(LinkSpider):
 class TestLinkSpider(TestCase):
 
     example_feed = "{\"allowed_domains\":null,\"allow_regex\":null,\""\
-        "crawlid\":\"abc12345\",\"url\":\"istresearch.com\",\"expires\":0,\""\
+        "crawlid\":\"abc12345\",\"url\":\"http://dmoztools.net/\",\"expires\":0,\""\
         "ts\":1461549923.7956631184,\"priority\":1,\"deny_regex\":null,\""\
        "cookie\":null,\"attrs\":null,\"appid\":\"test\",\"spiderid\":\""\
-        "link\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
+        "test-spider\",\"useragent\":null,\"deny_extensions\":null,\"maxdepth\":0}"
 
     def setUp(self):
         self.settings = get_project_settings()
@@ -77,7 +77,7 @@ def test_crawler_process(self):
         d = runner.crawl(CustomSpider)
         d.addBoth(lambda _: reactor.stop())
         # add crawl to redis
-        key = "test-spider:istresearch.com:queue"
+        key = "test-spider:dmoztools.net:queue"
         self.redis_conn.zadd(key, self.example_feed, -99)
 
         # run the spider, give 20 seconds to see the url, crawl it,
@@ -101,7 +101,7 @@ def thread_func():
                     and the_dict['crawlid'] == 'abc12345':
                 message_count += 1
 
-        self.assertEquals(message_count, 1)
+        self.assertEqual(message_count, 1)
 
     def tearDown(self):
         keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')

crawler/tests/test_distributed_scheduler.py

Lines changed: 40 additions & 10 deletions
@@ -7,6 +7,7 @@
 from mock import MagicMock
 from crawling.distributed_scheduler import DistributedScheduler
 from scrapy.http import Request
+from scrapy.utils.reqser import request_to_dict
 from scutils.redis_throttled_queue import RedisThrottledQueue
 
 
@@ -59,7 +60,7 @@ def test_enqueue_request(self, t):
 
         # test request already seen
         self.scheduler.dupefilter.request_seen = MagicMock(return_value=True)
-        self.assertEquals(self.scheduler.enqueue_request(self.req), None)
+        self.assertEqual(self.scheduler.enqueue_request(self.req), None)
 
         # test request not expiring and queue seen
         self.scheduler.queue_keys = ['link:ex.com:queue']
@@ -139,6 +140,21 @@ def test_find_item(self):
         self.assertEqual(self.scheduler.find_item(), None) # should also not raise exception
 
 
+class TestDistributedSchedulerRequestFromFeed(ThrottleMixin, TestCase):
+    def test_request_from_feed(self):
+        self.req = self.get_request()
+        feed = {
+            "url": "http://ex.com",
+            "crawlid": "abc123",
+            "appid": "myapp",
+            "spiderid": "link",
+        }
+        out = self.scheduler.request_from_feed(feed)
+        self.assertEqual(out.url, 'http://ex.com')
+        for key in out.meta:
+            self.assertEqual(out.meta[key], self.req.meta[key])
+
+
 class TestDistributedSchedulerNextRequest(ThrottleMixin, TestCase):
 
     @mock.patch('time.time', return_value=5)
@@ -169,28 +185,42 @@ def test_next_request(self, t):
         except Exception as e:
             self.assertEqual(str(e), "ip")
 
-        # test got item
-        self.scheduler.find_item = MagicMock(
-            return_value={"url": "http://ex.com",
-                          "crawlid": "abc123",
-                          "appid": "myapp",
-                          "spiderid": "link"})
+        # test request from feed
+        feed = {
+            "url": "http://ex.com",
+            "crawlid": "abc123",
+            "appid": "myapp",
+            "spiderid": "link",
+        }
+        self.scheduler.find_item = MagicMock(return_value=feed)
+        out = self.scheduler.next_request()
+        self.assertEqual(out.url, 'http://ex.com')
+        for key in out.meta:
+            self.assertEqual(out.meta[key], self.req.meta[key])
+
+        # test request from serialized request
+        exist_req = Request('http://ex.com')
+        exist_item = request_to_dict(exist_req)
+        exist_item["meta"]["crawlid"] = "abc123"
+        exist_item["meta"]["appid"] = "myapp"
+        exist_item["meta"]["spiderid"] = "link"
+        self.scheduler.find_item = MagicMock(return_value=exist_item)
         out = self.scheduler.next_request()
-        self.assertEquals(out.url, 'http://ex.com')
+        self.assertEqual(out.url, 'http://ex.com')
         for key in out.meta:
             self.assertEqual(out.meta[key], self.req.meta[key])
 
         # test didn't get item
         self.scheduler.find_item = MagicMock(return_value=None)
-        self.assertEquals(self.scheduler.next_request(), None)
+        self.assertEqual(self.scheduler.next_request(), None)
 
 
 class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase):
 
     def test_change_config(self):
         good_string = ""\
            "domains:\n"\
-            "  dmoz.org:\n"\
+            "  dmoztools.net:\n"\
            "    window: 60\n"\
            "    hits: 60\n"\
            "    scale: 1.0\n"\
