
Commit d598dbc

Russ Ferriday authored and sabbir-006 committed
Update dmoz.org to dmoztools.net since dmoz.org now redirects. (istresearch#145) (istresearch#147)
1 parent e8cc712

File tree: 7 files changed, +23 -23 lines changed

crawler/config/example.yml

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 domains:
-  dmoz.org:
+  dmoztools.net:
     window: 60
     hits: 60
     scale: 1.0
   wikipedia.org:
     window: 60
     hits: 30
-    scale: 0.5
+    scale: 0.5
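
For reference, each domain stanza here caps the cluster at roughly ``hits`` × ``scale`` requests per ``window`` seconds, so the renamed ``dmoztools.net`` entry still allows about 60 requests per minute, while ``wikipedia.org`` allows about 15.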

crawler/tests/test_distributed_scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -220,7 +220,7 @@ class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase):
     def test_change_config(self):
         good_string = ""\
             "domains:\n"\
-            "  dmoz.org:\n"\
+            "  dmoztools.net:\n"\
             "    window: 60\n"\
             "    hits: 60\n"\
             "    scale: 1.0\n"\

docs/topics/crawler/controlling.rst

Lines changed: 1 addition & 1 deletion

@@ -141,7 +141,7 @@ To utilize the different throttle mechanisms you can alter the following setting
 Combining Domain Queues and Throttling
 --------------------------------------

-At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoz.org:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.
+At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoztools.net:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.

 This results in Spiders across the cluster continually polling all available domain queues for new requests, but only receiving requests when the throttle mechanism indicates that the request limit has not gone beyond the max desired configuration. Because the throttle coordination is conducted via Redis, it is not reliant on any one Scrapy process to determine whether the cluster can or can't crawl a particular domain.
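
To make the queue/throttle interplay described in this passage concrete, here is a minimal sketch of the poll-then-throttle loop. It is not the actual scheduler code: the ``throttle_ok`` helper and its fixed-window counter are illustrative stand-ins for Scrapy Cluster's real throttle, and only the ``<spider>:<domain>:queue`` key shape comes from the docs::

    import redis

    r = redis.Redis(host='localhost', port=6379)

    def throttle_ok(domain, hits=60, window=60):
        # Illustrative fixed-window counter shared by every spider via Redis:
        # allow at most `hits` pops per `window` seconds for this domain.
        key = 'throttle:{}'.format(domain)
        count = r.incr(key)
        if count == 1:
            r.expire(key, window)
        return count <= hits

    def next_request(spider_type, domains):
        # Poll each domain queue this spider type owns, e.g. link:dmoztools.net:queue
        for domain in domains:
            key = '{}:{}:queue'.format(spider_type, domain)
            items = r.zrange(key, 0, 0)  # peek at the highest-priority request
            if items and throttle_ok(domain):
                r.zrem(key, items[0])    # claim it (peek-then-remove; a sketch, not atomic)
                return items[0]
        return None                      # nothing eligible; poll again later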

docs/topics/crawler/extension.rst

Lines changed: 6 additions & 6 deletions

@@ -199,7 +199,7 @@ Then, feed your cluster.

 ::

-    python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'
+    python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'

 If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kafkadump.py`` script, you will begin to see output like so...

@@ -208,8 +208,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": <omitted>,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/",
-        "url": "http://www.dmoz.org/",
+        "response_url": "http://www.dmoztools.net/",
+        "url": "http://www.dmoztools.net/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -228,8 +228,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": <omitted>,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/Computers/Hardware/",
-        "url": "http://www.dmoz.org/Computers/Hardware/",
+        "response_url": "http://www.dmoztools.net/Computers/Hardware/",
+        "url": "http://www.dmoztools.net/Computers/Hardware/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -273,4 +273,4 @@ You can also fire up more than one crawl job at a time, and track the steps that
     "wandering_spider_count": 4
 }

-You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
+You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
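
As a usage note, the ``kafkadump.py`` helper mentioned above can tail that firehose topic directly; a typical invocation (topic name from the docs, Kafka connection settings assumed to be the defaults) is::

    python kafkadump.py dump -t demo.crawled_firehose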

docs/topics/introduction/quickstart.rst

Lines changed: 1 addition & 1 deletion

@@ -478,7 +478,7 @@ The following things will occur for this action request:

 ::

-    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoz.org': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}
+    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoztools.net': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}

 In this case we had 25 urls pending in the queue, so yours may be slightly different.
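
For context, the response above is produced by an ``info`` action request; a feed of roughly this shape (the field values simply mirror the response shown) triggers it::

    python kafka_monitor.py feed '{"action":"info", "appid":"testapp", "crawlid":"abc1234", "uuid":"someuuid", "spiderid":"link"}'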

docs/topics/kafka-monitor/api.rst

Lines changed: 6 additions & 6 deletions

@@ -74,9 +74,9 @@ Kafka Request:

 ::

-    $ python kafka_monitor.py feed '{"url": "http://www.dmoz.org/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'
+    $ python kafka_monitor.py feed '{"url": "http://www.dmoztools.net/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'

-- Submits a dmoz.org crawl spidering 2 levels deep with a high priority
+- Submits a dmoztools.net crawl spidering 2 levels deep with a high priority

 ::

@@ -899,15 +899,15 @@ Zookeeper Request:

 ::

-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoz.org", "hits":60, "window":60, "scale":0.9}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoztools.net", "hits":60, "window":60, "scale":0.9}'

 Response from Kafka:

 ::

     {
         "action": "domain-update",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402128,
         "uuid": "abc123",
         "appid": "madisonTest"
@@ -923,15 +923,15 @@ Zookeeper Request:

 ::

-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoz.org"}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoztools.net"}'

 Response from Kafka:

 ::

     {
         "action": "domain-remove",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402146,
         "uuid": "abc123",
         "appid": "madisonTest"

redis-monitor/tests/test_plugins.py

Lines changed: 6 additions & 6 deletions

@@ -322,7 +322,7 @@ def test_stats_get_queue(self):
             'link:istresearch.com:queue',
             'link:yellowpages.com:queue',
             'link:cnn.com:queue',
-            'wandering:dmoz.org:queue',
+            'wandering:dmoztools.net:queue',
             'wandering:craigslist.org:queue',
         ])
         results = [5, 10, 11, 1, 3]
@@ -349,7 +349,7 @@ def ret_val(*args):
             'spider_backlog': 4,
             'num_domains': 2,
             'domains': [
-                {'domain': 'dmoz.org', 'backlog': 1},
+                {'domain': 'dmoztools.net', 'backlog': 1},
                 {'domain': 'craigslist.org', 'backlog': 3},
             ]
         }
@@ -395,20 +395,20 @@ def test_zk_regex(self):

     def test_zk_handle_du(self):
         # domain update
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123","hits":15,"scale":0.9,"window":60}'
-        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoz.org:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
+        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoztools.net:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
         self.plugin.handle(key="zk:domain-update:cnn.com:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)

     def test_zk_handle_dr(self):
         # domain remove
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123"}'
         expected = b'blacklist: []\ndomains: {}\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
-        self.plugin.handle(key="zk:domain-remove:dmoz.org:testapp", value=val)
+        self.plugin.handle(key="zk:domain-remove:dmoztools.net:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)

     def test_zk_handle_bu(self):
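
For orientation, the ``expected`` byte strings in these tests encode the Zookeeper YAML merge that the domain-update and domain-remove handlers perform. A minimal sketch of equivalent logic (``apply_domain_update`` and ``apply_domain_remove`` are illustrative names, not the plugin's API) should reproduce the expected strings above, since ``yaml.safe_dump`` sorts keys and uses two-space block indentation::

    import yaml

    def apply_domain_update(raw_config, domain, window, hits, scale):
        # Parse the throttle YAML pulled from Zookeeper and merge one domain.
        config = yaml.safe_load(raw_config) or {}
        domains = config.get('domains') or {}
        domains[domain] = {'window': window, 'hits': hits, 'scale': scale}
        config['domains'] = domains
        return yaml.safe_dump(config).encode('utf-8')

    def apply_domain_remove(raw_config, domain):
        # Drop one domain's throttle entry, leaving 'domains: {}' when empty.
        config = yaml.safe_load(raw_config) or {}
        config.get('domains', {}).pop(domain, None)
        return yaml.safe_dump(config).encode('utf-8')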
