
Commit d598dbc

Russ Ferriday authored and sabbir-006 committed
Update dmoz.org to dmoztools.net since dmoz.org now redirects. (istresearch#145) (istresearch#147)
1 parent e8cc712

File tree: 7 files changed, +23 -23 lines changed

crawler/config/example.yml

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 domains:
-  dmoz.org:
+  dmoztools.net:
     window: 60
     hits: 60
     scale: 1.0
   wikipedia.org:
     window: 60
     hits: 30
-    scale: 0.5
+    scale: 0.5
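
For reference, each domain stanza here caps the cluster at roughly ``hits`` × ``scale`` requests per ``window`` seconds, so the renamed ``dmoztools.net`` entry still allows about 60 requests per minute, while ``wikipedia.org`` allows about 15.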

crawler/tests/test_distributed_scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -220,7 +220,7 @@ class TestDistributedSchedulerChangeConfig(ThrottleMixin, TestCase):
     def test_change_config(self):
         good_string = ""\
             "domains:\n"\
-            "  dmoz.org:\n"\
+            "  dmoztools.net:\n"\
             "    window: 60\n"\
             "    hits: 60\n"\
             "    scale: 1.0\n"\

docs/topics/crawler/controlling.rst

Lines changed: 1 addition & 1 deletion

@@ -141,7 +141,7 @@ To utilize the different throttle mechanisms you can alter the following setting
 Combining Domain Queues and Throttling
 --------------------------------------

-At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoz.org:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.
+At the core of Scrapy Cluster is a Redis priority queue that holds all of the requests for a particular spider type and domain, like ``link:dmoztools.net:queue``. The configured throttle determines when an individual Scrapy process can receive a new request from the Redis Queues. Only when the throttle says that it is "ok" will the Spider be returned a link to process.

 This results in Spiders across the cluster continually polling all available domain queues for new requests, but only receiving requests when the throttle mechanism indicates that the request limit has not gone beyond the max desired configuration. Because the throttle coordination is conducted via Redis, it is not reliant on any one Scrapy process to determine whether the cluster can or can't crawl a particular domain.
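
To make the queue/throttle interplay described in this passage concrete, here is a minimal sketch of the poll-then-throttle loop. It is not the actual scheduler code: the ``throttle_ok`` helper and its fixed-window counter are illustrative stand-ins for Scrapy Cluster's real throttle, and only the ``<spider>:<domain>:queue`` key shape comes from the docs::

    import redis

    r = redis.Redis(host='localhost', port=6379)

    def throttle_ok(domain, hits=60, window=60):
        # Illustrative fixed-window counter shared by every spider via Redis:
        # allow at most `hits` pops per `window` seconds for this domain.
        key = 'throttle:{}'.format(domain)
        count = r.incr(key)
        if count == 1:
            r.expire(key, window)
        return count <= hits

    def next_request(spider_type, domains):
        # Poll each domain queue this spider type owns, e.g. link:dmoztools.net:queue
        for domain in domains:
            key = '{}:{}:queue'.format(spider_type, domain)
            items = r.zrange(key, 0, 0)  # peek at the highest-priority request
            if items and throttle_ok(domain):
                r.zrem(key, items[0])    # claim it (peek-then-remove; a sketch, not atomic)
                return items[0]
        return None                      # nothing eligible; poll again later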

docs/topics/crawler/extension.rst

Lines changed: 6 additions & 6 deletions

@@ -199,7 +199,7 @@ Then, feed your cluster.

 ::

-    python kafka_monitor.py feed '{"url": "http://dmoz.org", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'
+    python kafka_monitor.py feed '{"url": "http://dmoztools.net", "appid":"testapp", "crawlid":"test123456", "spiderid":"wandering"}'

 If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kafkadump.py`` script, you will begin to see output like so...

@@ -208,8 +208,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": <omitted>,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/",
-        "url": "http://www.dmoz.org/",
+        "response_url": "http://www.dmoztools.net/",
+        "url": "http://www.dmoztools.net/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -228,8 +228,8 @@ If you are looking at your ``demo.crawled_firehose`` Kafka Topic using the ``kaf
     {
         "body": <omitted>,
         "crawlid": "test123456",
-        "response_url": "http://www.dmoz.org/Computers/Hardware/",
-        "url": "http://www.dmoz.org/Computers/Hardware/",
+        "response_url": "http://www.dmoztools.net/Computers/Hardware/",
+        "url": "http://www.dmoztools.net/Computers/Hardware/",
         "status_code": 200,
         "status_msg": "OK",
         "appid": "testapp",
@@ -273,4 +273,4 @@ You can also fire up more than one crawl job at a time, and track the steps that
     "wandering_spider_count": 4
 }

-You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
+You now have two different examples of how Scrapy Cluster extends Scrapy to give you distributed crawling capabilities.
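
As a usage note, the ``kafkadump.py`` helper mentioned above can tail that firehose topic directly; a typical invocation (topic name from the docs, Kafka connection settings assumed to be the defaults) is::

    python kafkadump.py dump -t demo.crawled_firehose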

docs/topics/introduction/quickstart.rst

Lines changed: 1 addition & 1 deletion

@@ -478,7 +478,7 @@ The following things will occur for this action request:

 ::

-    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoz.org': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}
+    {u'server_time': 1450817666, u'crawlid': u'abc1234', u'total_pending': 25, u'total_domains': 2, u'spiderid': u'link', u'appid': u'testapp', u'domains': {u'twitter.com': {u'low_priority': -9, u'high_priority': -9, u'total': 1}, u'dmoztools.net': {u'low_priority': -9, u'high_priority': -9, u'total': 24}}, u'uuid': u'someuuid'}

 In this case we had 25 urls pending in the queue, so yours may be slightly different.
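
For context, the response above is produced by an ``info`` action request; a feed of roughly this shape (the field values simply mirror the response shown) triggers it::

    python kafka_monitor.py feed '{"action":"info", "appid":"testapp", "crawlid":"abc1234", "uuid":"someuuid", "spiderid":"link"}'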

docs/topics/kafka-monitor/api.rst

Lines changed: 6 additions & 6 deletions

@@ -74,9 +74,9 @@ Kafka Request:

 ::

-    $ python kafka_monitor.py feed '{"url": "http://www.dmoz.org/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'
+    $ python kafka_monitor.py feed '{"url": "http://www.dmoztools.net/", "appid":"testapp", "crawlid":"abc123", "maxdepth":2, "priority":90}'

-- Submits a dmoz.org crawl spidering 2 levels deep with a high priority
+- Submits a dmoztools.net crawl spidering 2 levels deep with a high priority

 ::

@@ -899,15 +899,15 @@ Zookeeper Request:

 ::

-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoz.org", "hits":60, "window":60, "scale":0.9}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-update", "domain":"dmoztools.net", "hits":60, "window":60, "scale":0.9}'

 Response from Kafka:

 ::

     {
         "action": "domain-update",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402128,
         "uuid": "abc123",
         "appid": "madisonTest"
@@ -923,15 +923,15 @@ Zookeeper Request:

 ::

-    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoz.org"}'
+    $ python kafka_monitor.py feed '{"uuid":"abc123", "appid":"madisonTest", "action":"domain-remove", "domain":"dmoztools.net"}'

 Response from Kafka:

 ::

     {
         "action": "domain-remove",
-        "domain": "dmoz.org",
+        "domain": "dmoztools.net",
         "server_time": 1464402146,
         "uuid": "abc123",
         "appid": "madisonTest"

redis-monitor/tests/test_plugins.py

Lines changed: 6 additions & 6 deletions

@@ -322,7 +322,7 @@ def test_stats_get_queue(self):
             'link:istresearch.com:queue',
             'link:yellowpages.com:queue',
             'link:cnn.com:queue',
-            'wandering:dmoz.org:queue',
+            'wandering:dmoztools.net:queue',
             'wandering:craigslist.org:queue',
         ])
         results = [5, 10, 11, 1, 3]
@@ -349,7 +349,7 @@ def ret_val(*args):
             'spider_backlog': 4,
             'num_domains': 2,
             'domains': [
-                {'domain': 'dmoz.org', 'backlog': 1},
+                {'domain': 'dmoztools.net', 'backlog': 1},
                 {'domain': 'craigslist.org', 'backlog': 3},
             ]
         }
@@ -395,20 +395,20 @@ def test_zk_regex(self):

     def test_zk_handle_du(self):
         # domain update
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123","hits":15,"scale":0.9,"window":60}'
-        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoz.org:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
+        expected = b'blacklist: []\ndomains:\n  cnn.com:\n    hits: 15\n    scale: 0.9\n    window: 60\n  dmoztools.net:\n    hits: 60\n    scale: 1.0\n    window: 60\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
         self.plugin.handle(key="zk:domain-update:cnn.com:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)

     def test_zk_handle_dr(self):
         # domain remove
-        s = b'blacklist: []\ndomains:\n  dmoz.org: {hits: 60, scale: 1.0, window: 60}\n'
+        s = b'blacklist: []\ndomains:\n  dmoztools.net: {hits: 60, scale: 1.0, window: 60}\n'
         val = '{"uuid":"blah123"}'
         expected = b'blacklist: []\ndomains: {}\n'
         self.plugin.zoo_client.get = MagicMock(return_value=(s,))
-        self.plugin.handle(key="zk:domain-remove:dmoz.org:testapp", value=val)
+        self.plugin.handle(key="zk:domain-remove:dmoztools.net:testapp", value=val)
         self.plugin.zoo_client.set.assert_called_once_with("/some/path", expected)

     def test_zk_handle_bu(self):
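
For orientation, the ``expected`` byte strings in these tests encode the Zookeeper YAML merge that the domain-update and domain-remove handlers perform. A minimal sketch of equivalent logic (``apply_domain_update`` and ``apply_domain_remove`` are illustrative names, not the plugin's API) should reproduce the expected strings above, since ``yaml.safe_dump`` sorts keys and uses two-space block indentation::

    import yaml

    def apply_domain_update(raw_config, domain, window, hits, scale):
        # Parse the throttle YAML pulled from Zookeeper and merge one domain.
        config = yaml.safe_load(raw_config) or {}
        domains = config.get('domains') or {}
        domains[domain] = {'window': window, 'hits': hits, 'scale': scale}
        config['domains'] = domains
        return yaml.safe_dump(config).encode('utf-8')

    def apply_domain_remove(raw_config, domain):
        # Drop one domain's throttle entry, leaving 'domains: {}' when empty.
        config = yaml.safe_load(raw_config) or {}
        config.get('domains', {}).pop(domain, None)
        return yaml.safe_dump(config).encode('utf-8')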
