
Commit 08bf7e6

Author: Madison Bahmer
Merge branch 'dev' into ui
2 parents: c921ff3 + d42bcb5

File tree

78 files changed: +628 -243 lines


.gitignore

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,10 @@
+.vscode
+
 # Python binaries
 *.pyc
 
+.idea/
+
 # Sphinx
 docs/_build
 docs/_build_html

.travis.yml

Lines changed: 3 additions & 0 deletions
@@ -20,6 +20,9 @@ env:
     - docker: 1
       dockerfile_name: Dockerfile.py2alpine
       docker_tag_suffix: dev-alpine
+    - docker: 1
+      dockerfile_name: Dockerfile.py3
+      docker_tag_suffix: dev-py3
 
 install: true
 
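Each entry in this env matrix drives one Docker image build, so the added block gives CI a Python 3 image alongside the existing Python 2 ones: it builds Dockerfile.py3 and tags the result with the dev-py3 suffix.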

README.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ Please see the ``requirements.txt`` within each sub project for Pip package depe
 
 Other important components required to run the cluster
 
-- Python 2.7: https://www.python.org/downloads/
+- Python 2.7 or 3.6: https://www.python.org/downloads/
 - Redis: http://redis.io
 - Zookeeper: https://zookeeper.apache.org
 - Kafka: http://kafka.apache.org

crawler/crawling/distributed_scheduler.py

Lines changed: 4 additions & 3 deletions
@@ -282,7 +282,7 @@ def update_ipaddress(self):
         try:
             obj = urllib.request.urlopen(settings.get('PUBLIC_IP_URL',
                                          'http://ip.42.pl/raw'))
-            results = self.ip_regex.findall(obj.read())
+            results = self.ip_regex.findall(obj.read().decode('utf-8'))
             if len(results) > 0:
                 self.my_ip = results[0]
             else:
@@ -313,7 +313,8 @@ def report_self(self):
     def from_settings(cls, settings):
         server = redis.Redis(host=settings.get('REDIS_HOST'),
                              port=settings.get('REDIS_PORT'),
-                             db=settings.get('REDIS_DB'))
+                             db=settings.get('REDIS_DB'),
+                             decode_responses=True)
         persist = settings.get('SCHEDULER_PERSIST', True)
         up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
         hits = settings.get('QUEUE_HITS', 10)
@@ -501,7 +502,7 @@ def next_request(self):
 
         item = self.find_item()
         if item:
-            self.logger.debug("Found url to crawl {url}" \
+            self.logger.debug(u"Found url to crawl {url}" \
                 .format(url=item['url']))
             try:
                 req = Request(item['url'])
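Note on the recurring change: under Python 3, redis-py returns bytes for every reply, while this code compares and formats the values as str. Passing decode_responses=True makes the client decode replies to str at the connection level; the same option is added to log_retry_middleware.py, redis_stats_middleware.py, and tests/online.py below. The obj.read().decode('utf-8') fix is the same bytes-versus-str issue on the urllib side. A minimal sketch of the Redis behavior, assuming a local server on the default port:

import redis

# Default client: replies are bytes under Python 3.
raw = redis.Redis(host='localhost', port=6379, db=0)
raw.set('demo:key', 'value')
assert raw.get('demo:key') == b'value'

# decode_responses=True decodes every reply to str, so string
# handling written for Python 2 keeps working unchanged.
decoded = redis.Redis(host='localhost', port=6379, db=0,
                      decode_responses=True)
assert decoded.get('demo:key') == 'value'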

crawler/crawling/log_retry_middleware.py

Lines changed: 2 additions & 1 deletion
@@ -61,7 +61,8 @@ def setup(self, settings):
         if self.settings['STATS_STATUS_CODES']:
             self.redis_conn = redis.Redis(host=self.settings.get('REDIS_HOST'),
                                           port=self.settings.get('REDIS_PORT'),
-                                          db=settings.get('REDIS_DB'))
+                                          db=settings.get('REDIS_DB'),
+                                          decode_responses=True)
 
             try:
                 self.redis_conn.info()

crawler/crawling/pipelines.py

Lines changed: 2 additions & 1 deletion
@@ -111,7 +111,8 @@ def from_settings(cls, settings):
             producer = KafkaProducer(bootstrap_servers=settings['KAFKA_HOSTS'],
                                      retries=3,
                                      linger_ms=settings['KAFKA_PRODUCER_BATCH_LINGER_MS'],
-                                     buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES'])
+                                     buffer_memory=settings['KAFKA_PRODUCER_BUFFER_BYTES'],
+                                     value_serializer=lambda m: m.encode('utf-8'))
         except Exception as e:
             logger.error("Unable to connect to Kafka in Pipeline"\
                 ", raising exit flag.")

crawler/crawling/redis_stats_middleware.py

Lines changed: 2 additions & 1 deletion
@@ -41,7 +41,8 @@ def setup(self, settings):
         # set up redis
         self.redis_conn = redis.Redis(host=settings.get('REDIS_HOST'),
                                       port=settings.get('REDIS_PORT'),
-                                      db=settings.get('REDIS_DB'))
+                                      db=settings.get('REDIS_DB'),
+                                      decode_responses=True)
 
         try:
             self.redis_conn.info()

crawler/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ requests==2.13.0 # Updated from 2.11.1
 requests-file==1.4.1 # Updated from 1.4
 retrying==1.3.3
 Scrapy==1.3.3
-scutils==1.2.0
+../utils # scutils==1.3.0dev0
 service-identity==16.0.0
 six==1.10.0
 testfixtures==4.13.5 # Updated from 4.10.0
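The replacement line is a pip local-path requirement rather than a pinned release: pip builds and installs the package found at ../utils (resolved from the directory the install runs in, the crawler folder here), i.e. the repo's own utils package. The trailing # scutils==1.3.0dev0 is a comment, ignored by pip, recording the in-development version this path stands in for.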

crawler/tests/online.py

Lines changed: 4 additions & 2 deletions
@@ -46,7 +46,8 @@ def setUp(self):
         # set up redis
         self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                       port=self.settings['REDIS_PORT'],
-                                      db=self.settings['REDIS_DB'])
+                                      db=self.settings['REDIS_DB'],
+                                      decode_responses=True)
         try:
             self.redis_conn.info()
         except ConnectionError:
@@ -66,7 +67,8 @@ def setUp(self):
                                      group_id="demo-id",
                                      auto_commit_interval_ms=10,
                                      consumer_timeout_ms=5000,
-                                     auto_offset_reset='earliest'
+                                     auto_offset_reset='earliest',
+                                     value_deserializer=lambda m: m.decode('utf-8')
                                      )
         time.sleep(1)
 
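The test consumer gets the mirror-image change: a value_deserializer that decodes each raw message value back to str before the assertions see it. A sketch of the consuming side, under the same placeholder broker and topic as the producer example above:

from kafka import KafkaConsumer

consumer = KafkaConsumer('demo.topic',
                         bootstrap_servers='localhost:9092',
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=5000,
                         value_deserializer=lambda m: m.decode('utf-8'))
for message in consumer:
    # message.value is already a str, not bytes.
    print(message.value)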

docker/crawler/Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -15,8 +15,10 @@ RUN mkdir -p /usr/src/app
 WORKDIR /usr/src/app
 
 # install requirements
+COPY utils /usr/src/utils
 COPY crawler/requirements.txt /usr/src/app/
 RUN pip install --no-cache-dir -r requirements.txt
+RUN rm -rf /usr/src/utils
 
 # move codebase over
 COPY crawler /usr/src/app
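Because the crawler's requirements.txt now points at ../utils, that directory has to exist inside the build when pip runs: with WORKDIR /usr/src/app, ../utils resolves to /usr/src/utils, so the build copies utils in just for the install and deletes it afterwards. One caveat: since the rm -rf runs in its own layer, it tidies the final filesystem but does not reduce the size contributed by the earlier COPY layer.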
