Commit 95ec629

logging: File log handler changed to Stream handler. Sentry extension…
oguzdemirbasci committed Aug 3, 2021
1 parent 2cd44d9 commit 95ec629
Showing 15 changed files with 79 additions and 18 deletions.
21 changes: 21 additions & 0 deletions hepcrawl/extensions.py
@@ -11,7 +11,10 @@

from __future__ import absolute_import, division, print_function

import sentry_sdk

from scrapy import signals
from scrapy.exceptions import NotConfigured


class ErrorHandler(object):
@@ -31,3 +34,21 @@ def spider_error(self, failure, response, spider, signal=None, sender=None, *arg
            'exception': failure,
            'sender': response,
        })


class SentryLogging(object):
    """
    Send exceptions and errors to Sentry.
    """

    @classmethod
    def from_crawler(cls, crawler):
        sentry_dsn = crawler.settings.get('SENTRY_DSN', None)
        if sentry_dsn is None:
            raise NotConfigured
        # instantiate the extension object
        ext = cls()
        # initialise the Sentry SDK with the configured DSN
        sentry_sdk.init(sentry_dsn)
        # return the extension object
        return ext
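
For context, a brief sketch (not part of this commit): once sentry_sdk.init() has run, the SDK's default logging integration turns ERROR-level log records into Sentry events on its own, so errors logged by Scrapy reach Sentry without any file log handler. The DSN and logger name below are placeholders.

import logging

import sentry_sdk

# Placeholder DSN; in the extension above it comes from the SENTRY_DSN setting.
sentry_sdk.init("https://examplePublicKey@o0.ingest.sentry.io/0")

logging.getLogger("hepcrawl.demo").warning("kept as a breadcrumb only")  # INFO and above become breadcrumbs
logging.getLogger("hepcrawl.demo").error("sent to Sentry as an event")   # ERROR and above become events
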
4 changes: 0 additions & 4 deletions hepcrawl/pipelines.py
@@ -138,7 +138,6 @@ def _prepare_payload(self, spider):
            job_id=os.environ['SCRAPY_JOB'],
            results_uri=os.environ['SCRAPY_FEED_URI'],
            results_data=self.results_data,
            log_file=os.environ['SCRAPY_LOG_FILE'],
        )
        payload['errors'] = [
            {'exception': str(err['exception']), 'sender':str(err['sender'])}
@@ -168,9 +167,6 @@ def close_spider(self, spider):
"kwargs": self._prepare_payload(spider)
}

spider.logger.info(
'Sending results:\n%s' % pprint.pformat(json_data))

requests.post(api_url, json=json_data)

self._cleanup(spider)
2 changes: 1 addition & 1 deletion hepcrawl/scrapyd.cfg
@@ -1,6 +1,6 @@
# [scrapyd]
# eggs_dir = eggs
# logs_dir = logs
# logs_dir =
# items_dir =
# jobs_to_keep = 5
# dbs_dir = dbs
10 changes: 8 additions & 2 deletions hepcrawl/settings.py
@@ -106,8 +106,9 @@
SENTRY_DSN = os.environ.get('APP_SENTRY_DSN')
if SENTRY_DSN:
    EXTENSIONS = {
        'scrapy_sentry.extensions.Errors': 100,
        'hepcrawl.extensions.ErrorHandler': 200,
        'hepcrawl.extensions.SentryLogging': 100,
        'scrapy_sentry.extensions.Errors': 200,
        'hepcrawl.extensions.ErrorHandler': 300,
    }

# Configure item pipelines
@@ -178,6 +179,11 @@ def redis_url(service_name, database):
    'SERVER_NAME': 'https://labs.inspirehep.net',
}

# LOGGER Settings
LOG_FILE = None
LOGS_ENABLED = True
LOG_LEVEL = 'INFO'

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
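With LOG_FILE set to None, Scrapy's configure_logging() attaches a StreamHandler instead of a FileHandler, so crawl logs go to stderr (the new tests/unit/test_settings.py below checks exactly this). A hedged sketch, not part of the commit, of how a one-off run could still opt back into a file log by overriding the setting; the spider name and log path are placeholders:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('LOG_FILE', '/tmp/one_off_crawl.log')  # override only for this run

process = CrawlerProcess(settings)  # configure_logging() runs here with the override applied
process.crawl('arXiv')              # placeholder spider name
process.start()

The same override is available on the command line as scrapy crawl <spider> -s LOG_FILE=/tmp/one_off_crawl.log.
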
3 changes: 1 addition & 2 deletions hepcrawl/testlib/tasks.py
@@ -28,7 +28,7 @@ class Config(object):


@app.task
def submit_results(job_id, errors, log_file, results_uri, results_data=None):
def submit_results(job_id, errors, results_uri, results_data=None):
"""Receive the submission of the results of a crawl job."""

def _extract_results_data(results_path):
@@ -52,7 +52,6 @@ def _extract_results_data(results_path):
    return {
        'job_id': job_id,
        'errors': errors,
        'log_file': log_file,
        'results_url': results_uri,
        'results_data': results_data,
    }
3 changes: 2 additions & 1 deletion setup.py
@@ -46,7 +46,7 @@
    'Twisted~=18.0,>=18.9.0',
    #latex parsing
    'pylatexenc~=2.9',
    'queuelib==1.5.0'
    'queuelib==1.5.0',
]

tests_require = [
@@ -73,6 +73,7 @@
    'sentry': [
        'raven~=6.0,>=6.2.1',
        'scrapy-sentry',
        'sentry-sdk==1.3.0',
    ],
}

2 changes: 1 addition & 1 deletion tests/functional/arxiv/test_arxiv.py
@@ -107,7 +107,7 @@ def test_arxiv(
        crawler_instance=crawler,
        project=config['CRAWLER_PROJECT'],
        spider=spider,
        settings={},
        settings={'LOG_FILE': None},
        **config['CRAWLER_ARGUMENTS']
    )

1 change: 0 additions & 1 deletion tests/unit/responses/aps/aps_single_parsed.json
@@ -1,7 +1,6 @@
{
"errors": [],
"results_uri": "scrapy_feed_uri",
"log_file": "scrapy_log_file",
"results_data": [
{
"page_nr": [
@@ -1,7 +1,6 @@
{
"errors": [],
"results_uri": "scrapy_feed_uri",
"log_file": "scrapy_log_file",
"results_data": [
{
"_collections": ["Literature"],
1 change: 0 additions & 1 deletion tests/unit/test_desy.py
@@ -46,7 +46,6 @@ def get_records(response_file_name):
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
22 changes: 21 additions & 1 deletion tests/unit/test_extensions.py
@@ -14,11 +14,13 @@
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings

from hepcrawl.extensions import ErrorHandler
from hepcrawl.extensions import ErrorHandler, SentryLogging
from hepcrawl.spiders.wsp_spider import WorldScientificSpider

from hepcrawl.testlib.fixtures import fake_response_from_file

import mock


@pytest.fixture
def crawler():
@@ -40,3 +42,21 @@ def test_error_handler(crawler):
    assert 'errors' in crawler.spider.state
    assert crawler.spider.state['errors'][0]["exception"] == "Some failure"
    assert crawler.spider.state['errors'][0]["sender"] == response


@mock.patch("hepcrawl.extensions.sentry_sdk.init")
def test_sentry_logging_init(mock_sentry_sdk, crawler):
"""Test SentryLogging extension."""
log_settings = {
"SENTRY_DSN" : "TEST_SENTRY_DSN",
"EXTENSIONS" : {
'hepcrawl.extensions.SentryLogging': 100,
'scrapy_sentry.extensions.Errors': 200,
'hepcrawl.extensions.ErrorHandler': 300,
}
}
settings = get_project_settings()
settings.update(log_settings)
crawler.settings = settings
SentryLogging.from_crawler(crawler)
mock_sentry_sdk.assert_called_once()
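
The new test exercises the configured path; for completeness, a hypothetical snippet (not in the commit) showing the opt-out path, where a missing SENTRY_DSN makes from_crawler() raise NotConfigured and Scrapy simply skips the extension:

from scrapy.exceptions import NotConfigured
from scrapy.utils.test import get_crawler

from hepcrawl.extensions import SentryLogging

crawler = get_crawler(settings_dict={})  # no SENTRY_DSN configured
try:
    SentryLogging.from_crawler(crawler)
except NotConfigured:
    print("SentryLogging disabled: SENTRY_DSN not configured")
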
1 change: 0 additions & 1 deletion tests/unit/test_pipelines.py
@@ -75,7 +75,6 @@ def test_prepare_payload(
    _, json_record = json_spider_record
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    fixed_time = expected_response['results_data'][0]['acquisition_source']['datetime']
    freezer = freeze_time(fixed_time)
1 change: 0 additions & 1 deletion tests/unit/test_pos.py
@@ -49,7 +49,6 @@ def generated_conference_paper(scrape_pos_conference_paper_page_body):
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
24 changes: 24 additions & 0 deletions tests/unit/test_settings.py
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2015, 2016, 2017, 2019 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

import logging

from scrapy.utils.project import get_project_settings
from scrapy.utils.log import (configure_logging, logger)


def test_log_settings():
    settings = get_project_settings()
    assert settings.get('LOG_FILE') == None
    assert settings.get('LOGS_ENABLED') == True
    assert settings.get('LOG_LEVEL') == 'INFO'

    configure_logging(settings=settings)
    assert any(isinstance(handler, logging.StreamHandler) for handler in logger.root.handlers)
    assert not any(isinstance(handler, logging.FileHandler) for handler in logger.root.handlers)
1 change: 0 additions & 1 deletion tests/unit/test_world_scientific.py
@@ -45,7 +45,6 @@ def get_records(response_file_name):
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    spider = create_spider()
    records = spider.parse(
