diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..86809b7 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +exclude: + __pycache__ + random_headers_list.py + __init__.py + ./airflow/modules/tests/* \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 9b39621..da1fe3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,8 @@ python: services: - docker -before_script: pip install docker-compose +before_script: pip install docker-compose flake8 script: - docker-compose run airflow sh -c "python -m pytest -v --show-capture=no" + - python -m flake8 diff --git a/airflow/dags/dags_config.py b/airflow/dags/dags_config.py index 5b4b423..021f77f 100644 --- a/airflow/dags/dags_config.py +++ b/airflow/dags/dags_config.py @@ -9,7 +9,7 @@ class Config: REDIS_CONFIG = { "host": "redis", "port": "6379", - "db": 0 + "db": 0 } REDIS_KEY = "proxies" @@ -26,6 +26,6 @@ class Config: "deadspin": "https://deadspin.com/rss" } - BOOTSTRAP_SERVERS = ["kafka:9092"] + BOOTSTRAP_SERVERS = ["kafka:9092"] TOPIC = "rss" diff --git a/airflow/dags/rss_news_dag.py b/airflow/dags/rss_news_dag.py index 8920319..aaa6360 100644 --- a/airflow/dags/rss_news_dag.py +++ b/airflow/dags/rss_news_dag.py @@ -1,18 +1,16 @@ from datetime import datetime from airflow import DAG from airflow.operators.python_operator import PythonOperator -from airflow.operators.bash_operator import BashOperator from dags_config import Config as config from rss_news import export_news_to_broker from proxypool import update_proxypool - def dummy_callable(task_id, action, dag): def foo(action): return f"{datetime.now()}: {action} scrapping RSS feeds!" 
- + return PythonOperator( task_id=task_id, python_callable=foo, @@ -39,7 +37,7 @@ def exporting_events(config, rss_feed, dag): ) proxypool = PythonOperator( - task_id=f"updating_proxypoool", + task_id="updating_proxypoool", python_callable=update_proxypool, op_kwargs={"config": config}, dag=dag diff --git a/airflow/modules/log/__init__.py b/airflow/modules/log/__init__.py index 721610f..4f65c7c 100644 --- a/airflow/modules/log/__init__.py +++ b/airflow/modules/log/__init__.py @@ -1 +1 @@ -from log.log import log, Logger \ No newline at end of file +from log.log import log, Logger diff --git a/airflow/modules/parser/__init__.py b/airflow/modules/parser/__init__.py index e9b7c93..54e695d 100644 --- a/airflow/modules/parser/__init__.py +++ b/airflow/modules/parser/__init__.py @@ -1 +1 @@ -from parser.web_parser import WebParser \ No newline at end of file +from parser.web_parser import WebParser diff --git a/airflow/modules/parser/web_parser.py b/airflow/modules/parser/web_parser.py index c958a02..259d76e 100644 --- a/airflow/modules/parser/web_parser.py +++ b/airflow/modules/parser/web_parser.py @@ -2,7 +2,7 @@ import re import random from contextlib import closing -from requests import get +from requests import get from log import log from parser.random_headers_list import headers_list diff --git a/airflow/modules/proxypool/__init__.py b/airflow/modules/proxypool/__init__.py index 24f2fca..ae58fd4 100644 --- a/airflow/modules/proxypool/__init__.py +++ b/airflow/modules/proxypool/__init__.py @@ -1,4 +1,4 @@ from proxypool.redis_proxypool_client import RedisProxyPoolClient from proxypool.main import update_proxypool from proxypool.proxypool_scraper import ProxyPoolScraper, ProxyRecord -from proxypool.proxypool_validator import ProxyPoolValidator \ No newline at end of file +from proxypool.proxypool_validator import ProxyPoolValidator diff --git a/airflow/modules/proxypool/main.py b/airflow/modules/proxypool/main.py index 795b82f..59e3151 100644 --- 
a/airflow/modules/proxypool/main.py +++ b/airflow/modules/proxypool/main.py @@ -13,9 +13,9 @@ def update_proxypool(config): with ThreadPoolExecutor(max_workers=config.MAX_WORKERS) as executor: results = executor.map(proxy_validator.validate_proxy, proxy_stream) valid_proxies = filter( - lambda x: x.is_valid == True, results + lambda x: x.is_valid is True, results ) - + with RedisProxyPoolClient(config.REDIS_KEY, config.REDIS_CONFIG) as client: client.override_existing_proxies( [json.dumps(record.proxy) for record in valid_proxies] diff --git a/airflow/modules/proxypool/proxypool_scraper.py b/airflow/modules/proxypool/proxypool_scraper.py index df1bd99..ae0cb96 100644 --- a/airflow/modules/proxypool/proxypool_scraper.py +++ b/airflow/modules/proxypool/proxypool_scraper.py @@ -6,7 +6,7 @@ @dataclass class ProxyRecord: ip_address: str - port: int + port: int country_code: str country: str anonymity: str @@ -20,7 +20,7 @@ def __post_init__(self): def format_proxy(self): protocol = "https" if self.https == "yes" else "http" - url = f"{protocol}://{self.ip_address}:{self.port}" + url = f"{protocol}://{self.ip_address}:{self.port}" return {"http": url, "https": url} @@ -35,7 +35,8 @@ def get_proxy_stream(self, limit): map(self._clear_up_record, raw_records) ) for record in clean_records[:limit]: - if record: yield ProxyRecord(*record) + if record: + yield ProxyRecord(*record) def extract_table_raw_records(self): content = self.parser.get_content() @@ -48,6 +49,6 @@ def extract_table_raw_records(self): def _clear_up_record(self, raw_record): return [ - val.text for val + val.text for val in raw_record.find_all("td") ] diff --git a/airflow/modules/proxypool/proxypool_validator.py b/airflow/modules/proxypool/proxypool_validator.py index 2e2b620..9071aca 100644 --- a/airflow/modules/proxypool/proxypool_validator.py +++ b/airflow/modules/proxypool/proxypool_validator.py @@ -5,14 +5,14 @@ @dataclass(frozen=True) class ProxyStatus: - proxy: str + proxy: str is_valid: bool @log 
class ProxyPoolValidator: def __init__(self, url, timeout=10): - self.timeout = timeout + self.timeout = timeout self.parser = WebParser(url, rotate_header=True) def validate_proxy(self, proxy_record): @@ -21,7 +21,7 @@ def validate_proxy(self, proxy_record): proxies=proxy_record.proxy ) proxy_status = ProxyStatus( - proxy_record.proxy, + proxy_record.proxy, content is not None ) self.logger.info(f"Proxy status: {proxy_status}") diff --git a/airflow/modules/proxypool/redis_proxypool_client.py b/airflow/modules/proxypool/redis_proxypool_client.py index 9448958..bd43e51 100644 --- a/airflow/modules/proxypool/redis_proxypool_client.py +++ b/airflow/modules/proxypool/redis_proxypool_client.py @@ -33,4 +33,4 @@ def __exit__(self, type, value, traceback): client_id = self.redis.client_id() self.redis.client_kill_filter( _id=client_id - ) \ No newline at end of file + ) diff --git a/airflow/modules/retry/__init__.py b/airflow/modules/retry/__init__.py index a0d429a..2664db0 100644 --- a/airflow/modules/retry/__init__.py +++ b/airflow/modules/retry/__init__.py @@ -1 +1 @@ -from retry.retry_on_exception import RetryOnException \ No newline at end of file +from retry.retry_on_exception import RetryOnException diff --git a/airflow/modules/retry/retry_on_exception.py b/airflow/modules/retry/retry_on_exception.py index e56fdab..7ea2d85 100644 --- a/airflow/modules/retry/retry_on_exception.py +++ b/airflow/modules/retry/retry_on_exception.py @@ -23,7 +23,7 @@ def wrapper(*args, **kwargs): return wrapper def _raise_on_condition(self, retries, exception): - if retries == 0: + if retries == 0: raise exception - else: + else: self.logger.info(f"Retries: {retries}") diff --git a/airflow/modules/rss_news/__init__.py b/airflow/modules/rss_news/__init__.py index 38664fb..3619d4b 100644 --- a/airflow/modules/rss_news/__init__.py +++ b/airflow/modules/rss_news/__init__.py @@ -1,3 +1,3 @@ from rss_news.main import export_news_to_broker from rss_news.rss_news_producer import NewsProducer, 
NewsFormatter, News -from rss_news.rss_news_exporter import NewsExporter \ No newline at end of file +from rss_news.rss_news_exporter import NewsExporter diff --git a/airflow/modules/rss_news/main.py b/airflow/modules/rss_news/main.py index c173d37..e460e13 100644 --- a/airflow/modules/rss_news/main.py +++ b/airflow/modules/rss_news/main.py @@ -1,4 +1,3 @@ -import argparse from proxypool import RedisProxyPoolClient from log import Logger from rss_news.rss_news_producer import NewsProducer @@ -22,4 +21,3 @@ def export_news_to_broker(config, rss_feed): config.TOPIC, news.as_dict() ) - diff --git a/airflow/modules/rss_news/rss_news_exporter.py b/airflow/modules/rss_news/rss_news_exporter.py index 5ad9bcb..f6cffd2 100644 --- a/airflow/modules/rss_news/rss_news_exporter.py +++ b/airflow/modules/rss_news/rss_news_exporter.py @@ -9,7 +9,7 @@ def __init__(self, bootstrap_servers): bootstrap_servers=bootstrap_servers, value_serializer=lambda x: self._encode(x) ) - + def _encode(self, value): return json.dumps(value).encode("utf-8") @@ -28,4 +28,3 @@ def export_news_to_broker(self, topic, record, sleep_time=0.01): def __exit__(self, type, value, traceback): self.producer.close() - \ No newline at end of file diff --git a/airflow/modules/rss_news/rss_news_producer.py b/airflow/modules/rss_news/rss_news_producer.py index 391242e..2ba572e 100644 --- a/airflow/modules/rss_news/rss_news_producer.py +++ b/airflow/modules/rss_news/rss_news_producer.py @@ -2,10 +2,8 @@ import re from dataclasses import dataclass import atoma -from dateutil import parser import langdetect from parser import WebParser -from rss_news.rss_news_exporter import NewsExporter @dataclass(frozen=True) @@ -17,7 +15,7 @@ class News: description: str author: str language: str - + def as_dict(self): return self.__dict__ @@ -33,7 +31,7 @@ def _extract_news_feed_items(self, proxies): return news_feed.items def get_news_stream(self, proxies): - news_feed_items = self._extract_news_feed_items(proxies) + 
news_feed_items = self._extract_news_feed_items(proxies) for entry in news_feed_items: formatted_entry = self.formatter.format_entry(entry) yield formatted_entry diff --git a/airflow/modules/tests/fixtures.py b/airflow/modules/tests/fixtures.py index 8b98c1e..ec0e5a0 100644 --- a/airflow/modules/tests/fixtures.py +++ b/airflow/modules/tests/fixtures.py @@ -36,11 +36,11 @@ def proxy_record(): yield ProxyRecord( "127.0.0.1", 8080, - "PL", - "POLAND", - "gold", - "no", - "no", + "PL", + "POLAND", + "gold", + "no", + "no", "30 minutes ago" ) @@ -65,7 +65,7 @@ def redis_config(): yield { "host": "redis", "port": "6379", - "db": 0 + "db": 0 } @@ -76,7 +76,7 @@ def helper(status_code): response.status_code = status_code response.headers['Content-Type'] = "text/html" return response - yield helper + yield helper @pytest.fixture() @@ -86,7 +86,7 @@ def helper(filename): "tests", f"dataresources/{filename}" ) - + yield helper @@ -94,7 +94,7 @@ def add_function(): @retry(5) - def func(a , b): + def func(a, b): return a + b - yield func \ No newline at end of file + yield func diff --git a/airflow/modules/tests/proxypool/test_proxypool_scraper.py b/airflow/modules/tests/proxypool/test_proxypool_scraper.py index 7d15ca4..5896a50 100644 --- a/airflow/modules/tests/proxypool/test_proxypool_scraper.py +++ b/airflow/modules/tests/proxypool/test_proxypool_scraper.py @@ -1,4 +1,3 @@ -import pytest from proxypool import ProxyRecord from unittest.mock import patch @@ -8,7 +7,7 @@ @patch("parser.web_parser.WebParser.get_content") def test_get_proxy_stream(get_content, raw_content, web_parser, scraper): get_content.return_value = raw_content("proxy_list_file.txt") - + scraper.parser = web_parser stream = scraper.get_proxy_stream(5) diff --git a/airflow/modules/tests/proxypool/test_proxypool_validator.py b/airflow/modules/tests/proxypool/test_proxypool_validator.py index 784a4b7..2b5f552 100644 ---
b/airflow/modules/tests/proxypool/test_proxypool_validator.py @@ -1,4 +1,3 @@ -import pytest from unittest.mock import patch from proxypool import ProxyPoolValidator diff --git a/airflow/modules/tests/proxypool/test_redis_proxypool_client.py b/airflow/modules/tests/proxypool/test_redis_proxypool_client.py index f183277..c76bedf 100644 --- a/airflow/modules/tests/proxypool/test_redis_proxypool_client.py +++ b/airflow/modules/tests/proxypool/test_redis_proxypool_client.py @@ -1,7 +1,5 @@ import json from unittest.mock import patch -import pytest -import fakeredis from proxypool import RedisProxyPoolClient from ..fixtures import redis_config, redis_mock, proxies @@ -32,7 +30,7 @@ def test_list_existing_proxies(redis, redis_config, redis_mock, proxies): redis_client = RedisProxyPoolClient(key, redis_config) redis_client.redis = redis_mock - + result = redis_client.list_existing_proxies() - + assert result == proxies diff --git a/airflow/modules/tests/rss_news/test_rss_news_exporter.py b/airflow/modules/tests/rss_news/test_rss_news_exporter.py index 5b9565c..b11954e 100644 --- a/airflow/modules/tests/rss_news/test_rss_news_exporter.py +++ b/airflow/modules/tests/rss_news/test_rss_news_exporter.py @@ -16,5 +16,5 @@ def test_export_news_to_broker(export_news_to_broker): } export_news_to_broker(topic, news) - + export_news_to_broker.assert_called_once_with(topic, news) diff --git a/airflow/modules/tests/rss_news/test_rss_news_producer.py b/airflow/modules/tests/rss_news/test_rss_news_producer.py index 94228eb..f32fef6 100644 --- a/airflow/modules/tests/rss_news/test_rss_news_producer.py +++ b/airflow/modules/tests/rss_news/test_rss_news_producer.py @@ -34,7 +34,7 @@ def test_construct_id(formatter, title, expected_id): def test_unify_date(formatter): expected = "2020-05-17 00:00:00" - + date = datetime.datetime(2020, 5, 17) result = formatter.unify_date(date) diff --git a/airflow/requirements.txt b/airflow/requirements.txt index 08cedb8..bd0b7c3 100644 --- 
a/airflow/requirements.txt +++ b/airflow/requirements.txt @@ -7,4 +7,5 @@ redis==3.5.3 requests==2.23.0 fakeredis==1.4.1 langdetect==1.0.8 -pytest==5.4.3 \ No newline at end of file +pytest==5.4.3 +flake8==3.8.3 \ No newline at end of file diff --git a/api/service/app.py b/api/service/app.py index 93dfa9a..a95bbb5 100644 --- a/api/service/app.py +++ b/api/service/app.py @@ -22,7 +22,7 @@ def get(self, phrase): "title": phrase } } - } + } ) news_list = response.get("hits").get("hits")