Add support for linting with ruff (#468)

* Add support for linting with ruff
* Add black and enforce black & ruff in CI
* Install dev deps after build so we don't push them
* Run latest black
alephdata · Jun 14, 2023 · 8d0c62b · 8d0c62b
1 parent 1ba0f49
commit 8d0c62b
Show file tree

Hide file tree

Showing 23 changed files with 94 additions and 39 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -21,6 +21,12 @@ jobs:
       - name: Build docker images
         run: |
           make cached-build
+      - name: Install development dependencies
+        run: make dev
+      - name: Check formatting
+        run: make format-check
+      - name: Run linter (ruff)
+        run: make lint
       - name: Run tests
         run: make test
       - name: Push docker images (hash)

diff --git a/Makefile b/Makefile
@@ -26,6 +26,15 @@ services:
 shell: services
 	$(DOCKER) /bin/bash
 
+lint:
+	ruff check .
+
+format:
+	black .
+
+format-check:
+	black --check .
+
 test: services
 	$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term
 
@@ -45,3 +54,7 @@ clean:
 	find . -name '*.pyc' -exec rm -f {} +
 	find . -name '*.pyo' -exec rm -f {} +
 	find . -type d -name __pycache__ -exec rm -r {} \+
+
+dev:
+	python3 -m pip install --upgrade pip
+	python3 -m pip install -q -r requirements-dev.txt
diff --git a/ingestors/analysis/__init__.py b/ingestors/analysis/__init__.py
@@ -41,9 +41,9 @@ def feed(self, entity):
         texts = entity.get_type_values(registry.text)
         for text in text_chunks(texts):
             detect_languages(self.entity, text)
-            for (prop, tag) in extract_entities(self.entity, text):
+            for prop, tag in extract_entities(self.entity, text):
                 self.aggregator_entities.add(prop, tag)
-            for (prop, tag) in extract_patterns(self.entity, text):
+            for prop, tag in extract_patterns(self.entity, text):
                 self.aggregator_patterns.add(prop, tag)
 
     def flush(self):
@@ -55,12 +55,12 @@ def flush(self):
             )
         )
 
-        for (key, prop, values) in results:
+        for key, prop, values in results:
             if prop.type == registry.country:
                 countries.add(key)
 
         mention_ids = set()
-        for (key, prop, values) in results:
+        for key, prop, values in results:
             label = values[0]
             if prop.type == registry.name:
                 label = registry.name.pick(values)

diff --git a/ingestors/analysis/ft_type_model.py b/ingestors/analysis/ft_type_model.py
@@ -8,6 +8,7 @@
 
 log = logging.getLogger(__name__)
 
+
 @SingletonDecorator
 class FTTypeModel(object):
     def __init__(self, model_path):
@@ -28,4 +29,3 @@ def _clean_input(self, values):
     def _clean_labels(self, labels):
         for label in labels:
             yield label[0].replace("__label__", "")
-
diff --git a/ingestors/analysis/language.py b/ingestors/analysis/language.py
@@ -17,7 +17,7 @@ def detect_languages(entity, text, k=1):
         lid_model = fasttext.load_model(settings.LID_MODEL_PATH)
         settings._lang_detector = lid_model
     langs = settings._lang_detector.predict(text, k=k)
-    for (lang, score) in zip(*langs):
+    for lang, score in zip(*langs):
         if score <= THRESHOLD:
             continue
         # fasttext labels are prefixed, with '__label__' by default

diff --git a/ingestors/email/calendar.py b/ingestors/email/calendar.py
@@ -1,6 +1,5 @@
 import logging
 import icalendar
-from vobject.base import ParseError
 from banal import ensure_list
 from followthemoney import model
 from followthemoney.util import sanitize_text

diff --git a/ingestors/misc/jsonfile.py b/ingestors/misc/jsonfile.py
@@ -8,7 +8,6 @@
 
 
 class JSONIngestor(Ingestor, EncodingSupport):
-
     MIME_TYPES = [
         "application/json",
         "text/javascript",

diff --git a/ingestors/support/convert.py b/ingestors/support/convert.py
@@ -1,9 +1,7 @@
 import logging
 import os
 import pathlib
-import tempfile
 import subprocess
-from tempfile import gettempdir
 
 from followthemoney.helpers import entity_filename
 

diff --git a/ingestors/support/email.py b/ingestors/support/email.py
@@ -99,7 +99,7 @@ def get_dates(self, msg, *headers):
 
     def get_identities(self, values):
         values = [v for v in ensure_list(values) if v is not None]
-        for (name, email) in getaddresses(values):
+        for name, email in getaddresses(values):
             yield EmailIdentity(self.manager, name, email)
 
     def get_header_identities(self, msg, *headers):

diff --git a/ingestors/support/pdf.py b/ingestors/support/pdf.py
@@ -96,7 +96,9 @@ def pdf_alternative_extract(self, entity, pdf_path: str, manager):
         entity.set("pdfHash", checksum)
         self.parse_and_ingest(pdf_path, entity, manager)
 
-    def pdf_extract_page(self, pdf_doc: fitz.Document, page: fitz.Page, page_number: int) -> PdfPageModel:
+    def pdf_extract_page(
+        self, pdf_doc: fitz.Document, page: fitz.Page, page_number: int
+    ) -> PdfPageModel:
         """Extract the contents of a single PDF page, using OCR if need be."""
         # Extract text
         fonts = page.get_fonts()

diff --git a/ingestors/util.py b/ingestors/util.py
@@ -1,10 +1,7 @@
 import shutil
 import locale
-import socket
-import random
 from pathlib import Path
 from normality import stringify
-from urllib.parse import urlparse
 from contextlib import contextmanager
 
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1 +1,3 @@
 bump2version==1.0.1
+black==23.3.0
+ruff==0.0.269
diff --git a/ruff.toml b/ruff.toml
@@ -0,0 +1,44 @@
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E501"]
+
+# Allow autofix for all enabled rules (when `--fix` is provided).
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+unfixable = []
+
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
+
+# Same as Black.
+line-length = 88
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+target-version = "py38"
+
+[mccabe]
+# Unlike Flake8, default to a complexity level of 10.
+max-complexity = 10
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,2 @@
 [bdist_wheel]
 universal = 1
-
-[flake8]
-exclude = docs
diff --git a/tests/test_djvu.py b/tests/test_djvu.py
@@ -11,5 +11,5 @@ def test_match(self):
         self.assertEqual(entity.first("mimeType"), "image/vnd.djvu")
 
         self.assertEqual(len(self.manager.entities), 11 + 1)
-        self.assertIn(u"Executive Orders", self.manager.entities[0].first("bodyText"))
+        self.assertIn("Executive Orders", self.manager.entities[0].first("bodyText"))
         self.assertEqual(entity.schema.name, "Pages")
diff --git a/tests/test_emlx.py b/tests/test_emlx.py
@@ -10,13 +10,13 @@ def test_plaintext(self):
         self.manager.ingest(fixture_path, entity)
         self.assertSuccess(entity)
         pprint(entity.to_dict())
-        self.assertEqual(entity.first("subject"), u"Re: Emlx library")
-        self.assertIn(u"Python", entity.first("bodyText"))
+        self.assertEqual(entity.first("subject"), "Re: Emlx library")
+        self.assertIn("Python", entity.first("bodyText"))
 
     def test_richtext(self):
         fixture_path, entity = self.fixture("richtext.emlx")
         self.manager.ingest(fixture_path, entity)
         self.assertSuccess(entity)
         self.assertIn("Emlx library", entity.first("subject"))
-        self.assertIn(u"Python", entity.first("bodyHtml"))
+        self.assertIn("Python", entity.first("bodyHtml"))
         self.assertEqual(entity.schema.name, "Email")
diff --git a/tests/test_html.py b/tests/test_html.py
@@ -17,27 +17,27 @@ def test_ingest_on_unicode_file(self):
             entity.first("indexText"),
             "Ingestors Test web page. The GitHub page.",  # noqa
         )
-        self.assertEqual(entity.first("title"), u"Ingestors Title")
-        self.assertEqual(entity.first("summary"), u"Ingestors description")
+        self.assertEqual(entity.first("title"), "Ingestors Title")
+        self.assertEqual(entity.first("summary"), "Ingestors description")
         self.assertEqual(
             set(entity.get("keywords")), set(["ingestors", "key", "words", "news"])
         )
 
     def test_ingest_extra_fixture(self):
-        fixture_path, entity = self.fixture(u"EDRM Micro Datasets « EDRM.htm")
+        fixture_path, entity = self.fixture("EDRM Micro Datasets « EDRM.htm")
         self.manager.ingest(fixture_path, entity)
 
         self.assertIn(
             "Creating Practical Resources to Improve E-Discovery",
             entity.first("indexText"),
         )
-        self.assertEqual(entity.first("title"), u"EDRM Micro Datasets \xab EDRM")
+        self.assertEqual(entity.first("title"), "EDRM Micro Datasets \xab EDRM")
         self.assertIsNone(entity.first("summary"))
         self.assertEqual(entity.get("keywords"), [])
         self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
 
     def test_ingest_empty(self):
-        fixture_path, entity = self.fixture(u"empty_5_doc_pages.html")
+        fixture_path, entity = self.fixture("empty_5_doc_pages.html")
         self.manager.ingest(fixture_path, entity)
 
         self.assertEqual(entity.first("indexText"), None)

diff --git a/tests/test_image.py b/tests/test_image.py
@@ -12,7 +12,7 @@ def test_ingest_on_svg(self):
         self.manager.ingest(fixture_path, entity)
         # print result.to_dict()
 
-        self.assertIn(u"TEST", entity.first("bodyText"))
+        self.assertIn("TEST", entity.first("bodyText"))
         # self.assertIn(u'1..2..3..', result.pages[0]['text'])
         self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
 

diff --git a/tests/test_jsonfile.py b/tests/test_jsonfile.py
@@ -1,5 +1,3 @@
-import json
-
 from .support import TestCase
 
 

diff --git a/tests/test_msg.py b/tests/test_msg.py
@@ -10,15 +10,15 @@ def test_thunderbird(self):
         self.manager.ingest(fixture_path, entity)
         self.assertSuccess(entity)
         pprint(entity.to_dict())
-        self.assertEqual(entity.first("subject"), u"JUnit test message")
-        self.assertIn(u"Dear Vladimir", entity.first("bodyText"))
+        self.assertEqual(entity.first("subject"), "JUnit test message")
+        self.assertIn("Dear Vladimir", entity.first("bodyText"))
 
     def test_naumann(self):
         fixture_path, entity = self.fixture("fnf.msg")
         self.manager.ingest(fixture_path, entity)
         self.assertSuccess(entity)
         self.assertIn("Innovationskongress", entity.first("subject"))
-        self.assertIn(u"freiheit.org", entity.first("bodyHtml"))
+        self.assertIn("freiheit.org", entity.first("bodyHtml"))
         self.assertEqual(entity.schema.name, "Email")
 
     def test_mbox(self):

diff --git a/tests/test_pdf.py b/tests/test_pdf.py
@@ -234,7 +234,7 @@ def test_ingest_pdf_normalized(self):
             assert expected[page_no] in page_text
 
     def test_pdf_type3_fonts(self):
-        """ From https://github.com/pymupdf/PyMuPDF/issues/1943"""
+        """From https://github.com/pymupdf/PyMuPDF/issues/1943"""
         fixture_path, entity = self.fixture("example.pdf")
         self.manager.ingest(fixture_path, entity)
 
@@ -251,4 +251,4 @@ def test_pdf_type3_fonts(self):
             "487410CB",
             "487574AB",
         ]
-        assert all([word in text for word in expected_words])
+        assert all([word in text for word in expected_words])
diff --git a/tests/test_tabular.py b/tests/test_tabular.py
@@ -24,13 +24,13 @@ def test_unicode_xls(self):
         self.assertEqual(entity.schema.name, "Workbook")
         tables = self.get_emitted("Table")
         tables = [t.first("title") for t in tables]
-        self.assertIn(u"Лист1", tables)
+        self.assertIn("Лист1", tables)
 
     def test_unicode_ods(self):
         fixture_path, entity = self.fixture("rom.ods")
         self.manager.ingest(fixture_path, entity)
         self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
         tables = self.get_emitted("Table")
         tables = [t.first("title") for t in tables]
-        self.assertIn(u"Лист1", tables)
+        self.assertIn("Лист1", tables)
         self.assertEqual(entity.schema.name, "Workbook")
diff --git a/tests/test_text.py b/tests/test_text.py
@@ -14,7 +14,7 @@ def test_match(self):
         self.assertEqual(entity.first("mimeType"), "text/plain")
 
         self.assertEqual(
-            decompose_nfkd(entity.first("bodyText")), decompose_nfkd(u"Îș unî©ođ€.")
+            decompose_nfkd(entity.first("bodyText")), decompose_nfkd("Îș unî©ođ€.")
         )
         self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
         self.assertEqual(entity.schema.name, "PlainText")
@@ -23,7 +23,7 @@ def test_ingest_binary_mode(self):
         fixture_path, entity = self.fixture("non_utf.txt")
         self.manager.ingest(fixture_path, entity)
 
-        self.assertIn(u"größter", entity.first("bodyText"))
+        self.assertIn("größter", entity.first("bodyText"))
         self.assertEqual(entity.schema.name, "PlainText")
 
     def test_ingest_extra_fixture(self):
Original file line number	Diff line number	Diff line change
		@@ -1,5 +1,3 @@
		import json

		from .support import TestCase


Expand Down