Skip to content

Commit

Permalink
Add support for linting with ruff (#468)
Browse files Browse the repository at this point in the history
* Add support for linting with ruff

* Add black and enforce black & ruff in CI

* Install dev deps after build so we don't push them

* Run latest black
  • Loading branch information
stchris authored Jun 14, 2023
1 parent 1ba0f49 commit 8d0c62b
Show file tree
Hide file tree
Showing 23 changed files with 94 additions and 39 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ jobs:
- name: Build docker images
run: |
make cached-build
- name: Install development dependencies
run: make dev
- name: Check formatting
run: make format-check
- name: Run linter (ruff)
run: make lint
- name: Run tests
run: make test
- name: Push docker images (hash)
Expand Down
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ services:
shell: services
$(DOCKER) /bin/bash

lint:
ruff check .

format:
black .

format-check:
black --check .

test: services
$(DOCKER) pytest --cov=ingestors --cov-report html --cov-report term

Expand All @@ -45,3 +54,7 @@ clean:
find . -name '*.pyc' -exec rm -f {} +
find . -name '*.pyo' -exec rm -f {} +
find . -type d -name __pycache__ -exec rm -r {} \+

dev:
python3 -m pip install --upgrade pip
python3 -m pip install -q -r requirements-dev.txt
8 changes: 4 additions & 4 deletions ingestors/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def feed(self, entity):
texts = entity.get_type_values(registry.text)
for text in text_chunks(texts):
detect_languages(self.entity, text)
for (prop, tag) in extract_entities(self.entity, text):
for prop, tag in extract_entities(self.entity, text):
self.aggregator_entities.add(prop, tag)
for (prop, tag) in extract_patterns(self.entity, text):
for prop, tag in extract_patterns(self.entity, text):
self.aggregator_patterns.add(prop, tag)

def flush(self):
Expand All @@ -55,12 +55,12 @@ def flush(self):
)
)

for (key, prop, values) in results:
for key, prop, values in results:
if prop.type == registry.country:
countries.add(key)

mention_ids = set()
for (key, prop, values) in results:
for key, prop, values in results:
label = values[0]
if prop.type == registry.name:
label = registry.name.pick(values)
Expand Down
2 changes: 1 addition & 1 deletion ingestors/analysis/ft_type_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

log = logging.getLogger(__name__)


@SingletonDecorator
class FTTypeModel(object):
def __init__(self, model_path):
Expand All @@ -28,4 +29,3 @@ def _clean_input(self, values):
def _clean_labels(self, labels):
for label in labels:
yield label[0].replace("__label__", "")

2 changes: 1 addition & 1 deletion ingestors/analysis/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def detect_languages(entity, text, k=1):
lid_model = fasttext.load_model(settings.LID_MODEL_PATH)
settings._lang_detector = lid_model
langs = settings._lang_detector.predict(text, k=k)
for (lang, score) in zip(*langs):
for lang, score in zip(*langs):
if score <= THRESHOLD:
continue
# fasttext labels are prefixed, with '__label__' by default
Expand Down
1 change: 0 additions & 1 deletion ingestors/email/calendar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import icalendar
from vobject.base import ParseError
from banal import ensure_list
from followthemoney import model
from followthemoney.util import sanitize_text
Expand Down
1 change: 0 additions & 1 deletion ingestors/misc/jsonfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@


class JSONIngestor(Ingestor, EncodingSupport):

MIME_TYPES = [
"application/json",
"text/javascript",
Expand Down
2 changes: 0 additions & 2 deletions ingestors/support/convert.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import logging
import os
import pathlib
import tempfile
import subprocess
from tempfile import gettempdir

from followthemoney.helpers import entity_filename

Expand Down
2 changes: 1 addition & 1 deletion ingestors/support/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def get_dates(self, msg, *headers):

def get_identities(self, values):
values = [v for v in ensure_list(values) if v is not None]
for (name, email) in getaddresses(values):
for name, email in getaddresses(values):
yield EmailIdentity(self.manager, name, email)

def get_header_identities(self, msg, *headers):
Expand Down
4 changes: 3 additions & 1 deletion ingestors/support/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ def pdf_alternative_extract(self, entity, pdf_path: str, manager):
entity.set("pdfHash", checksum)
self.parse_and_ingest(pdf_path, entity, manager)

def pdf_extract_page(self, pdf_doc: fitz.Document, page: fitz.Page, page_number: int) -> PdfPageModel:
def pdf_extract_page(
self, pdf_doc: fitz.Document, page: fitz.Page, page_number: int
) -> PdfPageModel:
"""Extract the contents of a single PDF page, using OCR if need be."""
# Extract text
fonts = page.get_fonts()
Expand Down
3 changes: 0 additions & 3 deletions ingestors/util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import shutil
import locale
import socket
import random
from pathlib import Path
from normality import stringify
from urllib.parse import urlparse
from contextlib import contextmanager


Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
bump2version==1.0.1
black==23.3.0
ruff==0.0.269
44 changes: 44 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"]

# Allow autofix for all enabled rules (when `--fix` is provided).
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []

# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".mypy_cache",
".nox",
".pants.d",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv",
]

# Same as Black.
line-length = 88

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

target-version = "py38"

[mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 10
3 changes: 0 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
[bdist_wheel]
universal = 1

[flake8]
exclude = docs
2 changes: 1 addition & 1 deletion tests/test_djvu.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ def test_match(self):
self.assertEqual(entity.first("mimeType"), "image/vnd.djvu")

self.assertEqual(len(self.manager.entities), 11 + 1)
self.assertIn(u"Executive Orders", self.manager.entities[0].first("bodyText"))
self.assertIn("Executive Orders", self.manager.entities[0].first("bodyText"))
self.assertEqual(entity.schema.name, "Pages")
6 changes: 3 additions & 3 deletions tests/test_emlx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ def test_plaintext(self):
self.manager.ingest(fixture_path, entity)
self.assertSuccess(entity)
pprint(entity.to_dict())
self.assertEqual(entity.first("subject"), u"Re: Emlx library")
self.assertIn(u"Python", entity.first("bodyText"))
self.assertEqual(entity.first("subject"), "Re: Emlx library")
self.assertIn("Python", entity.first("bodyText"))

def test_richtext(self):
fixture_path, entity = self.fixture("richtext.emlx")
self.manager.ingest(fixture_path, entity)
self.assertSuccess(entity)
self.assertIn("Emlx library", entity.first("subject"))
self.assertIn(u"Python", entity.first("bodyHtml"))
self.assertIn("Python", entity.first("bodyHtml"))
self.assertEqual(entity.schema.name, "Email")
10 changes: 5 additions & 5 deletions tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,27 @@ def test_ingest_on_unicode_file(self):
entity.first("indexText"),
"Ingestors Test web page. The GitHub page.", # noqa
)
self.assertEqual(entity.first("title"), u"Ingestors Title")
self.assertEqual(entity.first("summary"), u"Ingestors description")
self.assertEqual(entity.first("title"), "Ingestors Title")
self.assertEqual(entity.first("summary"), "Ingestors description")
self.assertEqual(
set(entity.get("keywords")), set(["ingestors", "key", "words", "news"])
)

def test_ingest_extra_fixture(self):
fixture_path, entity = self.fixture(u"EDRM Micro Datasets « EDRM.htm")
fixture_path, entity = self.fixture("EDRM Micro Datasets « EDRM.htm")
self.manager.ingest(fixture_path, entity)

self.assertIn(
"Creating Practical Resources to Improve E-Discovery",
entity.first("indexText"),
)
self.assertEqual(entity.first("title"), u"EDRM Micro Datasets \xab EDRM")
self.assertEqual(entity.first("title"), "EDRM Micro Datasets \xab EDRM")
self.assertIsNone(entity.first("summary"))
self.assertEqual(entity.get("keywords"), [])
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)

def test_ingest_empty(self):
fixture_path, entity = self.fixture(u"empty_5_doc_pages.html")
fixture_path, entity = self.fixture("empty_5_doc_pages.html")
self.manager.ingest(fixture_path, entity)

self.assertEqual(entity.first("indexText"), None)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_ingest_on_svg(self):
self.manager.ingest(fixture_path, entity)
# print result.to_dict()

self.assertIn(u"TEST", entity.first("bodyText"))
self.assertIn("TEST", entity.first("bodyText"))
# self.assertIn(u'1..2..3..', result.pages[0]['text'])
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)

Expand Down
2 changes: 0 additions & 2 deletions tests/test_jsonfile.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import json

from .support import TestCase


Expand Down
6 changes: 3 additions & 3 deletions tests/test_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ def test_thunderbird(self):
self.manager.ingest(fixture_path, entity)
self.assertSuccess(entity)
pprint(entity.to_dict())
self.assertEqual(entity.first("subject"), u"JUnit test message")
self.assertIn(u"Dear Vladimir", entity.first("bodyText"))
self.assertEqual(entity.first("subject"), "JUnit test message")
self.assertIn("Dear Vladimir", entity.first("bodyText"))

def test_naumann(self):
fixture_path, entity = self.fixture("fnf.msg")
self.manager.ingest(fixture_path, entity)
self.assertSuccess(entity)
self.assertIn("Innovationskongress", entity.first("subject"))
self.assertIn(u"freiheit.org", entity.first("bodyHtml"))
self.assertIn("freiheit.org", entity.first("bodyHtml"))
self.assertEqual(entity.schema.name, "Email")

def test_mbox(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def test_ingest_pdf_normalized(self):
assert expected[page_no] in page_text

def test_pdf_type3_fonts(self):
""" From https://github.com/pymupdf/PyMuPDF/issues/1943"""
"""From https://github.com/pymupdf/PyMuPDF/issues/1943"""
fixture_path, entity = self.fixture("example.pdf")
self.manager.ingest(fixture_path, entity)

Expand All @@ -251,4 +251,4 @@ def test_pdf_type3_fonts(self):
"487410CB",
"487574AB",
]
assert all([word in text for word in expected_words])
assert all([word in text for word in expected_words])
4 changes: 2 additions & 2 deletions tests/test_tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ def test_unicode_xls(self):
self.assertEqual(entity.schema.name, "Workbook")
tables = self.get_emitted("Table")
tables = [t.first("title") for t in tables]
self.assertIn(u"Лист1", tables)
self.assertIn("Лист1", tables)

def test_unicode_ods(self):
fixture_path, entity = self.fixture("rom.ods")
self.manager.ingest(fixture_path, entity)
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
tables = self.get_emitted("Table")
tables = [t.first("title") for t in tables]
self.assertIn(u"Лист1", tables)
self.assertIn("Лист1", tables)
self.assertEqual(entity.schema.name, "Workbook")
4 changes: 2 additions & 2 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_match(self):
self.assertEqual(entity.first("mimeType"), "text/plain")

self.assertEqual(
decompose_nfkd(entity.first("bodyText")), decompose_nfkd(u"Îș unî©ođ€.")
decompose_nfkd(entity.first("bodyText")), decompose_nfkd("Îș unî©ođ€.")
)
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
self.assertEqual(entity.schema.name, "PlainText")
Expand All @@ -23,7 +23,7 @@ def test_ingest_binary_mode(self):
fixture_path, entity = self.fixture("non_utf.txt")
self.manager.ingest(fixture_path, entity)

self.assertIn(u"größter", entity.first("bodyText"))
self.assertIn("größter", entity.first("bodyText"))
self.assertEqual(entity.schema.name, "PlainText")

def test_ingest_extra_fixture(self):
Expand Down

0 comments on commit 8d0c62b

Please sign in to comment.