Extend Ruff to tests and with extra rules (#83)
This PR makes Ruff run on tests as well and adds extra rules:
- More Pylint rules not yet covered by default checks
- Isort for import order
- Pydocstyle rules
- I ignored quite a few rules here (D100-D107), which we might want to activate, but we can do that in separate PRs as they require manual effort.

I only had to fix one issue and add one local ignore. All other issues
were auto-fixed, so these rules shouldn't lead to additional overhead.
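To make the new rule sets concrete, here is a small illustrative Python snippet (not taken from this commit; the function and variable names are made up for the example) showing the style they enforce: standard-library and third-party imports sorted into separate groups (the isort "I" rules) and docstrings that start on the first line and end with a period (the pydocstyle "D" rules). The still-ignored codes D100-D107 are the ones that flag missing module, package, class, method, and function docstrings.

"""Example module docstring; omitting it is what D100 would flag."""
import logging  # standard-library imports form the first group (isort)

import dask.dataframe as dd  # third-party imports form a separate group

logger = logging.getLogger(__name__)


def describe_dataframe(df: dd.DataFrame) -> str:
    """Return and log a short description of the dataframe's columns.

    A summary line ending with a period satisfies the pydocstyle rules
    enabled in this PR; D100-D107 (missing docstrings) stay ignored.
    """
    description = f"Columns of dataframe: {list(df.columns)}"
    logger.info(description)
    return description

Most of the diff below consists of exactly these kinds of mechanical changes: reordered import blocks and docstrings that gain a trailing period.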
RobbeSneyders authored May 8, 2023
1 parent 02c0f06 commit b0465d0
Showing 21 changed files with 112 additions and 113 deletions.
8 changes: 2 additions & 6 deletions .pre-commit-config.yaml
@@ -3,17 +3,13 @@ ci:
autoupdate_schedule: monthly
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.254'
rev: 'v0.0.264'
hooks:
- id: ruff
files: |
(?x)^(
fondant/.*|
examples/pipelines/hf_dataset_pipeline|
examples/pipelines/finetune_stable_diffusion/components/load_from_hub_component|
examples/pipelines/finetune_stable_diffusion/components/image_filter_component|
examples/pipelines/finetune_stable_diffusion/components/embedding_component|
examples/pipelines/finetune_stable_diffusion/dataset_creation_pipeline.py|
tests/.*|
)$
args: [--fix, --exit-non-zero-on-fix]

@@ -2,7 +2,6 @@
This component filters images of the dataset based on image size (minimum height and width).
"""
import logging
from typing import Dict

import dask.dataframe as dd

@@ -29,7 +29,7 @@ class KubeflowConfig(GeneralConfig):
HOST (str): kfp host url
"""

BASE_PATH = f"gcs://soy-audio-379412_kfp-artifacts/custom_artifact"
BASE_PATH = "gcs://soy-audio-379412_kfp-artifacts/custom_artifact"
CLUSTER_NAME = "kfp-fondant"
CLUSTER_ZONE = "europe-west4-a"
HOST = "https://52074149b1563463-dot-europe-west1.pipelines.googleusercontent.com"
2 changes: 1 addition & 1 deletion examples/pipelines/simple_pipeline/simple_pipeline.py
@@ -47,7 +47,7 @@ def simple_pipeline(
).set_display_name("Load initial images")

# Component 2
image_filter_task = image_filtering_op(
image_filtering_op(
input_manifest_path=load_from_hub_task.outputs["output_manifest_path"],
min_width=image_filter_min_width,
min_height=image_filter_min_height,
27 changes: 13 additions & 14 deletions fondant/component.py
@@ -8,21 +8,21 @@
import argparse
import json
import logging
from pathlib import Path
import typing as t
from abc import ABC, abstractmethod
from pathlib import Path

import dask.dataframe as dd

from fondant.component_spec import FondantComponentSpec, kubeflow2python_type, Argument
from fondant.component_spec import Argument, FondantComponentSpec, kubeflow2python_type
from fondant.dataset import FondantDataset
from fondant.manifest import Manifest

logger = logging.getLogger(__name__)


class FondantComponent(ABC):
"""Abstract base class for a Fondant component"""
"""Abstract base class for a Fondant component."""

def __init__(self, spec: FondantComponentSpec) -> None:
self.spec = spec
@@ -32,7 +32,7 @@ def __init__(self, spec: FondantComponentSpec) -> None:
def from_file(
cls, path: t.Union[str, Path] = "../fondant_component.yaml"
) -> "FondantComponent":
"""Create a component from a component spec file
"""Create a component from a component spec file.
Args:
path: Path to the component spec file
@@ -55,7 +55,7 @@ def _get_component_arguments(self) -> t.Dict[str, Argument]:

@abstractmethod
def _add_and_parse_args(self) -> argparse.Namespace:
"""Abstract method to add and parse the component arguments"""
"""Abstract method to add and parse the component arguments."""

@property
def user_arguments(self) -> t.Dict[str, t.Any]:
@@ -68,17 +68,16 @@ def user_arguments(self) -> t.Dict[str, t.Any]:

@abstractmethod
def _load_or_create_manifest(self) -> Manifest:
"""Abstract method that returns the dataset manifest"""
"""Abstract method that returns the dataset manifest."""

@abstractmethod
def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""Abstract method that processes the input dataframe of the `FondantDataset` and
returns another dataframe"""
returns another dataframe.
"""

def run(self):
"""
Runs the component.
"""
"""Runs the component."""
input_manifest = self._load_or_create_manifest()
input_dataset = FondantDataset(input_manifest)

@@ -100,7 +99,7 @@ def upload_manifest(self, manifest: Manifest, save_path: str):


class FondantLoadComponent(FondantComponent):
"""Base class for a Fondant load component"""
"""Base class for a Fondant load component."""

def _add_and_parse_args(self):
parser = argparse.ArgumentParser()
@@ -145,7 +144,7 @@ def _load_or_create_manifest(self) -> Manifest:

@abstractmethod
def load(self, **kwargs) -> dd.DataFrame:
"""Abstract method that loads the initial dataframe"""
"""Abstract method that loads the initial dataframe."""

def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""This function loads the initial dataframe sing the user-provided `load` method.
@@ -160,7 +159,7 @@ def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:


class FondantTransformComponent(FondantComponent):
"""Base class for a Fondant transform component"""
"""Base class for a Fondant transform component."""

def _add_and_parse_args(self):
parser = argparse.ArgumentParser()
@@ -181,7 +180,7 @@ def _load_or_create_manifest(self) -> Manifest:

@abstractmethod
def transform(self, dataframe: dd.DataFrame, **kwargs) -> dd.DataFrame:
"""Abstract method for applying data transformations to the input dataframe"""
"""Abstract method for applying data transformations to the input dataframe."""

def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""
25 changes: 12 additions & 13 deletions fondant/component_spec.py
@@ -1,19 +1,19 @@
"""This module defines classes to represent an Fondant component specification."""
import copy
import json
import yaml
import pkgutil
import types
import typing as t
from pathlib import Path
from dataclasses import dataclass
from pathlib import Path

import jsonschema.exceptions
import yaml
from jsonschema import Draft4Validator
from jsonschema.validators import RefResolver

from fondant.exceptions import InvalidComponentSpec
from fondant.schema import Field, Type, KubeflowCommandArguments
from fondant.schema import Field, KubeflowCommandArguments, Type

# TODO: Change after upgrading to kfp v2
# :https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/
@@ -40,7 +40,7 @@
@dataclass
class Argument:
"""
Kubeflow component argument
Kubeflow component argument.
Args:
name: name of the argument
@@ -94,11 +94,10 @@ def __init__(self, specification: t.Dict[str, t.Any]) -> None:
self._validate_spec()

def _validate_spec(self) -> None:
"""Validate a component specification against the component schema
"""Validate a component specification against the component schema.
Raises: InvalidComponent when the component specification is not valid.
"""

spec_data = pkgutil.get_data("fondant", "schemas/component_spec.json")

if spec_data is None:
@@ -118,13 +117,13 @@ def _validate_spec(self) -> None:

@classmethod
def from_file(cls, path: t.Union[str, Path]) -> "FondantComponentSpec":
"""Load the component spec from the file specified by the provided path"""
"""Load the component spec from the file specified by the provided path."""
with open(path, encoding="utf-8") as file_:
specification = yaml.safe_load(file_)
return cls(specification)

def to_file(self, path) -> None:
"""Dump the component spec to the file specified by the provided path"""
"""Dump the component spec to the file specified by the provided path."""
with open(path, "w", encoding="utf-8") as file_:
yaml.dump(self._specification, file_)

@@ -142,7 +141,7 @@ def image(self):

@property
def input_subsets(self) -> t.Mapping[str, ComponentSubset]:
"""The input subsets of the component as an immutable mapping"""
"""The input subsets of the component as an immutable mapping."""
return types.MappingProxyType(
{
name: ComponentSubset(subset)
@@ -153,7 +152,7 @@ def output_subsets(self) -> t.Mapping[str, ComponentSubset]:

@property
def output_subsets(self) -> t.Mapping[str, ComponentSubset]:
"""The output subsets of the component as an immutable mapping"""
"""The output subsets of the component as an immutable mapping."""
return types.MappingProxyType(
{
name: ComponentSubset(subset)
@@ -267,7 +266,7 @@ def _dump_args(args: t.Iterable[Argument]) -> KubeflowCommandArguments:
return dumped_args

def to_file(self, path: t.Union[str, Path]) -> None:
"""Dump the component specification to the file specified by the provided path"""
"""Dump the component specification to the file specified by the provided path."""
with open(path, "w", encoding="utf-8") as file_:
yaml.dump(
self._specification,
@@ -279,7 +278,7 @@ def to_file(self, path: t.Union[str, Path]) -> None:

@property
def input_arguments(self) -> t.Mapping[str, Argument]:
"""The input arguments of the component as an immutable mapping"""
"""The input arguments of the component as an immutable mapping."""
return types.MappingProxyType(
{
info["name"]: Argument(
@@ -293,7 +292,7 @@ def input_arguments(self) -> t.Mapping[str, Argument]:

@property
def output_arguments(self) -> t.Mapping[str, Argument]:
"""The output arguments of the component as an immutable mapping"""
"""The output arguments of the component as an immutable mapping."""
return types.MappingProxyType(
{
info["name"]: Argument(
15 changes: 6 additions & 9 deletions fondant/dataset.py
@@ -6,7 +6,7 @@
import dask.dataframe as dd

from fondant.component_spec import FondantComponentSpec
from fondant.manifest import Manifest, Field
from fondant.manifest import Field, Manifest

logger = logging.getLogger(__name__)

@@ -37,7 +37,6 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame:
Returns:
The subset as a dask dataframe
"""

subset = self.manifest.subsets[subset_name]
remote_path = subset.location
index_fields = list(self.manifest.index.fields.keys())
@@ -78,7 +77,7 @@ def _load_index(self) -> dd.DataFrame:
def load_dataframe(self, spec: FondantComponentSpec) -> dd.DataFrame:
"""
Function that loads the subsets defined in the component spec as a single Dask dataframe for
the user
the user.
Args:
spec: the fondant component spec
@@ -95,7 +94,7 @@ def load_dataframe(self, spec: FondantComponentSpec) -> dd.DataFrame:
# left joins -> filter on index
df = dd.merge(df, subset_df, on=["id", "source"], how="left")

logging.info("Columns of dataframe:", list(df.columns))
logging.info(f"Columns of dataframe: {list(df.columns)}")

return df

@@ -116,7 +115,6 @@ def _create_write_dataframe_task(
A delayed Dask task that uploads the DataFrame to the remote storage location when
executed.
"""

# Define task to upload index to remote storage
write_task = dd.to_parquet(
df, remote_path, schema=schema, overwrite=False, compute=False
@@ -126,6 +124,7 @@ def write_index(self, df: dd.DataFrame):
def write_index(self, df: dd.DataFrame):
"""
Write the index dataframe to a remote location.
Args:
df: The output Dask dataframe returned by the user.
"""
@@ -163,7 +162,7 @@ def verify_subset_columns(
) -> t.List[str]:
"""
Verify that all the fields defined in the output subset are present in
the output dataframe
the output dataframe.
"""
# TODO: add logic for `additional fields`
subset_columns = [f"{subset_name}_{field}" for field in subset_fields]
@@ -181,9 +180,7 @@ def create_subset_dataframe(
def create_subset_dataframe(
subset_name: str, subset_columns: t.List[str], df: dd.DataFrame
):
"""
Create subset dataframe to save with the original field name as the column name
"""
"""Create subset dataframe to save with the original field name as the column name."""
# Create a new dataframe with only the columns needed for the output subset
subset_df = df[subset_columns]

19 changes: 8 additions & 11 deletions fondant/import_utils.py
@@ -1,11 +1,9 @@
"""
Import utils
"""
import logging
import importlib.util
"""Import utils."""
import importlib.metadata
from pathlib import Path
import importlib.util
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

@@ -36,9 +34,8 @@ def is_package_available(package_name: str, import_error_msg: str) -> bool:
package_name (str): the name of the package
import_error_msg (str): the error message to return if the package is not found
Returns:
bool: check if package is available
bool: check if package is available.
"""

package_available = importlib.util.find_spec(package_name) is not None

try:
Expand All @@ -54,15 +51,15 @@ def is_package_available(package_name: str, import_error_msg: str) -> bool:


def is_datasets_available():
"""Check if 'datasets' is available"""
"""Check if 'datasets' is available."""
return is_package_available("datasets", DATASETS_IMPORT_ERROR)


def is_pandas_available():
"""Check if 'pandas' is available"""
"""Check if 'pandas' is available."""
return is_package_available("pandas", PANDAS_IMPORT_ERROR)


def is_kfp_available():
"""Check if 'pandas' is available"""
"""Check if 'pandas' is available."""
return is_package_available("kfp", KFP_IMPORT_ERROR)
4 changes: 2 additions & 2 deletions fondant/logger.py
@@ -1,14 +1,14 @@
"""Utils file with useful methods."""

import logging
import sys
import os
import sys

LOG_LEVEL = os.environ.get("LOG_LEVEL", default="INFO")


def configure_logging(log_level=LOG_LEVEL) -> None:
"""Configure the root logger based on config settings"""
"""Configure the root logger based on config settings."""
logger = logging.getLogger()

# set loglevel