Extend Ruff to tests and with extra rules (#83)
This PR makes Ruff run on tests as well and adds extra rules:
- More Pylint rules not yet covered by default checks
- Isort for import order
- Pydocstyle rules
- I ignored quite a few rules here (D100-D107), which we might want to activate, but we can do that in separate PRs as they require manual effort.

I only had to fix one issue and add one local ignore. All other issues
were auto-fixed, so these rules shouldn't lead to additional overhead.
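To make the new rule sets concrete, here is a small illustrative Python snippet (not taken from this commit; the function and variable names are made up for the example) showing the style they enforce: standard-library and third-party imports sorted into separate groups (the isort "I" rules) and docstrings that start on the first line and end with a period (the pydocstyle "D" rules). The still-ignored codes D100-D107 are the ones that flag missing module, package, class, method, and function docstrings.

"""Example module docstring; omitting it is what D100 would flag."""
import logging  # standard-library imports form the first group (isort)

import dask.dataframe as dd  # third-party imports form a separate group

logger = logging.getLogger(__name__)


def describe_dataframe(df: dd.DataFrame) -> str:
    """Return and log a short description of the dataframe's columns.

    A summary line ending with a period satisfies the pydocstyle rules
    enabled in this PR; D100-D107 (missing docstrings) stay ignored.
    """
    description = f"Columns of dataframe: {list(df.columns)}"
    logger.info(description)
    return description

Most of the diff below consists of exactly these kinds of mechanical changes: reordered import blocks and docstrings that gain a trailing period.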
RobbeSneyders authored May 8, 2023
1 parent 02c0f06 commit b0465d0
Showing 21 changed files with 112 additions and 113 deletions.
8 changes: 2 additions & 6 deletions .pre-commit-config.yaml
@@ -3,17 +3,13 @@ ci:
autoupdate_schedule: monthly
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.254'
rev: 'v0.0.264'
hooks:
- id: ruff
files: |
(?x)^(
fondant/.*|
examples/pipelines/hf_dataset_pipeline|
examples/pipelines/finetune_stable_diffusion/components/load_from_hub_component|
examples/pipelines/finetune_stable_diffusion/components/image_filter_component|
examples/pipelines/finetune_stable_diffusion/components/embedding_component|
examples/pipelines/finetune_stable_diffusion/dataset_creation_pipeline.py|
tests/.*|
)$
args: [--fix, --exit-non-zero-on-fix]

@@ -2,7 +2,6 @@
This component filters images of the dataset based on image size (minimum height and width).
"""
import logging
from typing import Dict

import dask.dataframe as dd

@@ -29,7 +29,7 @@ class KubeflowConfig(GeneralConfig):
HOST (str): kfp host url
"""

BASE_PATH = f"gcs://soy-audio-379412_kfp-artifacts/custom_artifact"
BASE_PATH = "gcs://soy-audio-379412_kfp-artifacts/custom_artifact"
CLUSTER_NAME = "kfp-fondant"
CLUSTER_ZONE = "europe-west4-a"
HOST = "https://52074149b1563463-dot-europe-west1.pipelines.googleusercontent.com"
2 changes: 1 addition & 1 deletion examples/pipelines/simple_pipeline/simple_pipeline.py
@@ -47,7 +47,7 @@ def simple_pipeline(
).set_display_name("Load initial images")

# Component 2
image_filter_task = image_filtering_op(
image_filtering_op(
input_manifest_path=load_from_hub_task.outputs["output_manifest_path"],
min_width=image_filter_min_width,
min_height=image_filter_min_height,
27 changes: 13 additions & 14 deletions fondant/component.py
@@ -8,21 +8,21 @@
import argparse
import json
import logging
from pathlib import Path
import typing as t
from abc import ABC, abstractmethod
from pathlib import Path

import dask.dataframe as dd

from fondant.component_spec import FondantComponentSpec, kubeflow2python_type, Argument
from fondant.component_spec import Argument, FondantComponentSpec, kubeflow2python_type
from fondant.dataset import FondantDataset
from fondant.manifest import Manifest

logger = logging.getLogger(__name__)


class FondantComponent(ABC):
"""Abstract base class for a Fondant component"""
"""Abstract base class for a Fondant component."""

def __init__(self, spec: FondantComponentSpec) -> None:
self.spec = spec
@@ -32,7 +32,7 @@ def __init__(self, spec: FondantComponentSpec) -> None:
def from_file(
cls, path: t.Union[str, Path] = "../fondant_component.yaml"
) -> "FondantComponent":
"""Create a component from a component spec file
"""Create a component from a component spec file.
Args:
path: Path to the component spec file
@@ -55,7 +55,7 @@ def _get_component_arguments(self) -> t.Dict[str, Argument]:

@abstractmethod
def _add_and_parse_args(self) -> argparse.Namespace:
"""Abstract method to add and parse the component arguments"""
"""Abstract method to add and parse the component arguments."""

@property
def user_arguments(self) -> t.Dict[str, t.Any]:
@@ -68,17 +68,16 @@ def user_arguments(self) -> t.Dict[str, t.Any]:

@abstractmethod
def _load_or_create_manifest(self) -> Manifest:
"""Abstract method that returns the dataset manifest"""
"""Abstract method that returns the dataset manifest."""

@abstractmethod
def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""Abstract method that processes the input dataframe of the `FondantDataset` and
returns another dataframe"""
returns another dataframe.
"""

def run(self):
"""
Runs the component.
"""
"""Runs the component."""
input_manifest = self._load_or_create_manifest()
input_dataset = FondantDataset(input_manifest)

@@ -100,7 +99,7 @@ def upload_manifest(self, manifest: Manifest, save_path: str):


class FondantLoadComponent(FondantComponent):
"""Base class for a Fondant load component"""
"""Base class for a Fondant load component."""

def _add_and_parse_args(self):
parser = argparse.ArgumentParser()
@@ -145,7 +144,7 @@ def _load_or_create_manifest(self) -> Manifest:

@abstractmethod
def load(self, **kwargs) -> dd.DataFrame:
"""Abstract method that loads the initial dataframe"""
"""Abstract method that loads the initial dataframe."""

def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""This function loads the initial dataframe sing the user-provided `load` method.
@@ -160,7 +159,7 @@ def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:


class FondantTransformComponent(FondantComponent):
"""Base class for a Fondant transform component"""
"""Base class for a Fondant transform component."""

def _add_and_parse_args(self):
parser = argparse.ArgumentParser()
@@ -181,7 +180,7 @@ def _load_or_create_manifest(self) -> Manifest:

@abstractmethod
def transform(self, dataframe: dd.DataFrame, **kwargs) -> dd.DataFrame:
"""Abstract method for applying data transformations to the input dataframe"""
"""Abstract method for applying data transformations to the input dataframe."""

def _process_dataset(self, dataset: FondantDataset) -> dd.DataFrame:
"""
25 changes: 12 additions & 13 deletions fondant/component_spec.py
@@ -1,19 +1,19 @@
"""This module defines classes to represent an Fondant component specification."""
import copy
import json
import yaml
import pkgutil
import types
import typing as t
from pathlib import Path
from dataclasses import dataclass
from pathlib import Path

import jsonschema.exceptions
import yaml
from jsonschema import Draft4Validator
from jsonschema.validators import RefResolver

from fondant.exceptions import InvalidComponentSpec
from fondant.schema import Field, Type, KubeflowCommandArguments
from fondant.schema import Field, KubeflowCommandArguments, Type

# TODO: Change after upgrading to kfp v2
# :https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/
@@ -40,7 +40,7 @@
@dataclass
class Argument:
"""
Kubeflow component argument
Kubeflow component argument.
Args:
name: name of the argument
@@ -94,11 +94,10 @@ def __init__(self, specification: t.Dict[str, t.Any]) -> None:
self._validate_spec()

def _validate_spec(self) -> None:
"""Validate a component specification against the component schema
"""Validate a component specification against the component schema.
Raises: InvalidComponent when the component specification is not valid.
"""

spec_data = pkgutil.get_data("fondant", "schemas/component_spec.json")

if spec_data is None:
@@ -118,13 +117,13 @@ def _validate_spec(self) -> None:

@classmethod
def from_file(cls, path: t.Union[str, Path]) -> "FondantComponentSpec":
"""Load the component spec from the file specified by the provided path"""
"""Load the component spec from the file specified by the provided path."""
with open(path, encoding="utf-8") as file_:
specification = yaml.safe_load(file_)
return cls(specification)

def to_file(self, path) -> None:
"""Dump the component spec to the file specified by the provided path"""
"""Dump the component spec to the file specified by the provided path."""
with open(path, "w", encoding="utf-8") as file_:
yaml.dump(self._specification, file_)

@@ -142,7 +141,7 @@ def image(self):

@property
def input_subsets(self) -> t.Mapping[str, ComponentSubset]:
"""The input subsets of the component as an immutable mapping"""
"""The input subsets of the component as an immutable mapping."""
return types.MappingProxyType(
{
name: ComponentSubset(subset)
@@ -153,7 +152,7 @@ def output_subsets(self) -> t.Mapping[str, ComponentSubset]:

@property
def output_subsets(self) -> t.Mapping[str, ComponentSubset]:
"""The output subsets of the component as an immutable mapping"""
"""The output subsets of the component as an immutable mapping."""
return types.MappingProxyType(
{
name: ComponentSubset(subset)
@@ -267,7 +266,7 @@ def _dump_args(args: t.Iterable[Argument]) -> KubeflowCommandArguments:
return dumped_args

def to_file(self, path: t.Union[str, Path]) -> None:
"""Dump the component specification to the file specified by the provided path"""
"""Dump the component specification to the file specified by the provided path."""
with open(path, "w", encoding="utf-8") as file_:
yaml.dump(
self._specification,
@@ -279,7 +278,7 @@ def to_file(self, path: t.Union[str, Path]) -> None:

@property
def input_arguments(self) -> t.Mapping[str, Argument]:
"""The input arguments of the component as an immutable mapping"""
"""The input arguments of the component as an immutable mapping."""
return types.MappingProxyType(
{
info["name"]: Argument(
@@ -293,7 +292,7 @@ def input_arguments(self) -> t.Mapping[str, Argument]:

@property
def output_arguments(self) -> t.Mapping[str, Argument]:
"""The output arguments of the component as an immutable mapping"""
"""The output arguments of the component as an immutable mapping."""
return types.MappingProxyType(
{
info["name"]: Argument(
15 changes: 6 additions & 9 deletions fondant/dataset.py
@@ -6,7 +6,7 @@
import dask.dataframe as dd

from fondant.component_spec import FondantComponentSpec
from fondant.manifest import Manifest, Field
from fondant.manifest import Field, Manifest

logger = logging.getLogger(__name__)

@@ -37,7 +37,6 @@ def _load_subset(self, subset_name: str, fields: t.List[str]) -> dd.DataFrame:
Returns:
The subset as a dask dataframe
"""

subset = self.manifest.subsets[subset_name]
remote_path = subset.location
index_fields = list(self.manifest.index.fields.keys())
@@ -78,7 +77,7 @@ def _load_index(self) -> dd.DataFrame:
def load_dataframe(self, spec: FondantComponentSpec) -> dd.DataFrame:
"""
Function that loads the subsets defined in the component spec as a single Dask dataframe for
the user
the user.
Args:
spec: the fondant component spec
@@ -95,7 +94,7 @@ def load_dataframe(self, spec: FondantComponentSpec) -> dd.DataFrame:
# left joins -> filter on index
df = dd.merge(df, subset_df, on=["id", "source"], how="left")

logging.info("Columns of dataframe:", list(df.columns))
logging.info(f"Columns of dataframe: {list(df.columns)}")

return df

@@ -116,7 +115,6 @@ def _create_write_dataframe_task(
A delayed Dask task that uploads the DataFrame to the remote storage location when
executed.
"""

# Define task to upload index to remote storage
write_task = dd.to_parquet(
df, remote_path, schema=schema, overwrite=False, compute=False
@@ -126,6 +124,7 @@ def write_index(self, df: dd.DataFrame):
def write_index(self, df: dd.DataFrame):
"""
Write the index dataframe to a remote location.
Args:
df: The output Dask dataframe returned by the user.
"""
@@ -163,7 +162,7 @@ def verify_subset_columns(
) -> t.List[str]:
"""
Verify that all the fields defined in the output subset are present in
the output dataframe
the output dataframe.
"""
# TODO: add logic for `additional fields`
subset_columns = [f"{subset_name}_{field}" for field in subset_fields]
@@ -181,9 +180,7 @@ def create_subset_dataframe(
def create_subset_dataframe(
subset_name: str, subset_columns: t.List[str], df: dd.DataFrame
):
"""
Create subset dataframe to save with the original field name as the column name
"""
"""Create subset dataframe to save with the original field name as the column name."""
# Create a new dataframe with only the columns needed for the output subset
subset_df = df[subset_columns]

19 changes: 8 additions & 11 deletions fondant/import_utils.py
@@ -1,11 +1,9 @@
"""
Import utils
"""
import logging
import importlib.util
"""Import utils."""
import importlib.metadata
from pathlib import Path
import importlib.util
import logging
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

@@ -36,9 +34,8 @@ def is_package_available(package_name: str, import_error_msg: str) -> bool:
package_name (str): the name of the package
import_error_msg (str): the error message to return if the package is not found
Returns:
bool: check if package is available
bool: check if package is available.
"""

package_available = importlib.util.find_spec(package_name) is not None

try:
Expand All @@ -54,15 +51,15 @@ def is_package_available(package_name: str, import_error_msg: str) -> bool:


def is_datasets_available():
"""Check if 'datasets' is available"""
"""Check if 'datasets' is available."""
return is_package_available("datasets", DATASETS_IMPORT_ERROR)


def is_pandas_available():
"""Check if 'pandas' is available"""
"""Check if 'pandas' is available."""
return is_package_available("pandas", PANDAS_IMPORT_ERROR)


def is_kfp_available():
"""Check if 'pandas' is available"""
"""Check if 'pandas' is available."""
return is_package_available("kfp", KFP_IMPORT_ERROR)
4 changes: 2 additions & 2 deletions fondant/logger.py
@@ -1,14 +1,14 @@
"""Utils file with useful methods."""

import logging
import sys
import os
import sys

LOG_LEVEL = os.environ.get("LOG_LEVEL", default="INFO")


def configure_logging(log_level=LOG_LEVEL) -> None:
"""Configure the root logger based on config settings"""
"""Configure the root logger based on config settings."""
logger = logging.getLogger()

# set loglevel