Support pyspark sql dataframe validation #1243

Merged: 9 commits, Jul 6, 2023
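For context before the file-by-file diff: this PR adds validation of native pyspark.sql DataFrames (the new pandera/pyspark.py, pyspark_engine, pyspark_sql_accessor, and typing modules appear in the .pylintrc ignore list below). A minimal usage sketch, assuming the DataFrameModel/Field API that pandera documents for this feature; column names and data are illustrative:

    import pyspark.sql.types as T
    from decimal import Decimal
    from pyspark.sql import SparkSession

    import pandera.pyspark as pa

    spark = SparkSession.builder.getOrCreate()

    class PanderaSchema(pa.DataFrameModel):
        # pyspark.sql type instances annotate each column; Field checks
        # mirror the pandas API (gt, str_startswith, ...)
        id: T.IntegerType() = pa.Field(gt=5)
        product: T.StringType() = pa.Field(str_startswith="B")
        price: T.DecimalType(20, 5) = pa.Field()

    df = spark.createDataFrame(
        [(5, "Bread", Decimal("44.4")), (15, "Butter", Decimal("99.0"))],
        schema="id int, product string, price decimal(20,5)",
    )

    # Unlike the pandas backend, failures are collected rather than raised:
    df_out = PanderaSchema.validate(check_obj=df)
    print(df_out.pandera.errors)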
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
- id: check-yaml
description: Attempts to load all yaml files to verify syntax
- id: debug-statements
description: Check for debugger imports and py37+ breakpoint() calls in python source
description: Check for debugger imports and py37+ breakpoint() calls in python source
- id: end-of-file-fixer
description: Makes sure files end in a newline and only a newline
- id: trailing-whitespace
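The debug-statements hook mentioned above flags leftover debugger usage; a small illustrative snippet (not from this PR) of the kind of code it catches:

    import pdb  # flagged: debugger import

    pdb.set_trace()  # flagged: debugger call
    breakpoint()  # flagged: the py37+ built-in referenced in the description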
6 changes: 4 additions & 2 deletions .pylintrc
@@ -1,5 +1,6 @@
[BASIC]
ignore=mypy.py,noxfile.py
ignore=mypy.py,noxfile.py,pandera/accessors/pyspark_sql_accessor.py,pandera/engines/pyspark_engine.py,pandera/pyspark.py,pandera/typing/pyspark_sql.py,
ignore-patterns=pandera/api/pyspark/*,tests/pyspark/*
good-names=
T,
F,
@@ -45,4 +46,5 @@ disable=
function-redefined,
arguments-differ,
unnecessary-dunder-call,
use-dict-literal
use-dict-literal,
invalid-name
76 changes: 46 additions & 30 deletions asv_bench/benchmarks/dataframe_schema.py
@@ -2,9 +2,20 @@
import pandas as pd

from pandera import (
Column, DataFrameSchema, Bool, Category, Check,
DateTime, Float, Int, Object, String, Timedelta,
check_input, check_output)
Column,
DataFrameSchema,
Bool,
Category,
Check,
DateTime,
Float,
Int,
Object,
String,
Timedelta,
check_input,
check_output,
)


class Validate:
@@ -14,41 +25,46 @@ class Validate:

def setup(self):
self.schema = DataFrameSchema(
{
"a": Column(Int),
"b": Column(Float),
"c": Column(String),
"d": Column(Bool),
"e": Column(Category),
"f": Column(Object),
"g": Column(DateTime),
"i": Column(Timedelta),
},
)
{
"a": Column(Int),
"b": Column(Float),
"c": Column(String),
"d": Column(Bool),
"e": Column(Category),
"f": Column(Object),
"g": Column(DateTime),
"i": Column(Timedelta),
},
)
self.df = pd.DataFrame(
{
"a": [1, 2, 3],
"b": [1.1, 2.5, 9.9],
"c": ["z", "y", "x"],
"d": [True, True, False],
"e": pd.Series(["c2", "c1", "c3"], dtype="category"),
"f": [(3,), (2,), (1,)],
"g": [pd.Timestamp("2015-02-01"),
pd.Timestamp("2015-02-02"),
pd.Timestamp("2015-02-03")],
"i": [pd.Timedelta(1, unit="D"),
pd.Timedelta(5, unit="D"),
pd.Timedelta(9, unit="D")]
})
{
"a": [1, 2, 3],
"b": [1.1, 2.5, 9.9],
"c": ["z", "y", "x"],
"d": [True, True, False],
"e": pd.Series(["c2", "c1", "c3"], dtype="category"),
"f": [(3,), (2,), (1,)],
"g": [
pd.Timestamp("2015-02-01"),
pd.Timestamp("2015-02-02"),
pd.Timestamp("2015-02-03"),
],
"i": [
pd.Timedelta(1, unit="D"),
pd.Timedelta(5, unit="D"),
pd.Timedelta(9, unit="D"),
],
}
)

def time_df_schema(self):
self.schema.validate(self.df)

def mem_df_schema(self):
self.schema.validate(self.df)

def peakmem_df_schema(self):
self.schema.validate(self.df)


class Decorators:
42 changes: 27 additions & 15 deletions asv_bench/benchmarks/series_schema.py
@@ -2,8 +2,20 @@
import pandas as pd

from pandera import (
Column, DataFrameSchema, SeriesSchema, Bool, Category, Check,
DateTime, Float, Int, Object, String, Timedelta, String)
Column,
DataFrameSchema,
SeriesSchema,
Bool,
Category,
Check,
DateTime,
Float,
Int,
Object,
String,
Timedelta,
String,
)


class Validate:
@@ -13,23 +25,23 @@ class Validate:

def setup(self):
self.schema = SeriesSchema(
String,
checks=[
Check(lambda s: s.str.startswith("foo")),
Check(lambda s: s.str.endswith("bar")),
Check(lambda x: len(x) > 3, element_wise=True)
],
nullable=False,
unique=False,
name="my_series")
self.series = pd.Series(["foobar", "foobar", "foobar"],
name="my_series")
String,
checks=[
Check(lambda s: s.str.startswith("foo")),
Check(lambda s: s.str.endswith("bar")),
Check(lambda x: len(x) > 3, element_wise=True),
],
nullable=False,
unique=False,
name="my_series",
)
self.series = pd.Series(["foobar", "foobar", "foobar"], name="my_series")

def time_series_schema(self):
self.schema.validate(self.series)

def mem_series_schema(self):
self.schema.validate(self.series)

def peakmem_series_schema(self):
self.schema.validate(self.series)
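Both benchmark modules follow airspeed velocity (asv) naming conventions, which the reformatting above preserves; a schematic reminder (illustrative, not part of the diff) of what each method prefix selects:

    class Validate:
        def setup(self):
            ...  # runs before each benchmark and is excluded from timing

        def time_series_schema(self):
            ...  # time_*: wall-clock timing benchmark

        def mem_series_schema(self):
            ...  # mem_*: sizeof-based memory benchmark

        def peakmem_series_schema(self):
            ...  # peakmem_*: peak resident memory of the whole call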
Binary file added docs/.DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -188,6 +188,7 @@
)
copybutton_prompt_is_regexp = True


# this is a workaround to filter out forward reference issue in
# sphinx_autodoc_typehints
class FilterPandasTypeAnnotationWarning(pylogging.Filter):
@@ -215,6 +216,7 @@ def filter(self, record: pylogging.LogRecord) -> bool:
FilterPandasTypeAnnotationWarning()
)


# based on pandas/doc/source/conf.py
def linkcode_resolve(domain, info):
"""Determine the URL corresponding to Python object."""
3 changes: 2 additions & 1 deletion docs/source/dataframe_models.rst
@@ -205,10 +205,11 @@ You can easily convert a :class:`~pandera.api.pandas.model.DataFrameModel` class
coerce=False,
dtype=None,
index=None,
strict=False
strict=False,
name=InputSchema,
ordered=False,
unique_column_names=False,
metadata=None,
add_missing_columns=False
)>

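The repr in this hunk is what to_schema() prints for the docs' InputSchema model; a minimal sketch of reproducing it (the field definition here is illustrative):

    import pandera as pa
    from pandera.typing import Series

    class InputSchema(pa.DataFrameModel):
        year: Series[int] = pa.Field(gt=2000)

    # prints the <Schema DataFrameSchema(...)> repr shown above, now including
    # the metadata and add_missing_columns fields added by this change
    print(InputSchema.to_schema())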
6 changes: 4 additions & 2 deletions docs/source/dataframe_schemas.rst
@@ -847,10 +847,11 @@ data pipeline:
coerce=False,
dtype=None,
index=None,
strict=True
strict=True,
name=None,
ordered=False,
unique_column_names=False,
metadata=None,
add_missing_columns=False
)>

@@ -896,10 +897,11 @@ the pipeline output.
name=None,
ordered=True
)>,
strict=True
strict=True,
name=None,
ordered=False,
unique_column_names=False,
metadata=None,
add_missing_columns=False
)>

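Both updated doc reprs gain metadata and add_missing_columns entries. A sketch of a schema that sets them explicitly, assuming the constructor keywords match the repr fields:

    import pandera as pa

    schema = pa.DataFrameSchema(
        {"a": pa.Column(int)},
        strict=True,
        metadata=None,  # free-form, schema-level metadata
        add_missing_columns=False,  # if True, missing columns are added on validate
    )
    print(schema)  # repr ends with the two new fields shown in the diff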
2 changes: 1 addition & 1 deletion docs/source/pyspark.rst
@@ -2,7 +2,7 @@

.. _scaling_pyspark:

Data Validation with Pyspark ⭐️ (New)
Data Validation with Pyspark Pandas
=======================================

*new in 0.10.0*
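The retitle separates the pre-existing pyspark.pandas support (documented on this page) from the new pyspark.sql validation this PR introduces. A minimal sketch of the pyspark.pandas flavor, assuming pandera's documented pandera.typing.pyspark types:

    import pandera as pa
    import pyspark.pandas as ps
    from pandera.typing.pyspark import DataFrame, Series

    class Schema(pa.DataFrameModel):
        state: Series[str]
        price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})

    # check_types validates the pyspark.pandas frame at the function boundary
    @pa.check_types
    def process(df: DataFrame[Schema]) -> DataFrame[Schema]:
        return df

    process(ps.DataFrame({"state": ["FL", "CA"], "price": [8, 12]}))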