Enhancement: drop invalid rows on validate with new param #1189

Merged Jun 23, 2023 (38 commits). The diff below shows changes from 30 of the 38 commits.

Commits
72b3eed  Basic ArraySchema default for str series (kykyi, Mar 22, 2023)
40f851f  Add parameterised test cases for various data types (kykyi, Mar 22, 2023)
e18aa6c  Ensure column has a default (kykyi, Mar 22, 2023)
de6e211  Add some tests asserting Column.default works as expected (kykyi, Mar 22, 2023)
a9c8a40  Add tests asserting default causes an error when there is a dtype mis… (kykyi, Mar 22, 2023)
2e210fa  Remove inplace=True hardcoding, add default as kwarg across various c… (kykyi, Mar 23, 2023)
b626cb8  Simplify Column tests to avoid using DataFrameSchema (kykyi, Mar 23, 2023)
212fdff  Add test to raise error if inplace is False and default is non null (kykyi, Mar 23, 2023)
096afbb  any -> Any (kykyi, Mar 28, 2023)
5acb3dd  clean up PR (cosmicBboy, Apr 14, 2023)
8b709de  remove codecov (cosmicBboy, Apr 14, 2023)
e66abbd  xfail pyspark tests (cosmicBboy, Apr 14, 2023)
91e6250  Merge branch 'unionai-oss:main' into main (kykyi, May 16, 2023)
c2b6e6e  Merge branch 'unionai-oss:main' into main (kykyi, Jun 4, 2023)
5905b19  Simplify drop_invalid into a kwarg for schema.validate() (kykyi, May 22, 2023)
f86f279  Update docstrings (kykyi, May 22, 2023)
5efc041  Add a couple more test cases (kykyi, May 23, 2023)
87bce7c  Re-raise error on drop_invalid false, move some logic into a private … (kykyi, Jun 4, 2023)
fa24980  Add drop_invalid for SeriesSchema (kykyi, Jun 4, 2023)
039fd1c  Add drop_invalid to MultiIndex (kykyi, Jun 4, 2023)
7686b07  Small changes to fix mypy (kykyi, Jun 4, 2023)
478fc5e  More mypy fixes (kykyi, Jun 4, 2023)
bf80ef2  Move run_checks_and_handle_errors into its own method with core chec… (kykyi, Jun 4, 2023)
1458f6b  Remove try/catch (kykyi, Jun 4, 2023)
b5de710  Move drop_logic into its own method for array.py and container.py (kykyi, Jun 4, 2023)
5935b32  drop_invalid -> drop_invalid_data (kykyi, Jun 4, 2023)
0b2f6fb  Remove main() block from test_schemas.py (kykyi, Jun 4, 2023)
3180d31  Fix typo (kykyi, Jun 4, 2023)
95c4413  Add test for ColumnBackend (kykyi, Jun 5, 2023)
2140396  Move drop_invalid from validation to schema init (kykyi, Jun 6, 2023)
0a304e9  Stylistic changes (kykyi, Jun 6, 2023)
39072ff  Remove incorrect rescue logic in ColumnBackend (kykyi, Jun 8, 2023)
94394f9  Add draft docs (kykyi, Jun 9, 2023)
1f14cca  Add functionality for drop_invalid on DataFrameModel schemas (kykyi, Jun 9, 2023)
abc0324  Standardise tests (kykyi, Jun 9, 2023)
75b3cc7  Update docs for DataFrameModel (kykyi, Jun 9, 2023)
9dfba4e  Add docstrings (kykyi, Jun 9, 2023)
e721458  rename of `drop_invalid_rows`, exception handling, update docs (cosmicBboy, Jun 23, 2023)
2 changes: 2 additions & 0 deletions pandera/api/base/schema.py
@@ -32,6 +32,7 @@ def __init__(
name=None,
title=None,
description=None,
drop_invalid=False,
):
"""Abstract base schema initializer."""
self.dtype = dtype
@@ -40,6 +41,7 @@
self.name = name
self.title = title
self.description = description
self.drop_invalid = drop_invalid

def validate(
self,
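Since the flag lands on the abstract base initializer, every concrete schema object carries it. A minimal sketch of the resulting surface, assuming the kwarg name used in this commit range (the final merge commit, e721458, renames it to `drop_invalid_rows`):

    import pandera as pa

    # The flag set in the abstract base is stored as a plain attribute
    # on any concrete schema (kwarg name as of this commit range).
    schema = pa.DataFrameSchema({}, drop_invalid=True)
    assert schema.drop_invalid is True
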
7 changes: 7 additions & 0 deletions pandera/api/pandas/array.py
@@ -37,6 +37,7 @@ def __init__(
title: Optional[str] = None,
description: Optional[str] = None,
default: Optional[Any] = None,
drop_invalid: bool = False,
) -> None:
"""Initialize array schema.

@@ -63,6 +64,8 @@
:param title: A human-readable label for the series.
:param description: An arbitrary textual description of the series.
:param default: The default value for missing values in the series.
:param drop_invalid: if True, drop invalid rows on validation.

"""

super().__init__(
@@ -72,6 +75,7 @@
name=name,
title=title,
description=description,
drop_invalid=drop_invalid,
)

if checks is None:
@@ -300,6 +304,7 @@ def __init__(
title: Optional[str] = None,
description: Optional[str] = None,
default: Optional[Any] = None,
drop_invalid: bool = False,
) -> None:
"""Initialize series schema base object.

@@ -327,6 +332,7 @@
:param title: A human-readable label for the series.
:param description: An arbitrary textual description of the series.
:param default: The default value for missing values in the series.
:param drop_invalid: if True, drop invalid rows on validation.

"""
super().__init__(
@@ -340,6 +346,7 @@
title,
description,
default,
drop_invalid,
)
self.index = index

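A sketch of the series-level API with the new kwarg, under the names used in this diff:

    import pandera as pa

    # SeriesSchema accepts the flag at init time (a sketch; `drop_invalid`
    # is the name at this point in the PR).
    series_schema = pa.SeriesSchema(
        int,
        pa.Check.gt(0),
        drop_invalid=True,  # drop failing elements instead of raising
    )
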
3 changes: 3 additions & 0 deletions pandera/api/pandas/components.py
@@ -30,6 +30,7 @@ def __init__(
title: Optional[str] = None,
description: Optional[str] = None,
default: Optional[Any] = None,
drop_invalid: bool = False,
) -> None:
"""Create column validator object.

@@ -54,6 +55,7 @@
:param title: A human-readable label for the column.
:param description: An arbitrary textual description of the column.
:param default: The default value for missing values in the column.
:param drop_invalid: if True, drop invalid rows on validation.

:raises SchemaInitError: if impossible to build schema from parameters

@@ -85,6 +87,7 @@
title=title,
description=description,
default=default,
drop_invalid=drop_invalid,
)
if (
name is not None
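The flag can also be set per column. A hedged sketch, assuming a column-level flag behaves like the schema-level one at this commit:

    import pandas as pd
    import pandera as pa

    # Column-level sketch (assumption: a per-column drop_invalid drops the
    # offending rows during lazy validation, as the backend code suggests).
    schema = pa.DataFrameSchema(
        {"a": pa.Column(int, pa.Check.ge(0), drop_invalid=True)}
    )
    df = pd.DataFrame({"a": [1, -1, 2]})
    print(schema.validate(df, lazy=True))  # the row holding -1 is dropped
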
4 changes: 3 additions & 1 deletion pandera/api/pandas/container.py
@@ -46,6 +46,7 @@ def __init__(
unique_column_names: bool = False,
title: Optional[str] = None,
description: Optional[str] = None,
drop_invalid: bool = False,
) -> None:
"""Initialize DataFrameSchema validator.

@@ -77,6 +78,7 @@
:param unique_column_names: whether or not column names must be unique.
:param title: A human-readable label for the schema.
:param description: An arbitrary textual description of the schema.
:param drop_invalid: if True, drop invalid rows on validation.

:raises SchemaInitError: if impossible to build schema from parameters

@@ -152,7 +154,7 @@ def __init__(
self._unique = unique
self.report_duplicates = report_duplicates
self.unique_column_names = unique_column_names

self.drop_invalid = drop_invalid
# this attribute is not meant to be accessed by users and is explicitly
# set to True in the case that a schema is created by infer_schema.
self._IS_INFERRED = False
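End to end, the dataframe-level flag would be used like this. A sketch with the kwarg as named in this diff; note that invalid rows are only dropped on the lazy path:

    import pandas as pd
    import pandera as pa

    schema = pa.DataFrameSchema(
        {"a": pa.Column(int, pa.Check.ge(0))},
        drop_invalid=True,
    )
    df = pd.DataFrame({"a": [1, -1, 2]})
    # lazy=True collects all failures first; the backend then drops the
    # flagged rows instead of raising SchemaErrors.
    print(schema.validate(df, lazy=True))
    #    a
    # 0  1
    # 2  2
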
1 change: 1 addition & 0 deletions pandera/api/pandas/model_config.py
@@ -21,6 +21,7 @@ class BaseConfig(BaseModelConfig): # pylint:disable=R0903
title: Optional[str] = None #: human-readable label for schema
description: Optional[str] = None #: arbitrary textual description
coerce: bool = False #: coerce types of all schema components
drop_invalid: bool = False #: drop invalid rows on validation

#: make sure certain column combinations are unique
unique: Optional[Union[str, List[str]]] = None
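For the class-based API, the config flag maps onto the same behavior (see also commit 1f14cca). A sketch, with the attribute named as in this diff:

    import pandera as pa
    from pandera.typing import Series

    class Model(pa.DataFrameModel):
        a: Series[int] = pa.Field(ge=0)

        class Config:
            drop_invalid = True  # forwarded to the underlying DataFrameSchema
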
4 changes: 4 additions & 0 deletions pandera/backends/base/__init__.py
@@ -124,6 +124,10 @@ def failure_cases_metadata(
"""Get failure cases metadata for lazy validation."""
raise NotImplementedError

def drop_invalid_data(self, check_obj, error_handler):
"""Remove invalid elements in a `check_obj` according to failures in caught by the `error_handler`"""
raise NotImplementedError


class BaseCheckBackend(ABC):
"""Abstract base class for a check backend implementation."""
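The base class only declares the hook; concrete backends are expected to override it. A toy illustration of the pattern, using stand-in names rather than pandera's actual classes:

    class BaseBackendSketch:
        """Stand-in for the abstract backend above."""

        def drop_invalid_data(self, check_obj, error_handler):
            raise NotImplementedError

    class ListBackendSketch(BaseBackendSketch):
        """Concrete backend: drops positions flagged by the handler."""

        def drop_invalid_data(self, check_obj, error_handler):
            bad = set(error_handler)  # stand-in for collected failure indexes
            return [x for i, x in enumerate(check_obj) if i not in bad]

    print(ListBackendSketch().drop_invalid_data(["a", "b", "c"], {1}))
    # ['a', 'c']
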
51 changes: 41 additions & 10 deletions pandera/backends/pandas/array.py
@@ -55,6 +55,42 @@ def validate(
except SchemaError as exc:
error_handler.collect_error(exc.reason_code, exc)

# run the core checks
error_handler = self.run_checks_and_handle_errors(
error_handler,
schema,
check_obj,
head,
tail,
sample,
random_state,
)

if lazy and error_handler.collected_errors:
if hasattr(schema, "drop_invalid") and schema.drop_invalid:
(Contributor Author comment on the hasattr guard above: To guard against a repeat of #1188)

check_obj = self.drop_invalid_data(check_obj, error_handler)
return check_obj
else:
raise SchemaErrors(
schema=schema,
schema_errors=error_handler.collected_errors,
data=check_obj,
)

return check_obj

Contributor Author comment (kykyi, Jun 4, 2023): @cosmicBboy this method could be moved into the parent class to remove the duplication, but I'm not sure this would be the right move. They are quite different implementations, and I don't want to abstract it to the parent for some vain DRYness 😅 (edit: I will move drop_invalid_data into the parent though)

def run_checks_and_handle_errors(
self,
error_handler,
schema,
check_obj,
head,
tail,
sample,
random_state,
):
"""Run checks on schema"""
# pylint: disable=too-many-locals
field_obj_subsample = self.subsample(
check_obj if is_field(check_obj) else check_obj[schema.name],
head,
Expand All @@ -71,14 +107,15 @@ def validate(
random_state,
)

# run the core checks
for core_check, args in (
core_checks = [
(self.check_name, (field_obj_subsample, schema)),
(self.check_nullable, (field_obj_subsample, schema)),
(self.check_unique, (field_obj_subsample, schema)),
(self.check_dtype, (field_obj_subsample, schema)),
(self.run_checks, (check_obj_subsample, schema)),
):
]

for core_check, args in core_checks:
results = core_check(*args)
if isinstance(results, CoreCheckResult):
results = [results]
Expand Down Expand Up @@ -106,13 +143,7 @@ def validate(
original_exc=result.original_exc,
)

if lazy and error_handler.collected_errors:
raise SchemaErrors(
schema=schema,
schema_errors=error_handler.collected_errors,
data=check_obj,
)
return check_obj
return error_handler

def coerce_dtype(
self,
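Behaviorally, the branch added to validate() gives two lazy outcomes. A hedged sketch with the kwarg as named in this diff:

    import pandas as pd
    import pandera as pa

    s = pd.Series([1, -1, 2])

    # lazy=True with drop_invalid: failures are collected, rows dropped.
    dropping = pa.SeriesSchema(int, pa.Check.gt(0), drop_invalid=True)
    print(dropping.validate(s, lazy=True))  # the -1 element is removed

    # lazy=True without drop_invalid: collected errors are re-raised.
    strict = pa.SeriesSchema(int, pa.Check.gt(0))
    try:
        strict.validate(s, lazy=True)
    except pa.errors.SchemaErrors as exc:
        print(exc.failure_cases)
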
10 changes: 10 additions & 0 deletions pandera/backends/pandas/base.py
@@ -24,6 +24,7 @@
scalar_failure_case,
)
from pandera.errors import FailureCaseMetadata, SchemaError, SchemaErrorReason
from pandera.error_handlers import SchemaErrorHandler


class ColumnInfo(NamedTuple):
@@ -149,3 +150,12 @@ def failure_cases_metadata(
message=message,
error_counts=error_counts,
)

def drop_invalid_data(self, check_obj, error_handler: SchemaErrorHandler):
"""Remove invalid elements in a check obj according to failures in caught by the error handler."""
errors = error_handler.collected_errors
for err in errors:
check_obj = check_obj.loc[
~check_obj.index.isin(err.failure_cases["index"])
]
return check_obj
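
A standalone demonstration of the filtering above: each collected SchemaError carries a failure_cases frame whose "index" column points at the offending rows (the values below are made up for illustration):

    import pandas as pd

    check_obj = pd.DataFrame({"a": [1, -1, 2]})
    # Stand-in for err.failure_cases on a collected SchemaError.
    failure_cases = pd.DataFrame({"index": [1], "failure_case": [-1]})
    check_obj = check_obj.loc[~check_obj.index.isin(failure_cases["index"])]
    print(check_obj)
    #    a
    # 0  1
    # 2  2
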
99 changes: 58 additions & 41 deletions pandera/backends/pandas/components.py
@@ -1,4 +1,5 @@
"""Backend implementation for pandas schema components."""
# pylint: disable=too-many-locals

import traceback
from copy import copy, deepcopy
@@ -51,10 +52,10 @@ def validate(
"method.",
)

def validate_column(check_obj, column_name):
def validate_column(check_obj, column_name, return_check_obj=False):
try:
# pylint: disable=super-with-arguments
super(ColumnBackend, self).validate(
validated_check_obj = super(ColumnBackend, self).validate(
check_obj,
copy(schema).set_name(column_name),
head=head,
@@ -64,6 +65,10 @@ def validate_column(check_obj, column_name):
lazy=lazy,
inplace=inplace,
)

if return_check_obj:
return validated_check_obj

except SchemaErrors as err:
for err in err.schema_errors:
error_handler.collect_error(
@@ -95,14 +100,24 @@ def validate_column(check_obj, column_name):
check_obj[column_name].iloc[:, [i]], column_name
)
else:
validate_column(check_obj, column_name)
if hasattr(schema, "drop_invalid") and schema.drop_invalid:
# replace the check_obj with the validated check_obj
check_obj = validate_column(
check_obj, column_name, return_check_obj=True
)
else:
validate_column(check_obj, column_name)

if lazy and error_handler.collected_errors:
raise SchemaErrors(
schema=schema,
schema_errors=error_handler.collected_errors,
data=check_obj,
)
if hasattr(schema, "drop_invalid") and schema.drop_invalid:
check_obj = self.drop_invalid_data(check_obj, error_handler)
return check_obj
Collaborator comment: Weird, it seems like the test_drop_invalid_for_dataframe_schema test should cover this case, no? Is this a codecov bug?

Contributor Author comment: The hasattr? That was just some defensive code, as there was a recent issue with the new default attr not being available on pickled schemas. I can drop it if you think it is overkill.

Collaborator comment: No, it's okay to have that in there; it's just that codecov is complaining that this part of the code (line 113) wasn't executed during CI. But it seems like test_drop_invalid_for_dataframe_schema should have caused this code to run, no?

Contributor Author comment: Ah, no, it doesn't appear so locally! Codecov saving us 😆 This was my bad. So this code in this file:

            if is_table(check_obj[column_name]):
                for i in range(check_obj[column_name].shape[1]):
                    validate_column(
                        check_obj[column_name].iloc[:, [i]], column_name
                    )
            else:
                if hasattr(schema, "drop_invalid") and schema.drop_invalid:
                    # replace the check_obj with the validated check_obj
                    check_obj = validate_column(
                        check_obj, column_name, return_check_obj=True
                    )
                else:
                    validate_column(check_obj, column_name)

        if lazy and error_handler.collected_errors:
            if hasattr(schema, "drop_invalid") and schema.drop_invalid: # the line in question!
                check_obj = self.drop_invalid_data(check_obj, error_handler)
                return check_obj
            else:
                raise SchemaErrors(
                    schema=schema,
                    schema_errors=error_handler.collected_errors,
                    data=check_obj,
                )

raises an error in validate_column but does not rescue it; the rescue is done higher up in ArraySchemaBackend.validate(). This is now obvious, as there is no except between validate_column and if lazy and error_handler.collected_errors!

I've removed these lines and pushed the changes 👌

else:
raise SchemaErrors(
schema=schema,
schema_errors=error_handler.collected_errors,
data=check_obj,
)

return check_obj

@@ -381,16 +396,8 @@ def validate(
otherwise creates a copy of the data.
:returns: validated DataFrame or Series.
"""
# pylint: disable=too-many-locals
if schema.coerce:
try:
check_obj.index = self.coerce_dtype(
check_obj.index, schema=schema # type: ignore [arg-type]
)
except SchemaErrors as err:
if lazy:
raise
raise err.schema_errors[0] from err
check_obj.index = self.__coerce_index(check_obj, schema, lazy)

# Prevent data type coercion when the validate method is called because
# it leads to some weird behavior when calling coerce_dtype within the
@@ -419,32 +426,9 @@
):
columns[name] = column.set_name(name)
schema_copy.columns = columns

def to_dataframe(multiindex):
"""
Emulate the behavior of pandas.MultiIndex.to_frame, but preserve
duplicate index names if they exist.
"""
# NOTE: this is a hack to support pyspark.pandas
if type(multiindex).__module__.startswith("pyspark.pandas"):
df = multiindex.to_frame()
else:
df = pd.DataFrame(
{
i: multiindex.get_level_values(i)
for i in range(multiindex.nlevels)
}
)
df.columns = [
i if name is None else name
for i, name in enumerate(multiindex.names)
]
df.index = multiindex
return df

try:
validation_result = super().validate(
to_dataframe(check_obj.index),
self.__to_dataframe(check_obj.index),
schema_copy,
head=head,
tail=tail,
@@ -480,3 +464,36 @@ def to_dataframe(multiindex):

assert is_table(validation_result)
return check_obj

def __to_dataframe(self, multiindex):
"""
Emulate the behavior of pandas.MultiIndex.to_frame, but preserve
duplicate index names if they exist.
"""
# NOTE: this is a hack to support pyspark.pandas
if type(multiindex).__module__.startswith("pyspark.pandas"):
df = multiindex.to_frame()
else:
df = pd.DataFrame(
{
i: multiindex.get_level_values(i)
for i in range(multiindex.nlevels)
}
)
df.columns = [
i if name is None else name
for i, name in enumerate(multiindex.names)
]
df.index = multiindex
return df

def __coerce_index(self, check_obj, schema, lazy):
"""Coerce index"""
try:
return self.coerce_dtype(
check_obj.index, schema=schema # type: ignore [arg-type]
)
except SchemaErrors as err:
if lazy:
raise
raise err.schema_errors[0] from err