Skip to content

Commit d3f665d

Browse files
authored
Error on bad lines pyarrow (#45029)
1 parent: 52c5703 | commit: d3f665d

File tree

3 files changed, with 62 additions (+62) and 51 deletions (-51).

pandas/io/parsers/base_parser.py

Lines changed: 47 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -77,52 +77,6 @@
7777
)
7878
from pandas.io.date_converters import generic_parser
7979

80-
parser_defaults = {
81-
"delimiter": None,
82-
"escapechar": None,
83-
"quotechar": '"',
84-
"quoting": csv.QUOTE_MINIMAL,
85-
"doublequote": True,
86-
"skipinitialspace": False,
87-
"lineterminator": None,
88-
"header": "infer",
89-
"index_col": None,
90-
"names": None,
91-
"prefix": None,
92-
"skiprows": None,
93-
"skipfooter": 0,
94-
"nrows": None,
95-
"na_values": None,
96-
"keep_default_na": True,
97-
"true_values": None,
98-
"false_values": None,
99-
"converters": None,
100-
"dtype": None,
101-
"cache_dates": True,
102-
"thousands": None,
103-
"comment": None,
104-
"decimal": ".",
105-
# 'engine': 'c',
106-
"parse_dates": False,
107-
"keep_date_col": False,
108-
"dayfirst": False,
109-
"date_parser": None,
110-
"usecols": None,
111-
# 'iterator': False,
112-
"chunksize": None,
113-
"verbose": False,
114-
"encoding": None,
115-
"squeeze": None,
116-
"compression": None,
117-
"mangle_dupe_cols": True,
118-
"infer_datetime_format": False,
119-
"skip_blank_lines": True,
120-
"encoding_errors": "strict",
121-
"on_bad_lines": "error",
122-
"error_bad_lines": None,
123-
"warn_bad_lines": None,
124-
}
125-
12680

12781
class ParserBase:
12882
class BadLineHandleMethod(Enum):
@@ -1178,6 +1132,53 @@ def converter(*date_cols):
11781132
return converter
11791133

11801134

1135+
parser_defaults = {
1136+
"delimiter": None,
1137+
"escapechar": None,
1138+
"quotechar": '"',
1139+
"quoting": csv.QUOTE_MINIMAL,
1140+
"doublequote": True,
1141+
"skipinitialspace": False,
1142+
"lineterminator": None,
1143+
"header": "infer",
1144+
"index_col": None,
1145+
"names": None,
1146+
"prefix": None,
1147+
"skiprows": None,
1148+
"skipfooter": 0,
1149+
"nrows": None,
1150+
"na_values": None,
1151+
"keep_default_na": True,
1152+
"true_values": None,
1153+
"false_values": None,
1154+
"converters": None,
1155+
"dtype": None,
1156+
"cache_dates": True,
1157+
"thousands": None,
1158+
"comment": None,
1159+
"decimal": ".",
1160+
# 'engine': 'c',
1161+
"parse_dates": False,
1162+
"keep_date_col": False,
1163+
"dayfirst": False,
1164+
"date_parser": None,
1165+
"usecols": None,
1166+
# 'iterator': False,
1167+
"chunksize": None,
1168+
"verbose": False,
1169+
"encoding": None,
1170+
"squeeze": None,
1171+
"compression": None,
1172+
"mangle_dupe_cols": True,
1173+
"infer_datetime_format": False,
1174+
"skip_blank_lines": True,
1175+
"encoding_errors": "strict",
1176+
"on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
1177+
"error_bad_lines": None,
1178+
"warn_bad_lines": None,
1179+
}
1180+
1181+
11811182
def _process_date_conversion(
11821183
data_dict,
11831184
converter: Callable,

pandas/io/parsers/readers.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -434,10 +434,7 @@
434434
"dialect",
435435
"warn_bad_lines",
436436
"error_bad_lines",
437-
# TODO(1.4)
438-
# This doesn't error properly ATM, fix for release
439-
# but not blocker for initial PR
440-
# "on_bad_lines",
437+
"on_bad_lines",
441438
"delim_whitespace",
442439
"quoting",
443440
"lineterminator",
@@ -932,7 +929,18 @@ def _get_options_with_defaults(self, engine):
932929
engine == "pyarrow"
933930
and argname in _pyarrow_unsupported
934931
and value != default
932+
and value != getattr(value, "value", default)
935933
):
934+
if (
935+
argname == "on_bad_lines"
936+
and kwds.get("error_bad_lines") is not None
937+
):
938+
argname = "error_bad_lines"
939+
elif (
940+
argname == "on_bad_lines" and kwds.get("warn_bad_lines") is not None
941+
):
942+
argname = "warn_bad_lines"
943+
936944
raise ValueError(
937945
f"The {repr(argname)} option is not supported with the "
938946
f"'pyarrow' engine"

pandas/tests/io/parser/test_unsupported.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -140,10 +140,12 @@ def test_pyarrow_engine(self):
140140
f"supported with the 'pyarrow' engine"
141141
)
142142
kwargs = {default: object()}
143-
default_needs_bool = {"on_bad_lines", "error_bad_lines"}
143+
default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
144144
if default == "dialect":
145145
kwargs[default] = "excel" # test a random dialect
146146
elif default in default_needs_bool:
147147
kwargs[default] = True
148+
elif default == "on_bad_lines":
149+
kwargs[default] = "warn"
148150
with pytest.raises(ValueError, match=msg):
149151
read_csv(StringIO(data), engine="pyarrow", **kwargs)

0 commit comments

Comments
 (0)