Skip to content

Commit fb58d89

Browse files
GH-14755: [Python] Expose QuotingStyle to Python (#14722)
This exposes the QuotingStyle option to Python. I tested this with the v10 tag since I wasn't able to build the master branch for some reason. I'm happy to use enums instead of strings. Both seem to be used in Python and I wasn't sure which one was preferred. (no Jira ticket since I don't have an account and can't create one) Lead-authored-by: Frederick Jansen <frederick.jansen@gmail.com> Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 80295b0 commit fb58d89

File tree

3 files changed

+95
-2
lines changed

3 files changed

+95
-2
lines changed

python/pyarrow/_csv.pyx

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,30 @@ def open_csv(input_file, read_options=None, parse_options=None,
12751275
return reader
12761276

12771277

1278+
def _raise_invalid_function_option(value, description, *,
1279+
exception_class=ValueError):
1280+
raise exception_class(f"\"{value}\" is not a valid {description}")
1281+
1282+
1283+
cdef CQuotingStyle unwrap_quoting_style(quoting_style) except *:
    # Translate a Python-level quoting style name into the matching
    # CQuotingStyle enum member.  The accepted names are "needed",
    # "all_valid" and "none".
    if quoting_style == "needed":
        return CQuotingStyle_Needed
    if quoting_style == "all_valid":
        return CQuotingStyle_AllValid
    if quoting_style == "none":
        return CQuotingStyle_None
    # No accepted name matched: report the offending value.  The helper
    # always raises (ValueError unless told otherwise), and `except *`
    # in the signature lets that exception propagate to the caller.
    _raise_invalid_function_option(quoting_style, "quoting style")
1291+
1292+
1293+
cdef wrap_quoting_style(quoting_style):
    # Translate a CQuotingStyle enum member into its Python-level name,
    # the inverse of the mapping used when a quoting style string is
    # converted to the enum.
    if quoting_style == CQuotingStyle_Needed:
        return 'needed'
    if quoting_style == CQuotingStyle_AllValid:
        return 'all_valid'
    if quoting_style == CQuotingStyle_None:
        return 'none'
    # A value matching none of the three members yields None.
    return None
1300+
1301+
12781302
cdef class WriteOptions(_Weakrefable):
12791303
"""
12801304
Options for writing CSV files.
@@ -1288,20 +1312,31 @@ cdef class WriteOptions(_Weakrefable):
12881312
CSV data
12891313
delimiter : 1-character string, optional (default ",")
12901314
The character delimiting individual cells in the CSV data.
1315+
quoting_style : str, optional (default "needed")
1316+
Whether to quote values, and if so, which quoting style to use.
1317+
The following values are accepted:
1318+
1319+
- "needed" (default): only enclose values in quotes when needed.
1320+
- "all_valid": enclose all valid values in quotes; nulls are not quoted.
1321+
- "none": do not enclose any values in quotes; values containing
1322+
special characters (such as quotes, cell delimiters or line endings)
1323+
will raise an error.
12911324
"""
12921325

12931326
# Avoid mistakingly creating attributes
12941327
__slots__ = ()
12951328

12961329
def __init__(self, *, include_header=None, batch_size=None,
             delimiter=None, quoting_style=None):
    # Begin from the defaults supplied by CCSVWriteOptions.Defaults().
    self.options.reset(new CCSVWriteOptions(CCSVWriteOptions.Defaults()))
    # Every keyword that was explicitly supplied (i.e. is not None) is
    # assigned to the attribute of the same name, in declaration order;
    # keywords left as None keep the default value.
    overrides = (
        ("include_header", include_header),
        ("batch_size", batch_size),
        ("delimiter", delimiter),
        ("quoting_style", quoting_style),
    )
    for name, value in overrides:
        if value is not None:
            setattr(self, name, value)
13051340

13061341
@property
13071342
def include_header(self):
@@ -1337,6 +1372,24 @@ cdef class WriteOptions(_Weakrefable):
13371372
def delimiter(self, value):
13381373
deref(self.options).delimiter = _single_char(value)
13391374

1375+
@property
def quoting_style(self):
    """
    The quoting style applied to values when writing CSV data.

    This is one of the following strings:

    - "needed" (default): only enclose values in quotes when needed.
    - "all_valid": enclose all valid values in quotes; nulls are not
      quoted.
    - "none": do not enclose any values in quotes; values containing
      special characters (such as quotes, cell delimiters or line
      endings) will raise an error.
    """
    # Read the enum stored on the underlying options object, then
    # convert it to its string name.
    cdef CQuotingStyle style = deref(self.options).quoting_style
    return wrap_quoting_style(style)
1388+
1389+
@quoting_style.setter
def quoting_style(self, value):
    # Convert (and thereby validate) the string first; an unrecognised
    # value raises before the stored option is touched.
    cdef CQuotingStyle style = unwrap_quoting_style(value)
    deref(self.options).quoting_style = style
1392+
13401393
@staticmethod
13411394
cdef WriteOptions wrap(CCSVWriteOptions options):
13421395
out = WriteOptions()

python/pyarrow/includes/libarrow.pxd

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,6 +1706,11 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
17061706

17071707
cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
17081708

1709+
# Cython-side declaration of the C++ enum arrow::csv::QuotingStyle.
# Each member is bound to its fully qualified C++ name; the three
# members correspond to the "needed", "all_valid" and "none" strings
# accepted by the Python-level CSV write options.
ctypedef enum CQuotingStyle "arrow::csv::QuotingStyle":
    CQuotingStyle_Needed "arrow::csv::QuotingStyle::Needed"
    CQuotingStyle_AllValid "arrow::csv::QuotingStyle::AllValid"
    CQuotingStyle_None "arrow::csv::QuotingStyle::None"
1713+
17091714
cdef cppclass CCSVParseOptions" arrow::csv::ParseOptions":
17101715
unsigned char delimiter
17111716
c_bool quoting
@@ -1770,6 +1775,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" nogil:
17701775
c_bool include_header
17711776
int32_t batch_size
17721777
unsigned char delimiter
1778+
CQuotingStyle quoting_style
17731779
CIOContext io_context
17741780

17751781
CCSVWriteOptions()

python/pyarrow/tests/test_csv.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,8 @@ def test_write_options():
326326
opts = cls()
327327

328328
check_options_class(
329-
cls, include_header=[True, False], delimiter=[',', '\t', '|'])
329+
cls, include_header=[True, False], delimiter=[',', '\t', '|'],
330+
quoting_style=['needed', 'none', 'all_valid'])
330331

331332
assert opts.batch_size > 0
332333
opts.batch_size = 12345
@@ -1908,6 +1909,39 @@ def test_write_read_round_trip():
19081909
parse_options=parse_options)
19091910

19101911

1912+
def test_write_quoting_style():
    """
    Check the bytes written by CSVWriter under each `quoting_style`.

    The first part writes a table containing nulls with every quoting
    style and compares the exact output.  The second part writes values
    containing special characters: "needed" must quote/escape them,
    while "none" must fail with ArrowInvalid.
    """
    t = pa.Table.from_arrays([[1, 2, None], ["a", None, "c"]], ["c1", "c2"])
    for write_options, res in [
        (WriteOptions(quoting_style='none'), b'"c1","c2"\n1,a\n2,\n,c\n'),
        (WriteOptions(), b'"c1","c2"\n1,"a"\n2,\n,"c"\n'),
        (WriteOptions(quoting_style='all_valid'),
         b'"c1","c2"\n"1","a"\n"2",\n,"c"\n'),
    ]:
        # Use a fresh buffer for every case.  Rewinding a shared buffer
        # with seek(0) does not truncate it, so bytes left over from a
        # longer earlier output would show up in getvalue() and the
        # check would only pass for a lucky ordering of the cases.
        buf = io.BytesIO()
        with CSVWriter(buf, t.schema, write_options=write_options) as writer:
            writer.write_table(t)
        assert buf.getvalue() == res

    # Test writing special characters with different quoting styles
    t = pa.Table.from_arrays([[",", "\""]], ["c1"])

    # "needed": the comma is quoted and the quote character is doubled.
    buf = io.BytesIO()
    write_options = WriteOptions(quoting_style='needed')
    with CSVWriter(buf, t.schema, write_options=write_options) as writer:
        writer.write_table(t)
    assert buf.getvalue() == b'"c1"\n","\n""""\n'

    # "none": writing a comma (,) without quotes is invalid, so the
    # write itself must raise.  Only the expected exception is caught;
    # anything else propagates and fails the test with its own traceback.
    buf = io.BytesIO()
    write_options = WriteOptions(quoting_style='none')
    raised = False
    with CSVWriter(buf, t.schema, write_options=write_options) as writer:
        try:
            writer.write_table(t)
        except pa.lib.ArrowInvalid:
            raised = True
    assert raised, "expected ArrowInvalid when quoting_style='none'"
1943+
1944+
19111945
def test_read_csv_reference_cycle():
19121946
# ARROW-13187
19131947
def inner():

0 commit comments

Comments
 (0)