Skip to content

Commit 1a8c9a4

Browse files
cpcloud
authored, wesm committed
ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table
Author: Phillip Cloud <cpcloud@gmail.com> Author: Licht-T <licht-t@outlook.jp> Author: Wes McKinney <wes.mckinney@twosigma.com> Author: Simba Nyatsanga <simnyatsanga@gmail.com> Closes #1553 from cpcloud/ARROW-1976 and squashes the following commits: d8793f7 [Wes McKinney] Fix flakes 77cd95b [Phillip Cloud] No need for additional function call 4f71b62 [Phillip Cloud] Make sure it's actually binary 6f8ad84 [Phillip Cloud] Fix binary on python3 e8d4154 [Phillip Cloud] Use _column_name_to_strings 210607f [Phillip Cloud] Add tests 40910cb [Phillip Cloud] Revert "Fix compat by using text_type" b098d8a [Phillip Cloud] Fix compat by using text_type a52f5c7 [Phillip Cloud] Revert 8773fad [Phillip Cloud] Ignore pytest cache db6176c [Simba Nyatsanga] Not using str with frombytes to ensure Python3 tests pass. e9385c7 [Licht-T] BUG: Convert str by frombytes on pandas_compat.py 17f28b1 [Licht-T] TST: Add tests for Pandas data SerDe with Unicode column names 85c1231 [Licht-T] BUG: Fix Pandas data SerDe with Unicode column names in Python 2.7
1 parent 0d02a7d commit 1a8c9a4

File tree

4 files changed

+43
-9
lines changed

4 files changed

+43
-9
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ cpp/.idea/
2929
python/.eggs/
3030
.vscode
3131
.idea/
32+
.pytest_cache/

python/pyarrow/pandas_compat.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
170170
)
171171
)
172172

173+
assert field_name is None or isinstance(field_name, six.string_types), \
174+
str(type(field_name))
173175
return {
174176
'name': name,
175-
'field_name': str(field_name),
177+
'field_name': 'None' if field_name is None else field_name,
176178
'pandas_type': logical_type,
177179
'numpy_type': string_dtype,
178180
'metadata': extra_metadata,
@@ -279,8 +281,11 @@ def _column_name_to_strings(name):
279281
"""
280282
if isinstance(name, six.string_types):
281283
return name
284+
elif isinstance(name, six.binary_type):
285+
# XXX: should we assume that bytes in Python 3 are UTF-8?
286+
return name.decode('utf8')
282287
elif isinstance(name, tuple):
283-
return tuple(map(_column_name_to_strings, name))
288+
return str(tuple(map(_column_name_to_strings, name)))
284289
elif isinstance(name, collections.Sequence):
285290
raise TypeError("Unsupported type for MultiIndex level")
286291
elif name is None:
@@ -327,10 +332,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
327332

328333
for name in df.columns:
329334
col = df[name]
330-
if not isinstance(name, six.string_types):
331-
name = _column_name_to_strings(name)
332-
if name is not None:
333-
name = str(name)
335+
name = _column_name_to_strings(name)
334336

335337
if schema is not None:
336338
field = schema.field_by_name(name)
@@ -561,7 +563,8 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1,
561563
column_strings = [x.name for x in block_table.itercolumns()]
562564
if columns:
563565
columns_name_dict = {
564-
c.get('field_name', str(c['name'])): c['name'] for c in columns
566+
c.get('field_name', _column_name_to_strings(c['name'])): c['name']
567+
for c in columns
565568
}
566569
columns_values = [
567570
columns_name_dict.get(name, name) for name in column_strings

python/pyarrow/parquet.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
939939
coerce_timestamps=None,
940940
flavor=None, **kwargs):
941941
row_group_size = kwargs.pop('chunk_size', row_group_size)
942-
942+
use_int96 = use_deprecated_int96_timestamps
943943
try:
944944
with ParquetWriter(
945945
where, table.schema,
@@ -948,7 +948,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
948948
use_dictionary=use_dictionary,
949949
coerce_timestamps=coerce_timestamps,
950950
compression=compression,
951-
use_deprecated_int96_timestamps= use_deprecated_int96_timestamps, # noqa
951+
use_deprecated_int96_timestamps=use_int96,
952952
**kwargs) as writer:
953953
writer.write_table(table, row_group_size=row_group_size)
954954
except Exception:

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ def test_multiindex_columns_with_dtypes(self):
156156
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
157157
_check_pandas_roundtrip(df, preserve_index=True)
158158

159+
def test_multiindex_columns_unicode(self):
160+
columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
161+
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
162+
_check_pandas_roundtrip(df, preserve_index=True)
163+
159164
def test_integer_index_column(self):
160165
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
161166
_check_pandas_roundtrip(df, preserve_index=True)
@@ -519,6 +524,31 @@ def test_unicode(self):
519524

520525
_check_pandas_roundtrip(df, expected_schema=schema)
521526

527+
def test_unicode_with_unicode_column_and_index(self):
528+
df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
529+
530+
_check_pandas_roundtrip(df, preserve_index=True)
531+
532+
def test_mixed_unicode_column_names(self):
533+
df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
534+
535+
# TODO(phillipc): Should this raise?
536+
with pytest.raises(AssertionError):
537+
_check_pandas_roundtrip(df, preserve_index=True)
538+
539+
def test_binary_column_name(self):
540+
column_data = [u'い']
541+
data = {u'あ'.encode('utf8'): column_data}
542+
df = pd.DataFrame(data)
543+
544+
# we can't use _check_pandas_roundtrip here because our metadata
545+
# is always decoded as utf8: even if binary goes in, utf8 comes out
546+
t = pa.Table.from_pandas(df, preserve_index=True)
547+
df2 = t.to_pandas()
548+
assert df.values[0] == df2.values[0]
549+
assert df.index.values[0] == df2.index.values[0]
550+
assert df.columns[0] == df2.columns[0].encode('utf8')
551+
522552
def test_bytes_to_binary(self):
523553
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
524554
df = pd.DataFrame({'strings': values})

0 commit comments

Comments
 (0)