Skip to content

Commit 1a8c9a4

Browse files
cpcloud
authored, wesm committed
ARROW-1976: [Python] Handling unicode pandas columns on parquet.read_table
Author: Phillip Cloud <cpcloud@gmail.com> Author: Licht-T <licht-t@outlook.jp> Author: Wes McKinney <wes.mckinney@twosigma.com> Author: Simba Nyatsanga <simnyatsanga@gmail.com> Closes #1553 from cpcloud/ARROW-1976 and squashes the following commits: d8793f7 [Wes McKinney] Fix flakes 77cd95b [Phillip Cloud] No need for additional function call 4f71b62 [Phillip Cloud] Make sure it's actually binary 6f8ad84 [Phillip Cloud] Fix binary on python3 e8d4154 [Phillip Cloud] Use _column_name_to_strings 210607f [Phillip Cloud] Add tests 40910cb [Phillip Cloud] Revert "Fix compat by using text_type" b098d8a [Phillip Cloud] Fix compat by using text_type a52f5c7 [Phillip Cloud] Revert 8773fad [Phillip Cloud] Ignore pytest cache db6176c [Simba Nyatsanga] Not using str with frombytes to ensure Python3 tests pass. e9385c7 [Licht-T] BUG: Convert str by frombytes on pandas_compat.py 17f28b1 [Licht-T] TST: Add tests for Pandas data SerDe with Unicode column names 85c1231 [Licht-T] BUG: Fix Pandas data SerDe with Unicode column names in Python 2.7
1 parent 0d02a7d commit 1a8c9a4

File tree

4 files changed

+43
-9
lines changed

4 files changed

+43
-9
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ cpp/.idea/
2929
python/.eggs/
3030
.vscode
3131
.idea/
32+
.pytest_cache/

python/pyarrow/pandas_compat.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,11 @@ def get_column_metadata(column, name, arrow_type, field_name):
170170
)
171171
)
172172

173+
assert field_name is None or isinstance(field_name, six.string_types), \
174+
str(type(field_name))
173175
return {
174176
'name': name,
175-
'field_name': str(field_name),
177+
'field_name': 'None' if field_name is None else field_name,
176178
'pandas_type': logical_type,
177179
'numpy_type': string_dtype,
178180
'metadata': extra_metadata,
@@ -279,8 +281,11 @@ def _column_name_to_strings(name):
279281
"""
280282
if isinstance(name, six.string_types):
281283
return name
284+
elif isinstance(name, six.binary_type):
285+
# XXX: should we assume that bytes in Python 3 are UTF-8?
286+
return name.decode('utf8')
282287
elif isinstance(name, tuple):
283-
return tuple(map(_column_name_to_strings, name))
288+
return str(tuple(map(_column_name_to_strings, name)))
284289
elif isinstance(name, collections.Sequence):
285290
raise TypeError("Unsupported type for MultiIndex level")
286291
elif name is None:
@@ -327,10 +332,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
327332

328333
for name in df.columns:
329334
col = df[name]
330-
if not isinstance(name, six.string_types):
331-
name = _column_name_to_strings(name)
332-
if name is not None:
333-
name = str(name)
335+
name = _column_name_to_strings(name)
334336

335337
if schema is not None:
336338
field = schema.field_by_name(name)
@@ -561,7 +563,8 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1,
561563
column_strings = [x.name for x in block_table.itercolumns()]
562564
if columns:
563565
columns_name_dict = {
564-
c.get('field_name', str(c['name'])): c['name'] for c in columns
566+
c.get('field_name', _column_name_to_strings(c['name'])): c['name']
567+
for c in columns
565568
}
566569
columns_values = [
567570
columns_name_dict.get(name, name) for name in column_strings

python/pyarrow/parquet.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
939939
coerce_timestamps=None,
940940
flavor=None, **kwargs):
941941
row_group_size = kwargs.pop('chunk_size', row_group_size)
942-
942+
use_int96 = use_deprecated_int96_timestamps
943943
try:
944944
with ParquetWriter(
945945
where, table.schema,
@@ -948,7 +948,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
948948
use_dictionary=use_dictionary,
949949
coerce_timestamps=coerce_timestamps,
950950
compression=compression,
951-
use_deprecated_int96_timestamps= use_deprecated_int96_timestamps, # noqa
951+
use_deprecated_int96_timestamps=use_int96,
952952
**kwargs) as writer:
953953
writer.write_table(table, row_group_size=row_group_size)
954954
except Exception:

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ def test_multiindex_columns_with_dtypes(self):
156156
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
157157
_check_pandas_roundtrip(df, preserve_index=True)
158158

159+
def test_multiindex_columns_unicode(self):
160+
columns = pd.MultiIndex.from_arrays([[u'あ', u'い'], ['X', 'Y']])
161+
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
162+
_check_pandas_roundtrip(df, preserve_index=True)
163+
159164
def test_integer_index_column(self):
160165
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
161166
_check_pandas_roundtrip(df, preserve_index=True)
@@ -519,6 +524,31 @@ def test_unicode(self):
519524

520525
_check_pandas_roundtrip(df, expected_schema=schema)
521526

527+
def test_unicode_with_unicode_column_and_index(self):
528+
df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
529+
530+
_check_pandas_roundtrip(df, preserve_index=True)
531+
532+
def test_mixed_unicode_column_names(self):
533+
df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
534+
535+
# TODO(phillipc): Should this raise?
536+
with pytest.raises(AssertionError):
537+
_check_pandas_roundtrip(df, preserve_index=True)
538+
539+
def test_binary_column_name(self):
540+
column_data = [u'い']
541+
data = {u'あ'.encode('utf8'): column_data}
542+
df = pd.DataFrame(data)
543+
544+
# we can't use _check_pandas_roundtrip here because our metadata
545+
# is always decoded as utf8: even if binary goes in, utf8 comes out
546+
t = pa.Table.from_pandas(df, preserve_index=True)
547+
df2 = t.to_pandas()
548+
assert df.values[0] == df2.values[0]
549+
assert df.index.values[0] == df2.index.values[0]
550+
assert df.columns[0] == df2.columns[0].encode('utf8')
551+
522552
def test_bytes_to_binary(self):
523553
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
524554
df = pd.DataFrame({'strings': values})

0 commit comments

Comments
 (0)