palantir · rshkv · May 7, 2020 · Apr 20, 2020 · May 2, 2020 · May 2, 2020
diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py
@@ -532,7 +532,8 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
         """
         from distutils.version import LooseVersion
         from pyspark.serializers import ArrowStreamPandasSerializer
-        from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
+        from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType, \
+            _infer_binary_columns_as_arrow_string
         from pyspark.sql.utils import require_minimum_pandas_version, \
             require_minimum_pyarrow_version
 
@@ -549,6 +550,11 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
                 arrow_schema = temp_batch.schema
             else:
                 arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
+
+            # TODO(rshkv): Remove when we stop supporting Python 2 (#678)
+            if sys.version < '3' and LooseVersion(pa.__version__) >= LooseVersion("0.10.0"):
+                arrow_schema = _infer_binary_columns_as_arrow_string(arrow_schema, pdf)
+
             struct = StructType()
             for name, field in zip(schema, arrow_schema):
                 struct.add(name, from_arrow_type(field.type), nullable=field.nullable)

diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
@@ -351,6 +351,29 @@ def test_createDataFrame_with_int_col_names(self):
         self.assertEqual(pdf_col_names, df.columns)
         self.assertEqual(pdf_col_names, df_arrow.columns)
 
+    def test_createDataFrame_with_str_col(self):
+        """Both results of _createDataFrame_toggle must share one schema for a str column."""
+        import pandas as pd
+        str_frame = pd.DataFrame({"a": ["x"]})
+        plain_df, arrow_df = self._createDataFrame_toggle(str_frame)
+        self.assertEqual(plain_df.schema, arrow_df.schema)
+
+    def test_createDataFrame_with_str_array_col(self):
+        """With Arrow fallback enabled, a list-of-str column must give matching schemas."""
+        import pandas as pd
+        nested_frame = pd.DataFrame({"a": [["x"]]})
+        with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):
+            plain_df, arrow_df = self._createDataFrame_toggle(nested_frame)
+            self.assertEqual(plain_df.schema, arrow_df.schema)
+
+    def test_createDataFrame_with_str_struct_col(self):
+        """With Arrow fallback enabled, a column of str-valued dicts must give matching schemas."""
+        import pandas as pd
+        dict_frame = pd.DataFrame({"a": [{"x": "x"}]})
+        with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):
+            plain_df, arrow_df = self._createDataFrame_toggle(dict_frame)
+            self.assertEqual(plain_df.schema, arrow_df.schema)
+
     def test_createDataFrame_fallback_enabled(self):
         with QuietTest(self.sc):
             with self.sql_conf({"spark.sql.execution.arrow.fallback.enabled": True}):

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
@@ -1676,6 +1676,12 @@ def from_arrow_type(at):
     elif types.is_list(at):
         if types.is_timestamp(at.value_type):
             raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
+
+        # TODO(rshkv): Support binary type when we move off Python 2 (#678)
+        if sys.version < '3' and types.is_binary(at.value_type):
+            raise TypeError("Unsupported type in conversion from Arrow: " + str(at) +
+                            "\nPlease use Python3 for support of BinaryType in arrays.")
+
         spark_type = ArrayType(from_arrow_type(at.value_type))
     elif types.is_struct(at):
         # TODO: remove version check once minimum pyarrow version is 0.10.0
@@ -1748,6 +1754,19 @@ def _arrow_table_to_pandas(table, schema):
         return table.to_pandas(date_as_object=True)
 
 
+def _infer_binary_columns_as_arrow_string(schema, pandas_df):
+    """Return `schema` with every binary field retyped to Arrow string when pandas infers the
+    same-position column of `pandas_df` as "string"; nullability and metadata are preserved."""
+    import pandas as pd
+    import pyarrow as pa
+    for field_index, field in enumerate(schema):
+        if field.type == pa.binary() and \
+                pd.api.types.infer_dtype(pandas_df.iloc[:, field_index]) == "string":
+            schema = schema.set(field_index, pa.field(
+                field.name, pa.string(), nullable=field.nullable, metadata=field.metadata))
+    return schema
+
+
 def _get_local_timezone():
     """ Get local timezone using pytz with environment variable, or dateutil.