Skip to content

Commit f958f27

Browse files
author
Davies Liu
committed
[SPARK-8766] support non-ascii character in column names
Use UTF-8 to encode the names of columns in Python 2, or they may fail to encode with the default encoding ('ascii'). This PR also fixes a bug when there is a Java exception without an error message. Author: Davies Liu <davies@databricks.com> Closes apache#7165 from davies/non_ascii and squashes the following commits: 02cb61a [Davies Liu] fix tests 3b09d31 [Davies Liu] add encoding in header 867754a [Davies Liu] support non-ascii character in column names
1 parent 1ce6428 commit f958f27

File tree

4 files changed

+15
-5
lines changed

4 files changed

+15
-5
lines changed

python/pyspark/sql/dataframe.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,13 +484,12 @@ def dtypes(self):
484484
return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]
485485

486486
@property
487-
@ignore_unicode_prefix
488487
@since(1.3)
489488
def columns(self):
490489
"""Returns all column names as a list.
491490
492491
>>> df.columns
493-
[u'age', u'name']
492+
['age', 'name']
494493
"""
495494
return [f.name for f in self.schema.fields]
496495

python/pyspark/sql/tests.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- encoding: utf-8 -*-
12
#
23
# Licensed to the Apache Software Foundation (ASF) under one or more
34
# contributor license agreements. See the NOTICE file distributed with
@@ -628,6 +629,14 @@ def test_access_column(self):
628629
self.assertRaises(IndexError, lambda: df["bad_key"])
629630
self.assertRaises(TypeError, lambda: df[{}])
630631

632+
def test_column_name_with_non_ascii(self):
633+
df = self.sqlCtx.createDataFrame([(1,)], ["数量"])
634+
self.assertEqual(StructType([StructField("数量", LongType(), True)]), df.schema)
635+
self.assertEqual("DataFrame[数量: bigint]", str(df))
636+
self.assertEqual([("数量", 'bigint')], df.dtypes)
637+
self.assertEqual(1, df.select("数量").first()[0])
638+
self.assertEqual(1, df.select(df["数量"]).first()[0])
639+
631640
def test_access_nested_types(self):
632641
df = self.sc.parallelize([Row(l=[1], r=Row(a=1, b="b"), d={"k": "v"})]).toDF()
633642
self.assertEqual(1, df.select(df.l[0]).first()[0])

python/pyspark/sql/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,8 @@ def __init__(self, name, dataType, nullable=True, metadata=None):
324324
False
325325
"""
326326
assert isinstance(dataType, DataType), "dataType should be DataType"
327+
if not isinstance(name, str):
328+
name = name.encode('utf-8')
327329
self.name = name
328330
self.dataType = dataType
329331
self.nullable = nullable

python/pyspark/sql/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ def deco(*a, **kw):
2929
try:
3030
return f(*a, **kw)
3131
except py4j.protocol.Py4JJavaError as e:
32-
cls, msg = e.java_exception.toString().split(': ', 1)
33-
if cls == 'org.apache.spark.sql.AnalysisException':
34-
raise AnalysisException(msg)
32+
s = e.java_exception.toString()
33+
if s.startswith('org.apache.spark.sql.AnalysisException: '):
34+
raise AnalysisException(s.split(': ', 1)[1])
3535
raise
3636
return deco
3737

0 commit comments

Comments
 (0)