
Commit 84aec3e

Author: 云峤
Update python dataFrame show test and add empty df unit test.
1 parent 159b3d5

File tree

2 files changed, +83 -38 lines changed


python/pyspark/sql/dataframe.py

Lines changed: 69 additions & 36 deletions
@@ -275,9 +275,12 @@ def show(self, n=20):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))

@@ -591,12 +594,15 @@ def describe(self, *cols):
         given, this function computes statistics for all numerical columns.

         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
         :param subset: optional list of column names to consider.

         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+

         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
         then the non-string column is simply ignored.

         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+

         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+

         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")
@@ -1220,11 +1241,17 @@ def getItem(self, key):

         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]

@@ -1234,11 +1261,17 @@ def getField(self, name):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))
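All of the updated doctest tables above share one grid layout: borders derived from per-column widths, a header row, and right-justified cells. The actual rendering happens JVM-side through self._jdf.showString(n); the helper below is only a minimal, hypothetical Python sketch of that format (show_string is an illustrative name, not a Spark API):

    # Hypothetical sketch of the grid format, not Spark's JVM-side showString.
    # Each column is as wide as its longest value (header included), and every
    # cell is right-justified within that width.
    def show_string(cols, rows):
        data = [[str(v) for v in row] for row in rows]
        widths = [max(len(x) for x in [c] + [r[i] for r in data])
                  for i, c in enumerate(cols)]
        def fmt(cells):
            return "|" + "|".join(c.rjust(w) for c, w in zip(cells, widths)) + "|"
        border = "+" + "+".join("-" * w for w in widths) + "+"
        return "\n".join([border, fmt(cols), border]
                         + [fmt(r) for r in data] + [border]) + "\n"

    print(show_string(["age", "name"], [(2, "Alice"), (5, "Bob")]))
    # +---+-----+
    # |age| name|
    # +---+-----+
    # |  2|Alice|
    # |  5|  Bob|
    # +---+-----+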

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 14 additions & 2 deletions
@@ -599,8 +599,20 @@ class DataFrameSuite extends QueryTest {
   }

   test("SPARK-7319 showString") {
-    assert(testData.select($"*").showString(1).split("\n") === Seq("+---+-----+",
-      "|key|value|", "+---+-----+", "|  1|    1|", "+---+-----+"))
+    assert(testData.select($"*").showString(1) === """+---+-----+
+                                                     ||key|value|
+                                                     |+---+-----+
+                                                     ||  1|    1|
+                                                     |+---+-----+
+                                                     |""".stripMargin)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === """+---+-----+
+                                                                        ||key|value|
+                                                                        |+---+-----+
+                                                                        |+---+-----+
+                                                                        |""".stripMargin)
   }

   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
