Commit f32e69e

Authored by 云峤 and committed by rxin
[SPARK-7319][SQL] Improve the output from DataFrame.show()
Author: 云峤 <chensong.cs@alibaba-inc.com>

Closes apache#5865 from kaka1992/df.show and squashes the following commits:

c79204b [云峤] Update
a1338f6 [云峤] Update python dataFrame show test and add empty df unit test.
734369c [云峤] Update python dataFrame show test and add empty df unit test.
84aec3e [云峤] Update python dataFrame show test and add empty df unit test.
159b3d5 [云峤] update
03ef434 [云峤] update
7394fd5 [云峤] update test show
ced487a [云峤] update pep8
b6e690b [云峤] Merge remote-tracking branch 'upstream/master' into df.show
30ac311 [云峤] [SPARK-7294] ADD BETWEEN
7d62368 [云峤] [SPARK-7294] ADD BETWEEN
baf839b [云峤] [SPARK-7294] ADD BETWEEN
d11d5b9 [云峤] [SPARK-7294] ADD BETWEEN
1 parent e0833c5 commit f32e69e
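
As the updated doctests below illustrate, show() switches from bare whitespace-padded columns to a framed, right-aligned grid (example taken directly from the dataframe.py diff):

Before:

age name
2   Alice
5   Bob

After:

+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+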

File tree

5 files changed: +112 −44 lines


R/pkg/R/DataFrame.R

Lines changed: 1 addition & 1 deletion

@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })

 #' show

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 1 addition & 1 deletion

@@ -641,7 +641,7 @@ test_that("toJSON() returns an RDD of the correct values", {

 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "age  name   \nnull Michael\n30   Andy   \n19   Justin ")
+  expect_output(showDF(df), "+----+-------+\n| age|   name|\n+----+-------+\n|null|Michael|\n|  30|   Andy|\n|  19| Justin|\n+----+-------+\n")
 })

 test_that("isLocal()", {

python/pyspark/sql/dataframe.py

Lines changed: 69 additions & 36 deletions

@@ -275,9 +275,12 @@ def show(self, n=20):
         >>> df
         DataFrame[age: int, name: string]
         >>> df.show()
-        age name
-        2   Alice
-        5   Bob
+        +---+-----+
+        |age| name|
+        +---+-----+
+        |  2|Alice|
+        |  5|  Bob|
+        +---+-----+
         """
         print(self._jdf.showString(n))

@@ -591,12 +594,15 @@ def describe(self, *cols):
         given, this function computes statistics for all numerical columns.

         >>> df.describe().show()
-        summary age
-        count   2
-        mean    3.5
-        stddev  1.5
-        min     2
-        max     5
+        +-------+---+
+        |summary|age|
+        +-------+---+
+        |  count|  2|
+        |   mean|3.5|
+        | stddev|1.5|
+        |    min|  2|
+        |    max|  5|
+        +-------+---+
         """
         jdf = self._jdf.describe(self._jseq(cols))
         return DataFrame(jdf, self.sql_ctx)

@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
         :param subset: optional list of column names to consider.

         >>> df4.dropna().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+

         >>> df4.na.drop().show()
-        age height name
-        10  80     Alice
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        +---+------+-----+
         """
         if how is not None and how not in ['any', 'all']:
             raise ValueError("how ('" + how + "') should be 'any' or 'all'")

@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
         then the non-string column is simply ignored.

         >>> df4.fillna(50).show()
-        age height name
-        10  80     Alice
-        5   50     Bob
-        50  50     Tom
-        50  50     null
+        +---+------+-----+
+        |age|height| name|
+        +---+------+-----+
+        | 10|    80|Alice|
+        |  5|    50|  Bob|
+        | 50|    50|  Tom|
+        | 50|    50| null|
+        +---+------+-----+

         >>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+

         >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
-        age height name
-        10  80     Alice
-        5   null   Bob
-        50  null   Tom
-        50  null   unknown
+        +---+------+-------+
+        |age|height|   name|
+        +---+------+-------+
+        | 10|    80|  Alice|
+        |  5|  null|    Bob|
+        | 50|  null|    Tom|
+        | 50|  null|unknown|
+        +---+------+-------+
         """
         if not isinstance(value, (float, int, long, basestring, dict)):
             raise ValueError("value should be a float, int, long, string, or dict")

@@ -1241,11 +1262,17 @@ def getItem(self, key):

         >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
         >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         >>> df.select(df.l[0], df.d["key"]).show()
-        l[0] d[key]
-        1    value
+        +----+------+
+        |l[0]|d[key]|
+        +----+------+
+        |   1| value|
+        +----+------+
         """
         return self[key]

@@ -1255,11 +1282,17 @@ def getField(self, name):
         >>> from pyspark.sql import Row
         >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
         >>> df.select(df.r.getField("b")).show()
-        r.b
-        b
+        +---+
+        |r.b|
+        +---+
+        |  b|
+        +---+
         >>> df.select(df.r.a).show()
-        r.a
-        1
+        +---+
+        |r.a|
+        +---+
+        |  1|
+        +---+
         """
         return Column(self._jc.getField(name))

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 22 additions & 6 deletions

@@ -20,6 +20,7 @@ package org.apache.spark.sql
 import java.io.CharArrayWriter
 import java.sql.DriverManager

+
 import scala.collection.JavaConversions._
 import scala.language.implicitConversions
 import scala.reflect.ClassTag
@@ -28,6 +29,7 @@ import scala.util.control.NonFatal

 import com.fasterxml.jackson.core.JsonFactory

+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.SerDeUtil
@@ -175,6 +177,7 @@ class DataFrame private[sql](
    * @param numRows Number of rows to show
    */
   private[sql] def showString(numRows: Int): String = {
+    val sb = new StringBuilder
     val data = take(numRows)
     val numCols = schema.fieldNames.length

@@ -194,12 +197,25 @@ class DataFrame private[sql](
       }
     }

-    // Pad the cells
-    rows.map { row =>
-      row.zipWithIndex.map { case (cell, i) =>
-        String.format(s"%-${colWidths(i)}s", cell)
-      }.mkString(" ")
-    }.mkString("\n")
+    // Create SeparateLine
+    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()
+
+    // column names
+    rows.head.zipWithIndex.map { case (cell, i) =>
+      StringUtils.leftPad(cell.toString, colWidths(i))
+    }.addString(sb, "|", "|", "|\n")
+
+    sb.append(sep)
+
+    // data
+    rows.tail.map {
+      _.zipWithIndex.map { case (cell, i) =>
+        StringUtils.leftPad(cell.toString, colWidths(i))
+      }.addString(sb, "|", "|", "|\n")
+    }
+
+    sb.append(sep)
+    sb.toString()
   }

   override def toString: String = {
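
To see the rendering technique in isolation: compute each column's width as its widest cell, build a "+---+---+" separator from those widths, and right-align every cell with StringUtils.leftPad. Below is a minimal standalone sketch of that idea; the render helper and its sample data are illustrative only and not part of the commit (the real showString also truncates long cells and takes the header row from the schema):

import org.apache.commons.lang3.StringUtils

object ShowStringSketch {
  // rows.head is the header row; every row must have the same arity.
  def render(rows: Seq[Seq[String]]): String = {
    val numCols = rows.head.length
    // Width of each column = widest cell in that column.
    val colWidths = (0 until numCols).map(i => rows.map(_(i).length).max)
    // Separator line, e.g. "+---+-----+".
    val sep = colWidths.map("-" * _).mkString("+", "+", "+\n")
    // Right-align each cell to its column width and frame the row with "|".
    def line(row: Seq[String]): String =
      row.zipWithIndex
        .map { case (cell, i) => StringUtils.leftPad(cell, colWidths(i)) }
        .mkString("|", "|", "|\n")
    // separator, header, separator, data rows, separator
    sep + line(rows.head) + sep + rows.tail.map(line).mkString + sep
  }

  def main(args: Array[String]): Unit =
    print(render(Seq(Seq("age", "name"), Seq("2", "Alice"), Seq("5", "Bob"))))
}

Running this prints exactly the table from the updated show() doctest above:

+---+-----+
|age| name|
+---+-----+
|  2|Alice|
|  5|  Bob|
+---+-----+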

sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala

Lines changed: 19 additions & 0 deletions

@@ -598,6 +598,25 @@ class DataFrameSuite extends QueryTest {
     testData.select($"*").show(1000)
   }

+  test("SPARK-7319 showString") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           ||  1|    1|
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").showString(1) === expectedAnswer)
+  }
+
+  test("SPARK-7327 show with empty dataFrame") {
+    val expectedAnswer = """+---+-----+
+                           ||key|value|
+                           |+---+-----+
+                           |+---+-----+
+                           |""".stripMargin
+    assert(testData.select($"*").filter($"key" < 0).showString(1) === expectedAnswer)
+  }
+
   test("createDataFrame(RDD[Row], StructType) should convert UDTs (SPARK-6672)") {
     val rowRDD = TestSQLContext.sparkContext.parallelize(Seq(Row(new ExamplePoint(1.0, 2.0))))
     val schema = StructType(Array(StructField("point", new ExamplePointUDT(), false)))
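
The SPARK-7327 case falls out of the algorithm naturally: rows always carries the schema field names as its head, so for an empty DataFrame rows.tail is empty and showString still emits the top separator, the header row, and then two separators back to back, which is exactly the expected string in the test above.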
