@@ -275,9 +275,12 @@ def show(self, n=20):
275
275
>>> df
276
276
DataFrame[age: int, name: string]
277
277
>>> df.show()
278
- age name
279
- 2 Alice
280
- 5 Bob
278
+ +---+-----+
279
+ |age| name|
280
+ +---+-----+
281
+ |  2|Alice|
282
+ |  5|  Bob|
283
+ +---+-----+
281
284
"""
282
285
print (self ._jdf .showString (n ))
283
286
@@ -591,12 +594,15 @@ def describe(self, *cols):
591
594
given, this function computes statistics for all numerical columns.
592
595
593
596
>>> df.describe().show()
594
- summary age
595
- count 2
596
- mean 3.5
597
- stddev 1.5
598
- min 2
599
- max 5
597
+ +-------+---+
598
+ |summary|age|
599
+ +-------+---+
600
+ |  count|  2|
601
+ |   mean|3.5|
602
+ | stddev|1.5|
603
+ |    min|  2|
604
+ |    max|  5|
605
+ +-------+---+
600
606
"""
601
607
jdf = self ._jdf .describe (self ._jseq (cols ))
602
608
return DataFrame (jdf , self .sql_ctx )
@@ -801,12 +807,18 @@ def dropna(self, how='any', thresh=None, subset=None):
801
807
:param subset: optional list of column names to consider.
802
808
803
809
>>> df4.dropna().show()
804
- age height name
805
- 10 80 Alice
810
+ +---+------+-----+
811
+ |age|height| name|
812
+ +---+------+-----+
813
+ | 10|    80|Alice|
814
+ +---+------+-----+
806
815
807
816
>>> df4.na.drop().show()
808
- age height name
809
- 10 80 Alice
817
+ +---+------+-----+
818
+ |age|height| name|
819
+ +---+------+-----+
820
+ | 10|    80|Alice|
821
+ +---+------+-----+
810
822
"""
811
823
if how is not None and how not in ['any' , 'all' ]:
812
824
raise ValueError ("how ('" + how + "') should be 'any' or 'all'" )
@@ -837,25 +849,34 @@ def fillna(self, value, subset=None):
837
849
then the non-string column is simply ignored.
838
850
839
851
>>> df4.fillna(50).show()
840
- age height name
841
- 10 80 Alice
842
- 5 50 Bob
843
- 50 50 Tom
844
- 50 50 null
852
+ +---+------+-----+
853
+ |age|height| name|
854
+ +---+------+-----+
855
+ | 10|    80|Alice|
856
+ |  5|    50|  Bob|
857
+ | 50|    50|  Tom|
858
+ | 50|    50| null|
859
+ +---+------+-----+
845
860
846
861
>>> df4.fillna({'age': 50, 'name': 'unknown'}).show()
847
- age height name
848
- 10 80 Alice
849
- 5 null Bob
850
- 50 null Tom
851
- 50 null unknown
862
+ +---+------+-------+
863
+ |age|height|   name|
864
+ +---+------+-------+
865
+ | 10|    80|  Alice|
866
+ |  5|  null|    Bob|
867
+ | 50|  null|    Tom|
868
+ | 50|  null|unknown|
869
+ +---+------+-------+
852
870
853
871
>>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()
854
- age height name
855
- 10 80 Alice
856
- 5 null Bob
857
- 50 null Tom
858
- 50 null unknown
872
+ +---+------+-------+
873
+ |age|height|   name|
874
+ +---+------+-------+
875
+ | 10|    80|  Alice|
876
+ |  5|  null|    Bob|
877
+ | 50|  null|    Tom|
878
+ | 50|  null|unknown|
879
+ +---+------+-------+
859
880
"""
860
881
if not isinstance (value , (float , int , long , basestring , dict )):
861
882
raise ValueError ("value should be a float, int, long, string, or dict" )
@@ -1220,11 +1241,17 @@ def getItem(self, key):
1220
1241
1221
1242
>>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
1222
1243
>>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
1223
- l[0] d[key]
1224
- 1 value
1244
+ +----+------+
1245
+ |l[0]|d[key]|
1246
+ +----+------+
1247
+ |   1| value|
1248
+ +----+------+
1225
1249
>>> df.select(df.l[0], df.d["key"]).show()
1226
- l[0] d[key]
1227
- 1 value
1250
+ +----+------+
1251
+ |l[0]|d[key]|
1252
+ +----+------+
1253
+ |   1| value|
1254
+ +----+------+
1228
1255
"""
1229
1256
return self [key ]
1230
1257
@@ -1234,11 +1261,17 @@ def getField(self, name):
1234
1261
>>> from pyspark.sql import Row
1235
1262
>>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
1236
1263
>>> df.select(df.r.getField("b")).show()
1237
- r.b
1238
- b
1264
+ +---+
1265
+ |r.b|
1266
+ +---+
1267
+ |  b|
1268
+ +---+
1239
1269
>>> df.select(df.r.a).show()
1240
- r.a
1241
- 1
1270
+ +---+
1271
+ |r.a|
1272
+ +---+
1273
+ |  1|
1274
+ +---+
1242
1275
"""
1243
1276
return Column (self ._jc .getField (name ))
1244
1277
0 commit comments