@@ -58,7 +58,7 @@ class DataFrame(object):
Once created, it can be manipulated using the various domain-specific-language
(DSL) functions defined in: :class:`DataFrame`, :class:`Column`.

- To select a column from the data frame, use the apply method::
+ To select a column from the :class:`DataFrame`, use the apply method::

    ageCol = people.age
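(Side note, not part of the patch: the apply-method access documented here looks like the following, assuming a running SparkSession named ``spark``.)

>>> people = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])
>>> ageCol = people.age          # attribute-style column access
>>> ageCol2 = people['age']      # equivalent item-style access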
@@ -124,7 +124,7 @@ def toJSON(self, use_unicode=True):
@since(2.0)
def createTempView(self, name):
- """Creates a local temporary view with this DataFrame.
+ """Creates a local temporary view with this :class:`DataFrame`.

The lifetime of this temporary table is tied to the :class:`SparkSession`
that was used to create this :class:`DataFrame`.
@@ -146,7 +146,7 @@ def createTempView(self, name):
@since(2.0)
def createOrReplaceTempView(self, name):
- """Creates or replaces a local temporary view with this DataFrame.
+ """Creates or replaces a local temporary view with this :class:`DataFrame`.

The lifetime of this temporary table is tied to the :class:`SparkSession`
that was used to create this :class:`DataFrame`.
@@ -164,7 +164,7 @@ def createOrReplaceTempView(self, name):
@since(2.1)
def createGlobalTempView(self, name):
- """Creates a global temporary view with this DataFrame.
+ """Creates a global temporary view with this :class:`DataFrame`.

The lifetime of this temporary view is tied to this Spark application.
throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
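(Side note: the three view-creating methods touched in these hunks differ mainly in lifetime. A minimal sketch, assuming an active SparkSession ``spark``; the view names are illustrative.)

>>> df = spark.createDataFrame([(2, 'Alice')], ['age', 'name'])
>>> df.createOrReplaceTempView('people')        # session-scoped, overwrites any existing view
>>> rows = spark.sql('SELECT name FROM people').collect()
>>> df.createGlobalTempView('people_global')    # application-scoped, lives in the global_temp database
>>> n = spark.sql('SELECT * FROM global_temp.people_global').count()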
@@ -312,7 +312,7 @@ def isLocal(self):
@property
@since(2.0)
def isStreaming(self):
- """Returns true if this :class:`Dataset` contains one or more sources that continuously
+ """Returns ``True`` if this :class:`Dataset` contains one or more sources that continuously
return data as it arrives. A :class:`Dataset` that reads data from a streaming source
must be executed as a :class:`StreamingQuery` using the :func:`start` method in
:class:`DataStreamWriter`. Methods that return a single answer, (e.g., :func:`count` or
@@ -328,10 +328,10 @@ def show(self, n=20, truncate=True, vertical=False):
"""Prints the first ``n`` rows to the console.

:param n: Number of rows to show.
- :param truncate: If set to True, truncate strings longer than 20 chars by default.
+ :param truncate: If set to ``True``, truncate strings longer than 20 chars by default.
If set to a number greater than one, truncates long strings to length ``truncate``
and align cells right.
- :param vertical: If set to True, print output rows vertically (one line
+ :param vertical: If set to ``True``, print output rows vertically (one line
per column value).

>>> df
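(Side note: a quick sketch of the ``truncate`` and ``vertical`` options described above, using the same doctest DataFrame ``df``; output omitted.)

>>> df.show()                    # up to 20 rows, strings truncated at 20 characters
>>> df.show(truncate=False)      # print full cell contents
>>> df.show(n=1, truncate=3)     # truncate strings to length 3 and right-align cells
>>> df.show(vertical=True)       # one line per column value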
@@ -373,7 +373,7 @@ def __repr__(self):
return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))

def _repr_html_(self):
- """Returns a dataframe with html code when you enabled eager evaluation
+ """Returns a :class:`DataFrame` with html code when you enabled eager evaluation
by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are
using support eager evaluation with HTML.
"""
@@ -407,11 +407,11 @@ def _repr_html_(self):
@since(2.1)
def checkpoint(self, eager=True):
"""Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the
- logical plan of this DataFrame, which is especially useful in iterative algorithms where the
- plan may grow exponentially. It will be saved to files inside the checkpoint
+ logical plan of this :class:`DataFrame`, which is especially useful in iterative algorithms
+ where the plan may grow exponentially. It will be saved to files inside the checkpoint
directory set with :meth:`SparkContext.setCheckpointDir`.

- :param eager: Whether to checkpoint this DataFrame immediately
+ :param eager: Whether to checkpoint this :class:`DataFrame` immediately

.. note:: Experimental
"""
@@ -421,11 +421,11 @@ def checkpoint(self, eager=True):
@since(2.3)
def localCheckpoint(self, eager=True):
"""Returns a locally checkpointed version of this Dataset. Checkpointing can be used to
- truncate the logical plan of this DataFrame, which is especially useful in iterative
- algorithms where the plan may grow exponentially. Local checkpoints are stored in the
- executors using the caching subsystem and therefore they are not reliable.
+ truncate the logical plan of this :class:`DataFrame`, which is especially useful in
+ iterative algorithms where the plan may grow exponentially. Local checkpoints are
+ stored in the executors using the caching subsystem and therefore they are not reliable.

- :param eager: Whether to checkpoint this DataFrame immediately
+ :param eager: Whether to checkpoint this :class:`DataFrame` immediately

.. note:: Experimental
"""
@@ -468,7 +468,7 @@ def withWatermark(self, eventTime, delayThreshold):
@since(2.2)
def hint(self, name, *parameters):
- """Specifies some hint on the current DataFrame.
+ """Specifies some hint on the current :class:`DataFrame`.

:param name: A name of the hint.
:param parameters: Optional parameters.
@@ -523,8 +523,9 @@ def collect(self):
def toLocalIterator(self, prefetchPartitions=False):
"""
Returns an iterator that contains all of the rows in this :class:`DataFrame`.
- The iterator will consume as much memory as the largest partition in this DataFrame.
- With prefetch it may consume up to the memory of the 2 largest partitions.
+ The iterator will consume as much memory as the largest partition in this
+ :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest
+ partitions.

:param prefetchPartitions: If Spark should pre-fetch the next partition
before it is needed.
@@ -633,7 +634,7 @@ def unpersist(self, blocking=False):
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
memory and disk.

- .. note:: `blocking` default has changed to False to match Scala in 2.0.
+ .. note:: `blocking` default has changed to ``False`` to match Scala in 2.0.
"""
self.is_cached = False
self._jdf.unpersist(blocking)
@@ -668,7 +669,7 @@ def coalesce(self, numPartitions):
def repartition(self, numPartitions, *cols):
"""
Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
- resulting DataFrame is hash partitioned.
+ resulting :class:`DataFrame` is hash partitioned.

:param numPartitions:
can be an int to specify the target number of partitions or a Column.
@@ -730,7 +731,7 @@ def repartition(self, numPartitions, *cols):
def repartitionByRange(self, numPartitions, *cols):
"""
Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
- resulting DataFrame is range partitioned.
+ resulting :class:`DataFrame` is range partitioned.

:param numPartitions:
can be an int to specify the target number of partitions or a Column.
@@ -790,7 +791,7 @@ def distinct(self):
def sample(self, withReplacement=None, fraction=None, seed=None):
"""Returns a sampled subset of this :class:`DataFrame`.

- :param withReplacement: Sample with replacement or not (default False).
+ :param withReplacement: Sample with replacement or not (default ``False``).
:param fraction: Fraction of rows to generate, range [0.0, 1.0].
:param seed: Seed for sampling (default a random seed).
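(Side note: the parameters above in a short sketch; the fraction and seed values are illustrative.)

>>> sampled = df.sample(withReplacement=False, fraction=0.5, seed=3)
>>> n = sampled.count()    # roughly half the rows; the fraction is a probability, not an exact count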
@@ -862,7 +863,7 @@ def sampleBy(self, col, fractions, seed=None):
sampling fraction for each stratum. If a stratum is not
specified, we treat its fraction as zero.
:param seed: random seed
- :return: a new DataFrame that represents the stratified sample
+ :return: a new :class:`DataFrame` that represents the stratified sample

>>> from pyspark.sql.functions import col
>>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
@@ -898,8 +899,8 @@ def sampleBy(self, col, fractions, seed=None):
def randomSplit(self, weights, seed=None):
"""Randomly splits this :class:`DataFrame` with the provided weights.

- :param weights: list of doubles as weights with which to split the DataFrame. Weights will
- be normalized if they don't sum up to 1.0.
+ :param weights: list of doubles as weights with which to split the :class:`DataFrame`.
+ Weights will be normalized if they don't sum up to 1.0.
:param seed: The seed for sampling.

>>> splits = df4.randomSplit([1.0, 2.0], 24)
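(Side note: a sketch of the weight normalization described above; [2.0, 1.0] is normalized to [2/3, 1/3].)

>>> train, test = df.randomSplit([2.0, 1.0], seed=24)    # two disjoint random subsets of df
>>> sizes = (train.count(), test.count())                 # roughly a 2:1 split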
@@ -964,7 +965,7 @@ def colRegex(self, colName):
def alias(self, alias):
"""Returns a new :class:`DataFrame` with an alias set.

- :param alias: string, an alias name to be set for the DataFrame.
+ :param alias: string, an alias name to be set for the :class:`DataFrame`.

>>> from pyspark.sql.functions import *
>>> df_as1 = df.alias("df_as1")
@@ -1056,7 +1057,7 @@ def sortWithinPartitions(self, *cols, **kwargs):
"""Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).

:param cols: list of :class:`Column` or column names to sort by.
- :param ascending: boolean or list of boolean (default True).
+ :param ascending: boolean or list of boolean (default ``True``).
Sort ascending vs. descending. Specify list for multiple sort orders.
If a list is specified, length of the list must equal length of the `cols`.
@@ -1077,7 +1078,7 @@ def sort(self, *cols, **kwargs):
"""Returns a new :class:`DataFrame` sorted by the specified column(s).

:param cols: list of :class:`Column` or column names to sort by.
- :param ascending: boolean or list of boolean (default True).
+ :param ascending: boolean or list of boolean (default ``True``).
Sort ascending vs. descending. Specify list for multiple sort orders.
If a list is specified, length of the list must equal length of the `cols`.
@@ -1144,7 +1145,8 @@ def describe(self, *cols):
given, this function computes statistics for all numerical or string columns.

.. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

>>> df.describe(['age']).show()
+-------+------------------+
@@ -1188,7 +1190,8 @@ def summary(self, *statistics):
approximate quartiles (percentiles at 25%, 50%, and 75%), and max.

.. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

>>> df.summary().show()
+-------+------------------+-----+
@@ -1310,7 +1313,7 @@ def select(self, *cols):
:param cols: list of column names (string) or expressions (:class:`Column`).
If one of the column names is '*', that column is expanded to include all columns
- in the current DataFrame.
+ in the current :class:`DataFrame`.

>>> df.select('*').collect()
[Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
@@ -1414,7 +1417,7 @@ def rollup(self, *cols):
def cube(self, *cols):
"""
Create a multi-dimensional cube for the current :class:`DataFrame` using
- the specified columns, so we can run aggregation on them.
+ the specified columns, so we can run aggregations on them.

>>> df.cube("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
@@ -1448,7 +1451,8 @@ def agg(self, *exprs):
@since(2.0)
def union(self, other):
- """ Return a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Return a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1459,7 +1463,8 @@ def union(self, other):
@since(1.3)
def unionAll(self, other):
- """ Return a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Return a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1470,7 +1475,8 @@ def unionAll(self, other):
@since(2.3)
def unionByName(self, other):
- """ Returns a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Returns a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
union (that does deduplication of elements), use this function followed by :func:`distinct`.
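(Side note: how the three union variants touched in these hunks differ, as a sketch; the example DataFrames are illustrative.)

>>> df1 = spark.createDataFrame([(1, 'a')], ['id', 'v'])
>>> df2 = spark.createDataFrame([('b', 2)], ['v', 'id'])
>>> by_name = df1.unionByName(df2)          # resolves columns by name, so id lines up with id
>>> by_position = df1.union(df1)            # resolves columns by position and keeps duplicates (UNION ALL)
>>> deduped = df1.union(df1).distinct()     # SQL-style UNION DISTINCT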
@@ -1493,16 +1499,16 @@ def unionByName(self, other):
@since(1.3)
def intersect(self, other):
""" Return a new :class:`DataFrame` containing rows only in
- both this frame and another frame.
+ both this :class:`DataFrame` and another :class:`DataFrame`.

This is equivalent to `INTERSECT` in SQL.
"""
return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)

@since(2.4)
def intersectAll(self, other):
- """ Return a new :class:`DataFrame` containing rows in both this dataframe and other
- dataframe while preserving duplicates.
+ """ Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
+ and another :class:`DataFrame` while preserving duplicates.

This is equivalent to `INTERSECT ALL` in SQL.
>>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
@@ -1523,8 +1529,8 @@ def intersectAll(self, other):
@since(1.3)
def subtract(self, other):
- """ Return a new :class:`DataFrame` containing rows in this frame
- but not in another frame.
+ """ Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`
+ but not in another :class:`DataFrame`.

This is equivalent to `EXCEPT DISTINCT` in SQL.
@@ -1814,12 +1820,12 @@ def all_of_(xs):
def approxQuantile(self, col, probabilities, relativeError):
"""
Calculates the approximate quantiles of numerical columns of a
- DataFrame.
+ :class:`DataFrame`.

The result of this algorithm has the following deterministic bound:
- If the DataFrame has N elements and if we request the quantile at
+ If the :class:`DataFrame` has N elements and if we request the quantile at
probability `p` up to error `err`, then the algorithm will return
- a sample `x` from the DataFrame so that the *exact* rank of `x` is
+ a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is
close to (p * N). More precisely,

floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
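(Side note: the error bound above in a short sketch; the column and parameter values are illustrative.)

>>> quartiles = spark.range(0, 1000).approxQuantile('id', [0.25, 0.5, 0.75], 0.05)
>>> # each returned value has an exact rank within 0.05 * N of the requested probability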
@@ -1887,7 +1893,7 @@ def approxQuantile(self, col, probabilities, relativeError):
@since(1.4)
def corr(self, col1, col2, method=None):
"""
- Calculates the correlation of two columns of a DataFrame as a double value.
+ Calculates the correlation of two columns of a :class:`DataFrame` as a double value.
Currently only supports the Pearson Correlation Coefficient.
:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.
@@ -1935,7 +1941,7 @@ def crosstab(self, col1, col2):
:param col1: The name of the first column. Distinct items will make the first item of
each row.
:param col2: The name of the second column. Distinct items will make the column names
- of the DataFrame.
+ of the :class:`DataFrame`.
"""
if not isinstance(col1, basestring):
raise ValueError("col1 should be a string.")
@@ -1952,7 +1958,8 @@ def freqItems(self, cols, support=None):
:func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

.. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

:param cols: Names of the columns to calculate frequent items for as a list or tuple of
strings.
@@ -1974,8 +1981,8 @@ def withColumn(self, colName, col):
Returns a new :class:`DataFrame` by adding a column or replacing the
existing column that has the same name.

- The column expression must be an expression over this DataFrame; attempting to add
- a column from some other dataframe will raise an error.
+ The column expression must be an expression over this :class:`DataFrame`; attempting to add
+ a column from some other :class:`DataFrame` will raise an error.

:param colName: string, name of the new column.
:param col: a :class:`Column` expression for the new column.
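(Side note: a sketch of the constraint documented above; ``other_df`` is a hypothetical second DataFrame.)

>>> from pyspark.sql.functions import col
>>> df2 = df.withColumn('age2', col('age') + 2)    # expression over df itself: allowed
>>> # df.withColumn('bad', other_df.age)           # column from another DataFrame: raises an error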
@@ -2090,8 +2097,8 @@ def toPandas(self):
This is only available if Pandas is installed and available.

- .. note:: This method should only be used if the resulting Pandas's DataFrame is expected
- to be small, as all the data is loaded into the driver's memory.
+ .. note:: This method should only be used if the resulting Pandas's :class:`DataFrame` is
+ expected to be small, as all the data is loaded into the driver's memory.

.. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.
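(Side note: a one-line sketch of the caution above.)

>>> pdf = df.toPandas()    # collects the whole DataFrame into driver memory; keep it small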
@@ -2293,8 +2300,9 @@ def _to_scala_map(sc, jm):
def _to_corrected_pandas_type(dt):
"""
- When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong.
- This method gets the corrected data type for Pandas if that type may be inferred uncorrectly.
+ When converting Spark SQL records to Pandas :class:`DataFrame`, the inferred data type may be
+ wrong. This method gets the corrected data type for Pandas if that type may be inferred
+ uncorrectly.
"""
import numpy as np
if type(dt) == ByteType: