[MINOR][PYSPARK][DOCS] Fix typo in example documentation #26299

Closed
wants to merge 25 commits into from
2 changes: 1 addition & 1 deletion python/pyspark/sql/context.py
@@ -318,7 +318,7 @@ def registerDataFrameAsTable(self, df, tableName):

@since(1.6)
def dropTempTable(self, tableName):
""" Remove the temp table from catalog.
""" Remove the temporary table from catalog.

>>> sqlContext.registerDataFrameAsTable(df, "table1")
>>> sqlContext.dropTempTable("table1")
112 changes: 60 additions & 52 deletions python/pyspark/sql/dataframe.py
@@ -58,7 +58,7 @@ class DataFrame(object):
Once created, it can be manipulated using the various domain-specific-language
(DSL) functions defined in: :class:`DataFrame`, :class:`Column`.

To select a column from the data frame, use the apply method::
To select a column from the :class:`DataFrame`, use the apply method::

ageCol = people.age

@@ -124,7 +124,7 @@ def toJSON(self, use_unicode=True):

@since(2.0)
def createTempView(self, name):
"""Creates a local temporary view with this DataFrame.
"""Creates a local temporary view with this :class:`DataFrame`.

The lifetime of this temporary table is tied to the :class:`SparkSession`
that was used to create this :class:`DataFrame`.
@@ -146,7 +146,7 @@ def createTempView(self, name):

@since(2.0)
def createOrReplaceTempView(self, name):
"""Creates or replaces a local temporary view with this DataFrame.
"""Creates or replaces a local temporary view with this :class:`DataFrame`.

The lifetime of this temporary table is tied to the :class:`SparkSession`
that was used to create this :class:`DataFrame`.
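
For context on the view API touched in this hunk, a minimal doctest-style sketch, assuming an active `spark` session and the two-row `df` fixture used throughout these docstrings:

>>> df.createOrReplaceTempView("people")
>>> spark.sql("SELECT count(*) AS n FROM people").collect()
[Row(n=2)]
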
@@ -164,7 +164,7 @@ def createOrReplaceTempView(self, name):

@since(2.1)
def createGlobalTempView(self, name):
"""Creates a global temporary view with this DataFrame.
"""Creates a global temporary view with this :class:`DataFrame`.

The lifetime of this temporary view is tied to this Spark application.
throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
@@ -312,7 +312,7 @@ def isLocal(self):
@property
@since(2.0)
def isStreaming(self):
"""Returns true if this :class:`Dataset` contains one or more sources that continuously
"""Returns ``True`` if this :class:`Dataset` contains one or more sources that continuously
return data as it arrives. A :class:`Dataset` that reads data from a streaming source
must be executed as a :class:`StreamingQuery` using the :func:`start` method in
:class:`DataStreamWriter`. Methods that return a single answer, (e.g., :func:`count` or
@@ -328,10 +328,10 @@ def show(self, n=20, truncate=True, vertical=False):
"""Prints the first ``n`` rows to the console.

:param n: Number of rows to show.
:param truncate: If set to True, truncate strings longer than 20 chars by default.
:param truncate: If set to ``True``, truncate strings longer than 20 chars by default.
If set to a number greater than one, truncates long strings to length ``truncate``
and align cells right.
:param vertical: If set to True, print output rows vertically (one line
:param vertical: If set to ``True``, print output rows vertically (one line
per column value).

>>> df
@@ -373,7 +373,7 @@ def __repr__(self):
return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))

def _repr_html_(self):
"""Returns a dataframe with html code when you enabled eager evaluation
"""Returns a :class:`DataFrame` with html code when you enabled eager evaluation
by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are
using support eager evaluation with HTML.
"""
@@ -407,11 +407,11 @@ def _repr_html_(self):
@since(2.1)
def checkpoint(self, eager=True):
"""Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the
logical plan of this DataFrame, which is especially useful in iterative algorithms where the
plan may grow exponentially. It will be saved to files inside the checkpoint
logical plan of this :class:`DataFrame`, which is especially useful in iterative algorithms
where the plan may grow exponentially. It will be saved to files inside the checkpoint
directory set with :meth:`SparkContext.setCheckpointDir`.

:param eager: Whether to checkpoint this DataFrame immediately
:param eager: Whether to checkpoint this :class:`DataFrame` immediately

.. note:: Experimental
"""
@@ -421,11 +421,11 @@ def checkpoint(self, eager=True):
@since(2.3)
def localCheckpoint(self, eager=True):
"""Returns a locally checkpointed version of this Dataset. Checkpointing can be used to
truncate the logical plan of this DataFrame, which is especially useful in iterative
algorithms where the plan may grow exponentially. Local checkpoints are stored in the
executors using the caching subsystem and therefore they are not reliable.
truncate the logical plan of this :class:`DataFrame`, which is especially useful in
iterative algorithms where the plan may grow exponentially. Local checkpoints are
stored in the executors using the caching subsystem and therefore they are not reliable.

:param eager: Whether to checkpoint this DataFrame immediately
:param eager: Whether to checkpoint this :class:`DataFrame` immediately

.. note:: Experimental
"""
@@ -468,7 +468,7 @@ def withWatermark(self, eventTime, delayThreshold):

@since(2.2)
def hint(self, name, *parameters):
"""Specifies some hint on the current DataFrame.
"""Specifies some hint on the current :class:`DataFrame`.

:param name: A name of the hint.
:param parameters: Optional parameters.
@@ -523,8 +523,9 @@ def collect(self):
def toLocalIterator(self, prefetchPartitions=False):
"""
Returns an iterator that contains all of the rows in this :class:`DataFrame`.
The iterator will consume as much memory as the largest partition in this DataFrame.
With prefetch it may consume up to the memory of the 2 largest partitions.
The iterator will consume as much memory as the largest partition in this
:class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest
partitions.

:param prefetchPartitions: If Spark should pre-fetch the next partition
before it is needed.
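
A small sketch of the iterator behaviour described above, assuming the two-row `df` fixture; rows are pulled back to the driver one partition at a time rather than all at once:

>>> sorted(row.age for row in df.toLocalIterator())
[2, 5]
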
@@ -633,7 +634,7 @@ def unpersist(self, blocking=False):
"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
memory and disk.

.. note:: `blocking` default has changed to False to match Scala in 2.0.
.. note:: `blocking` default has changed to ``False`` to match Scala in 2.0.
"""
self.is_cached = False
self._jdf.unpersist(blocking)
@@ -668,7 +669,7 @@ def coalesce(self, numPartitions):
def repartition(self, numPartitions, *cols):
"""
Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
resulting DataFrame is hash partitioned.
resulting :class:`DataFrame` is hash partitioned.

:param numPartitions:
can be an int to specify the target number of partitions or a Column.
@@ -730,7 +731,7 @@ def repartitionByRange(self, numPartitions, *cols):
def repartitionByRange(self, numPartitions, *cols):
"""
Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
resulting DataFrame is range partitioned.
resulting :class:`DataFrame` is range partitioned.

:param numPartitions:
can be an int to specify the target number of partitions or a Column.
@@ -790,7 +791,7 @@ def distinct(self):
def sample(self, withReplacement=None, fraction=None, seed=None):
"""Returns a sampled subset of this :class:`DataFrame`.

:param withReplacement: Sample with replacement or not (default False).
:param withReplacement: Sample with replacement or not (default ``False``).
:param fraction: Fraction of rows to generate, range [0.0, 1.0].
:param seed: Seed for sampling (default a random seed).

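A hedged sketch of `sample`: the fraction is only an expected proportion, not an exact row count, so the example below (built on a fresh `spark.range` frame, with an active `spark` session assumed) asserts only a bound:

>>> big = spark.range(100)
>>> sampled = big.sample(withReplacement=False, fraction=0.1, seed=42)
>>> 0 <= sampled.count() <= 100
True
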
@@ -862,7 +863,7 @@ def sampleBy(self, col, fractions, seed=None):
sampling fraction for each stratum. If a stratum is not
specified, we treat its fraction as zero.
:param seed: random seed
:return: a new DataFrame that represents the stratified sample
:return: a new :class:`DataFrame` that represents the stratified sample

>>> from pyspark.sql.functions import col
>>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
@@ -898,8 +899,8 @@ def randomSplit(self, weights, seed=None):
def randomSplit(self, weights, seed=None):
"""Randomly splits this :class:`DataFrame` with the provided weights.

:param weights: list of doubles as weights with which to split the DataFrame. Weights will
be normalized if they don't sum up to 1.0.
:param weights: list of doubles as weights with which to split the :class:`DataFrame`.
Weights will be normalized if they don't sum up to 1.0.
:param seed: The seed for sampling.

>>> splits = df4.randomSplit([1.0, 2.0], 24)
@@ -964,7 +965,7 @@ def colRegex(self, colName):
def alias(self, alias):
"""Returns a new :class:`DataFrame` with an alias set.

:param alias: string, an alias name to be set for the DataFrame.
:param alias: string, an alias name to be set for the :class:`DataFrame`.

>>> from pyspark.sql.functions import *
>>> df_as1 = df.alias("df_as1")
@@ -1056,7 +1057,7 @@ def sortWithinPartitions(self, *cols, **kwargs):
"""Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).

:param cols: list of :class:`Column` or column names to sort by.
:param ascending: boolean or list of boolean (default True).
:param ascending: boolean or list of boolean (default ``True``).
Sort ascending vs. descending. Specify list for multiple sort orders.
If a list is specified, length of the list must equal length of the `cols`.

@@ -1077,7 +1078,7 @@ def sort(self, *cols, **kwargs):
"""Returns a new :class:`DataFrame` sorted by the specified column(s).

:param cols: list of :class:`Column` or column names to sort by.
:param ascending: boolean or list of boolean (default True).
:param ascending: boolean or list of boolean (default ``True``).
Sort ascending vs. descending. Specify list for multiple sort orders.
If a list is specified, length of the list must equal length of the `cols`.

@@ -1144,7 +1145,8 @@ def describe(self, *cols):
given, this function computes statistics for all numerical or string columns.

.. note:: This function is meant for exploratory data analysis, as we make no
guarantee about the backward compatibility of the schema of the resulting DataFrame.
guarantee about the backward compatibility of the schema of the resulting
:class:`DataFrame`.

>>> df.describe(['age']).show()
+-------+------------------+
@@ -1188,7 +1190,8 @@ def summary(self, *statistics):
approximate quartiles (percentiles at 25%, 50%, and 75%), and max.

.. note:: This function is meant for exploratory data analysis, as we make no
guarantee about the backward compatibility of the schema of the resulting DataFrame.
guarantee about the backward compatibility of the schema of the resulting
:class:`DataFrame`.

>>> df.summary().show()
+-------+------------------+-----+
@@ -1310,7 +1313,7 @@ def select(self, *cols):

:param cols: list of column names (string) or expressions (:class:`Column`).
If one of the column names is '*', that column is expanded to include all columns
in the current DataFrame.
in the current :class:`DataFrame`.

>>> df.select('*').collect()
[Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
@@ -1414,7 +1417,7 @@ def rollup(self, *cols):
def cube(self, *cols):
"""
Create a multi-dimensional cube for the current :class:`DataFrame` using
the specified columns, so we can run aggregation on them.
the specified columns, so we can run aggregations on them.

>>> df.cube("name", df.age).count().orderBy("name", "age").show()
+-----+----+-----+
@@ -1448,7 +1451,8 @@ def agg(self, *exprs):

@since(2.0)
def union(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.
""" Return a new :class:`DataFrame` containing union of rows in this and another
:class:`DataFrame`.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by :func:`distinct`.
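
A small sketch of the `UNION ALL` semantics described here, using two hypothetical single-column frames (an active `spark` session assumed):

>>> df1 = spark.createDataFrame([(1,), (2,)], ["v"])
>>> df2 = spark.createDataFrame([(2,), (3,)], ["v"])
>>> df1.union(df2).count()               # duplicates kept, like UNION ALL
4
>>> df1.union(df2).distinct().count()    # SQL-style UNION
3
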
@@ -1459,7 +1463,8 @@ def union(self, other):

@since(1.3)
def unionAll(self, other):
""" Return a new :class:`DataFrame` containing union of rows in this and another frame.
""" Return a new :class:`DataFrame` containing union of rows in this and another
:class:`DataFrame`.

This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
(that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1470,7 +1475,8 @@ def unionAll(self, other):

@since(2.3)
def unionByName(self, other):
""" Returns a new :class:`DataFrame` containing union of rows in this and another frame.
""" Returns a new :class:`DataFrame` containing union of rows in this and another
:class:`DataFrame`.

This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
union (that does deduplication of elements), use this function followed by :func:`distinct`.
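
A minimal sketch of resolving columns by name rather than by position, with hypothetical single-row frames:

>>> d1 = spark.createDataFrame([(1, 2)], ["a", "b"])
>>> d2 = spark.createDataFrame([(3, 4)], ["b", "a"])
>>> d1.unionByName(d2).collect()
[Row(a=1, b=2), Row(a=4, b=3)]
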
@@ -1493,16 +1499,16 @@ def unionByName(self, other):
@since(1.3)
def intersect(self, other):
""" Return a new :class:`DataFrame` containing rows only in
both this frame and another frame.
both this :class:`DataFrame` and another :class:`DataFrame`.

This is equivalent to `INTERSECT` in SQL.
"""
return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)

@since(2.4)
def intersectAll(self, other):
""" Return a new :class:`DataFrame` containing rows in both this dataframe and other
dataframe while preserving duplicates.
""" Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
and another :class:`DataFrame` while preserving duplicates.

This is equivalent to `INTERSECT ALL` in SQL.
>>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
@@ -1523,8 +1529,8 @@ def intersectAll(self, other):

@since(1.3)
def subtract(self, other):
""" Return a new :class:`DataFrame` containing rows in this frame
but not in another frame.
""" Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`
but not in another :class:`DataFrame`.

This is equivalent to `EXCEPT DISTINCT` in SQL.

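A small sketch of the `EXCEPT DISTINCT` behaviour (note the result is de-duplicated), again with hypothetical frames:

>>> d1 = spark.createDataFrame([(1,), (1,), (2,)], ["v"])
>>> d2 = spark.createDataFrame([(1,)], ["v"])
>>> d1.subtract(d2).collect()
[Row(v=2)]
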
@@ -1814,12 +1820,12 @@ def all_of_(xs):
def approxQuantile(self, col, probabilities, relativeError):
"""
Calculates the approximate quantiles of numerical columns of a
DataFrame.
:class:`DataFrame`.

The result of this algorithm has the following deterministic bound:
If the DataFrame has N elements and if we request the quantile at
If the :class:`DataFrame` has N elements and if we request the quantile at
probability `p` up to error `err`, then the algorithm will return
a sample `x` from the DataFrame so that the *exact* rank of `x` is
a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is
close to (p * N). More precisely,

floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
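
To make the error bound concrete, a sketch with N = 1000 and err = 0.05; the returned median is only guaranteed to fall in the stated rank window, so the check is deliberately loose:

>>> nums = spark.range(1000)
>>> [median] = nums.approxQuantile("id", [0.5], 0.05)
>>> 400 <= median <= 600
True
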
@@ -1887,7 +1893,7 @@ def approxQuantile(self, col, probabilities, relativeError):
@since(1.4)
def corr(self, col1, col2, method=None):
"""
Calculates the correlation of two columns of a DataFrame as a double value.
Calculates the correlation of two columns of a :class:`DataFrame` as a double value.
Currently only supports the Pearson Correlation Coefficient.
:func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.

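A minimal sketch of the Pearson correlation on a perfectly linear, hypothetical pair of columns; the result is rounded to sidestep floating-point noise:

>>> pairs = spark.createDataFrame([(1, 2.0), (2, 4.0), (3, 6.0)], ["a", "b"])
>>> round(pairs.corr("a", "b"), 2)
1.0
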
@@ -1935,7 +1941,7 @@ def crosstab(self, col1, col2):
:param col1: The name of the first column. Distinct items will make the first item of
each row.
:param col2: The name of the second column. Distinct items will make the column names
of the DataFrame.
of the :class:`DataFrame`.
"""
if not isinstance(col1, basestring):
raise ValueError("col1 should be a string.")
@@ -1952,7 +1958,8 @@ def freqItems(self, cols, support=None):
:func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

.. note:: This function is meant for exploratory data analysis, as we make no
guarantee about the backward compatibility of the schema of the resulting DataFrame.
guarantee about the backward compatibility of the schema of the resulting
:class:`DataFrame`.

:param cols: Names of the columns to calculate frequent items for as a list or tuple of
strings.
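
A hedged sketch of `freqItems`: the result column name `a_freqItems` follows the usual `<col>_freqItems` convention and is an assumption here, and the algorithm may also return false positives alongside genuinely frequent items:

>>> vals = spark.createDataFrame([(1,), (1,), (1,), (2,)], ["a"])
>>> row = vals.freqItems(["a"], support=0.5).first()
>>> 1 in row["a_freqItems"]  # 1 appears in 75% of rows, above the 0.5 support
True
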
@@ -1974,8 +1981,8 @@ def withColumn(self, colName, col):
Returns a new :class:`DataFrame` by adding a column or replacing the
existing column that has the same name.

The column expression must be an expression over this DataFrame; attempting to add
a column from some other dataframe will raise an error.
The column expression must be an expression over this :class:`DataFrame`; attempting to add
a column from some other :class:`DataFrame` will raise an error.

:param colName: string, name of the new column.
:param col: a :class:`Column` expression for the new column.
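
A minimal sketch of adding a derived column built from this same :class:`DataFrame` (the two-row `df` fixture assumed):

>>> sorted(r["age_plus_2"] for r in df.withColumn("age_plus_2", df.age + 2).collect())
[4, 7]
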
@@ -2090,8 +2097,8 @@ def toPandas(self):

This is only available if Pandas is installed and available.

.. note:: This method should only be used if the resulting Pandas's DataFrame is expected
to be small, as all the data is loaded into the driver's memory.
.. note:: This method should only be used if the resulting Pandas's :class:`DataFrame` is
expected to be small, as all the data is loaded into the driver's memory.

.. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

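A small sketch of the driver-side conversion described above, assuming the two-row `df` fixture and a local pandas installation:

>>> pdf = df.toPandas()
>>> list(pdf.columns)
['age', 'name']
>>> len(pdf)
2
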
@@ -2293,8 +2300,9 @@ def _to_scala_map(sc, jm):

def _to_corrected_pandas_type(dt):
"""
When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong.
This method gets the corrected data type for Pandas if that type may be inferred uncorrectly.
When converting Spark SQL records to Pandas :class:`DataFrame`, the inferred data type may be
wrong. This method gets the corrected data type for Pandas if that type may be inferred
uncorrectly.
"""
import numpy as np
if type(dt) == ByteType:
2 changes: 1 addition & 1 deletion python/pyspark/sql/readwriter.py
@@ -733,7 +733,7 @@ def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
:param partitionBy: names of partitioning columns
:param options: all other string options

>>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
>>> df.write.mode("append").save(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode).options(**options)
if partitionBy is not None:
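
For completeness, the same write can also name the format explicitly rather than relying on the default; a hedged one-liner, assuming the `os` and `tempfile` imports already used by the surrounding doctests:

>>> df.write.save(os.path.join(tempfile.mkdtemp(), 'data'), format='parquet', mode='append')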