
Commit 1e1b730

mstill3 authored and dongjoon-hyun committed
[MINOR][PYSPARK][DOCS] Fix typo in example documentation
### What changes were proposed in this pull request?

I propose that we change the example code documentation to call the proper function. For example, under the `foreachBatch` function, the example code was calling the `foreach()` function by mistake.

### Why are the changes needed?

It could confuse some people, and it is a typo.

### Does this PR introduce any user-facing change?

No. There is no "meaningful" code being changed, only the documentation.

### How was this patch tested?

I made the change on a fork and it still worked.

Closes #26299 from mstill3/patch-1.

Authored-by: Matt Stillwell <18670089+mstill3@users.noreply.github.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
1 parent 39fff92 commit 1e1b730
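For context, a minimal sketch of the pattern the corrected example is meant to show (the rate source, function name, and query setup below are illustrative, not taken from this patch): `foreachBatch` registers a callback that receives each micro-batch as an ordinary DataFrame, whereas `foreach` operates row by row.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("foreach_batch_sketch").getOrCreate()

# A toy streaming DataFrame; the "rate" source emits incrementing timestamped rows.
stream_df = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

def process_batch(batch_df, batch_id):
    # Called once per micro-batch with a regular (non-streaming) DataFrame.
    batch_df.show()

# The docstring example should call foreachBatch here, not foreach.
query = stream_df.writeStream.foreachBatch(process_batch).start()
query.stop()  # stop immediately; this is only a sketch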

File tree

5 files changed: +78 -67 lines changed


python/pyspark/sql/context.py

Lines changed: 1 addition & 1 deletion
@@ -318,7 +318,7 @@ def registerDataFrameAsTable(self, df, tableName):

  @since(1.6)
  def dropTempTable(self, tableName):
- """ Remove the temp table from catalog.
+ """ Remove the temporary table from catalog.

  >>> sqlContext.registerDataFrameAsTable(df, "table1")
  >>> sqlContext.dropTempTable("table1")

python/pyspark/sql/dataframe.py

Lines changed: 60 additions & 52 deletions
@@ -58,7 +58,7 @@ class DataFrame(object):
  Once created, it can be manipulated using the various domain-specific-language
  (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.

- To select a column from the data frame, use the apply method::
+ To select a column from the :class:`DataFrame`, use the apply method::

  ageCol = people.age

@@ -124,7 +124,7 @@ def toJSON(self, use_unicode=True):

  @since(2.0)
  def createTempView(self, name):
- """Creates a local temporary view with this DataFrame.
+ """Creates a local temporary view with this :class:`DataFrame`.

  The lifetime of this temporary table is tied to the :class:`SparkSession`
  that was used to create this :class:`DataFrame`.
@@ -146,7 +146,7 @@ def createTempView(self, name):

  @since(2.0)
  def createOrReplaceTempView(self, name):
- """Creates or replaces a local temporary view with this DataFrame.
+ """Creates or replaces a local temporary view with this :class:`DataFrame`.

  The lifetime of this temporary table is tied to the :class:`SparkSession`
  that was used to create this :class:`DataFrame`.
@@ -164,7 +164,7 @@ def createOrReplaceTempView(self, name):

  @since(2.1)
  def createGlobalTempView(self, name):
- """Creates a global temporary view with this DataFrame.
+ """Creates a global temporary view with this :class:`DataFrame`.

  The lifetime of this temporary view is tied to this Spark application.
  throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the
@@ -312,7 +312,7 @@ def isLocal(self):
  @property
  @since(2.0)
  def isStreaming(self):
- """Returns true if this :class:`Dataset` contains one or more sources that continuously
+ """Returns ``True`` if this :class:`Dataset` contains one or more sources that continuously
  return data as it arrives. A :class:`Dataset` that reads data from a streaming source
  must be executed as a :class:`StreamingQuery` using the :func:`start` method in
  :class:`DataStreamWriter`. Methods that return a single answer, (e.g., :func:`count` or
@@ -328,10 +328,10 @@ def show(self, n=20, truncate=True, vertical=False):
  """Prints the first ``n`` rows to the console.

  :param n: Number of rows to show.
- :param truncate: If set to True, truncate strings longer than 20 chars by default.
+ :param truncate: If set to ``True``, truncate strings longer than 20 chars by default.
  If set to a number greater than one, truncates long strings to length ``truncate``
  and align cells right.
- :param vertical: If set to True, print output rows vertically (one line
+ :param vertical: If set to ``True``, print output rows vertically (one line
  per column value).

  >>> df
@@ -373,7 +373,7 @@ def __repr__(self):
  return "DataFrame[%s]" % (", ".join("%s: %s" % c for c in self.dtypes))

  def _repr_html_(self):
- """Returns a dataframe with html code when you enabled eager evaluation
+ """Returns a :class:`DataFrame` with html code when you enabled eager evaluation
  by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are
  using support eager evaluation with HTML.
  """
@@ -407,11 +407,11 @@ def _repr_html_(self):
  @since(2.1)
  def checkpoint(self, eager=True):
  """Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the
- logical plan of this DataFrame, which is especially useful in iterative algorithms where the
- plan may grow exponentially. It will be saved to files inside the checkpoint
+ logical plan of this :class:`DataFrame`, which is especially useful in iterative algorithms
+ where the plan may grow exponentially. It will be saved to files inside the checkpoint
  directory set with :meth:`SparkContext.setCheckpointDir`.

- :param eager: Whether to checkpoint this DataFrame immediately
+ :param eager: Whether to checkpoint this :class:`DataFrame` immediately

  .. note:: Experimental
  """
@@ -421,11 +421,11 @@ def checkpoint(self, eager=True):
  @since(2.3)
  def localCheckpoint(self, eager=True):
  """Returns a locally checkpointed version of this Dataset. Checkpointing can be used to
- truncate the logical plan of this DataFrame, which is especially useful in iterative
- algorithms where the plan may grow exponentially. Local checkpoints are stored in the
- executors using the caching subsystem and therefore they are not reliable.
+ truncate the logical plan of this :class:`DataFrame`, which is especially useful in
+ iterative algorithms where the plan may grow exponentially. Local checkpoints are
+ stored in the executors using the caching subsystem and therefore they are not reliable.

- :param eager: Whether to checkpoint this DataFrame immediately
+ :param eager: Whether to checkpoint this :class:`DataFrame` immediately

  .. note:: Experimental
  """
@@ -468,7 +468,7 @@ def withWatermark(self, eventTime, delayThreshold):

  @since(2.2)
  def hint(self, name, *parameters):
- """Specifies some hint on the current DataFrame.
+ """Specifies some hint on the current :class:`DataFrame`.

  :param name: A name of the hint.
  :param parameters: Optional parameters.
@@ -523,8 +523,9 @@ def collect(self):
  def toLocalIterator(self, prefetchPartitions=False):
  """
  Returns an iterator that contains all of the rows in this :class:`DataFrame`.
- The iterator will consume as much memory as the largest partition in this DataFrame.
- With prefetch it may consume up to the memory of the 2 largest partitions.
+ The iterator will consume as much memory as the largest partition in this
+ :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest
+ partitions.

  :param prefetchPartitions: If Spark should pre-fetch the next partition
  before it is needed.
@@ -633,7 +634,7 @@ def unpersist(self, blocking=False):
  """Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from
  memory and disk.

- .. note:: `blocking` default has changed to False to match Scala in 2.0.
+ .. note:: `blocking` default has changed to ``False`` to match Scala in 2.0.
  """
  self.is_cached = False
  self._jdf.unpersist(blocking)
@@ -668,7 +669,7 @@ def coalesce(self, numPartitions):
  def repartition(self, numPartitions, *cols):
  """
  Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
- resulting DataFrame is hash partitioned.
+ resulting :class:`DataFrame` is hash partitioned.

  :param numPartitions:
  can be an int to specify the target number of partitions or a Column.
@@ -730,7 +731,7 @@ def repartition(self, numPartitions, *cols):
  def repartitionByRange(self, numPartitions, *cols):
  """
  Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The
- resulting DataFrame is range partitioned.
+ resulting :class:`DataFrame` is range partitioned.

  :param numPartitions:
  can be an int to specify the target number of partitions or a Column.
@@ -790,7 +791,7 @@ def distinct(self):
  def sample(self, withReplacement=None, fraction=None, seed=None):
  """Returns a sampled subset of this :class:`DataFrame`.

- :param withReplacement: Sample with replacement or not (default False).
+ :param withReplacement: Sample with replacement or not (default ``False``).
  :param fraction: Fraction of rows to generate, range [0.0, 1.0].
  :param seed: Seed for sampling (default a random seed).

@@ -862,7 +863,7 @@ def sampleBy(self, col, fractions, seed=None):
  sampling fraction for each stratum. If a stratum is not
  specified, we treat its fraction as zero.
  :param seed: random seed
- :return: a new DataFrame that represents the stratified sample
+ :return: a new :class:`DataFrame` that represents the stratified sample

  >>> from pyspark.sql.functions import col
  >>> dataset = sqlContext.range(0, 100).select((col("id") % 3).alias("key"))
@@ -898,8 +899,8 @@ def sampleBy(self, col, fractions, seed=None):
  def randomSplit(self, weights, seed=None):
  """Randomly splits this :class:`DataFrame` with the provided weights.

- :param weights: list of doubles as weights with which to split the DataFrame. Weights will
- be normalized if they don't sum up to 1.0.
+ :param weights: list of doubles as weights with which to split the :class:`DataFrame`.
+ Weights will be normalized if they don't sum up to 1.0.
  :param seed: The seed for sampling.

  >>> splits = df4.randomSplit([1.0, 2.0], 24)
@@ -964,7 +965,7 @@ def colRegex(self, colName):
  def alias(self, alias):
  """Returns a new :class:`DataFrame` with an alias set.

- :param alias: string, an alias name to be set for the DataFrame.
+ :param alias: string, an alias name to be set for the :class:`DataFrame`.

  >>> from pyspark.sql.functions import *
  >>> df_as1 = df.alias("df_as1")
@@ -1056,7 +1057,7 @@ def sortWithinPartitions(self, *cols, **kwargs):
  """Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).

  :param cols: list of :class:`Column` or column names to sort by.
- :param ascending: boolean or list of boolean (default True).
+ :param ascending: boolean or list of boolean (default ``True``).
  Sort ascending vs. descending. Specify list for multiple sort orders.
  If a list is specified, length of the list must equal length of the `cols`.

@@ -1077,7 +1078,7 @@ def sort(self, *cols, **kwargs):
  """Returns a new :class:`DataFrame` sorted by the specified column(s).

  :param cols: list of :class:`Column` or column names to sort by.
- :param ascending: boolean or list of boolean (default True).
+ :param ascending: boolean or list of boolean (default ``True``).
  Sort ascending vs. descending. Specify list for multiple sort orders.
  If a list is specified, length of the list must equal length of the `cols`.

@@ -1144,7 +1145,8 @@ def describe(self, *cols):
  given, this function computes statistics for all numerical or string columns.

  .. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

  >>> df.describe(['age']).show()
  +-------+------------------+
@@ -1188,7 +1190,8 @@ def summary(self, *statistics):
  approximate quartiles (percentiles at 25%, 50%, and 75%), and max.

  .. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

  >>> df.summary().show()
  +-------+------------------+-----+
@@ -1310,7 +1313,7 @@ def select(self, *cols):

  :param cols: list of column names (string) or expressions (:class:`Column`).
  If one of the column names is '*', that column is expanded to include all columns
- in the current DataFrame.
+ in the current :class:`DataFrame`.

  >>> df.select('*').collect()
  [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]
@@ -1414,7 +1417,7 @@ def rollup(self, *cols):
  def cube(self, *cols):
  """
  Create a multi-dimensional cube for the current :class:`DataFrame` using
- the specified columns, so we can run aggregation on them.
+ the specified columns, so we can run aggregations on them.

  >>> df.cube("name", df.age).count().orderBy("name", "age").show()
  +-----+----+-----+
@@ -1448,7 +1451,8 @@ def agg(self, *exprs):

  @since(2.0)
  def union(self, other):
- """ Return a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Return a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

  This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
  (that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1459,7 +1463,8 @@ def union(self, other):

  @since(1.3)
  def unionAll(self, other):
- """ Return a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Return a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

  This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
  (that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1470,7 +1475,8 @@ def unionAll(self, other):

  @since(2.3)
  def unionByName(self, other):
- """ Returns a new :class:`DataFrame` containing union of rows in this and another frame.
+ """ Returns a new :class:`DataFrame` containing union of rows in this and another
+ :class:`DataFrame`.

  This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
  union (that does deduplication of elements), use this function followed by :func:`distinct`.
@@ -1493,16 +1499,16 @@ def unionByName(self, other):
  @since(1.3)
  def intersect(self, other):
  """ Return a new :class:`DataFrame` containing rows only in
- both this frame and another frame.
+ both this :class:`DataFrame` and another :class:`DataFrame`.

  This is equivalent to `INTERSECT` in SQL.
  """
  return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)

  @since(2.4)
  def intersectAll(self, other):
- """ Return a new :class:`DataFrame` containing rows in both this dataframe and other
- dataframe while preserving duplicates.
+ """ Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
+ and another :class:`DataFrame` while preserving duplicates.

  This is equivalent to `INTERSECT ALL` in SQL.
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
@@ -1523,8 +1529,8 @@ def intersectAll(self, other):

  @since(1.3)
  def subtract(self, other):
- """ Return a new :class:`DataFrame` containing rows in this frame
- but not in another frame.
+ """ Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`
+ but not in another :class:`DataFrame`.

  This is equivalent to `EXCEPT DISTINCT` in SQL.

@@ -1814,12 +1820,12 @@ def all_of_(xs):
  def approxQuantile(self, col, probabilities, relativeError):
  """
  Calculates the approximate quantiles of numerical columns of a
- DataFrame.
+ :class:`DataFrame`.

  The result of this algorithm has the following deterministic bound:
- If the DataFrame has N elements and if we request the quantile at
+ If the :class:`DataFrame` has N elements and if we request the quantile at
  probability `p` up to error `err`, then the algorithm will return
- a sample `x` from the DataFrame so that the *exact* rank of `x` is
+ a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is
  close to (p * N). More precisely,

  floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
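To make that bound concrete, a small worked example (the numbers are illustrative, not from the docstring); exact fractions are used so the arithmetic is not blurred by floating point:

from fractions import Fraction
from math import ceil, floor

# 1,000 rows, the median (p = 0.5), and a 1% relative error.
N, p, err = 1000, Fraction(1, 2), Fraction(1, 100)
lower = floor((p - err) * N)   # 490
upper = ceil((p + err) * N)    # 510
# approxQuantile may return any sample whose exact rank lies in [490, 510].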
@@ -1887,7 +1893,7 @@ def approxQuantile(self, col, probabilities, relativeError):
  @since(1.4)
  def corr(self, col1, col2, method=None):
  """
- Calculates the correlation of two columns of a DataFrame as a double value.
+ Calculates the correlation of two columns of a :class:`DataFrame` as a double value.
  Currently only supports the Pearson Correlation Coefficient.
  :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.

@@ -1935,7 +1941,7 @@ def crosstab(self, col1, col2):
  :param col1: The name of the first column. Distinct items will make the first item of
  each row.
  :param col2: The name of the second column. Distinct items will make the column names
- of the DataFrame.
+ of the :class:`DataFrame`.
  """
  if not isinstance(col1, basestring):
  raise ValueError("col1 should be a string.")
@@ -1952,7 +1958,8 @@ def freqItems(self, cols, support=None):
  :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

  .. note:: This function is meant for exploratory data analysis, as we make no
- guarantee about the backward compatibility of the schema of the resulting DataFrame.
+ guarantee about the backward compatibility of the schema of the resulting
+ :class:`DataFrame`.

  :param cols: Names of the columns to calculate frequent items for as a list or tuple of
  strings.
@@ -1974,8 +1981,8 @@ def withColumn(self, colName, col):
  Returns a new :class:`DataFrame` by adding a column or replacing the
  existing column that has the same name.

- The column expression must be an expression over this DataFrame; attempting to add
- a column from some other dataframe will raise an error.
+ The column expression must be an expression over this :class:`DataFrame`; attempting to add
+ a column from some other :class:`DataFrame` will raise an error.

  :param colName: string, name of the new column.
  :param col: a :class:`Column` expression for the new column.
@@ -2090,8 +2097,8 @@ def toPandas(self):

  This is only available if Pandas is installed and available.

- .. note:: This method should only be used if the resulting Pandas's DataFrame is expected
- to be small, as all the data is loaded into the driver's memory.
+ .. note:: This method should only be used if the resulting Pandas's :class:`DataFrame` is
+ expected to be small, as all the data is loaded into the driver's memory.

  .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

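As a usage sketch of the notes above (the row limit and config call are illustrative, not from the patch): Arrow-based conversion is opt-in via the configuration named in the docstring, and the result should be kept small because it is collected to the driver.

# Opt in to the (experimental at this point) Arrow-based conversion path.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Keep the converted result small; all of it lands in the driver's memory.
pdf = df.limit(1000).toPandas()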
@@ -2293,8 +2300,9 @@ def _to_scala_map(sc, jm):

  def _to_corrected_pandas_type(dt):
  """
- When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong.
- This method gets the corrected data type for Pandas if that type may be inferred uncorrectly.
+ When converting Spark SQL records to Pandas :class:`DataFrame`, the inferred data type may be
+ wrong. This method gets the corrected data type for Pandas if that type may be inferred
+ uncorrectly.
  """
  import numpy as np
  if type(dt) == ByteType:

python/pyspark/sql/readwriter.py

Lines changed: 1 addition & 1 deletion
@@ -733,7 +733,7 @@ def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
  :param partitionBy: names of partitioning columns
  :param options: all other string options

- >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
+ >>> df.write.mode("append").save(os.path.join(tempfile.mkdtemp(), 'data'))
  """
  self.mode(mode).options(**options)
  if partitionBy is not None:
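For reference, a short sketch of the distinction the corrected doctest points at (paths are illustrative): save() writes with the source configured by spark.sql.sources.default, which is parquet unless overridden, while parquet() is the format-specific shorthand.

import os
import tempfile

out_dir = tempfile.mkdtemp()

# Generic writer; the format comes from spark.sql.sources.default.
df.write.mode("append").save(os.path.join(out_dir, "data_generic"))

# Format-specific shorthand, equivalent to .format("parquet").save(...).
df.write.mode("append").parquet(os.path.join(out_dir, "data_parquet"))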
