
Commit f081d47

More documentation updates.
1 parent c9902fa commit f081d47

5 files changed: +77 additions, -140 deletions


.rat-excludes

Lines changed: 1 addition & 0 deletions
@@ -82,3 +82,4 @@ local-1426633911242/*
 local-1430917381534/*
 DESCRIPTION
 NAMESPACE
+test_support/*

python/pyspark/sql/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -71,4 +71,5 @@ def deco(f):
 __all__ = [
     'SQLContext', 'HiveContext', 'DataFrame', 'GroupedData', 'Column', 'Row',
     'DataFrameNaFunctions', 'DataFrameStatFunctions', 'Window', 'WindowSpec',
+    'DataFrameReader', 'DataFrameWriter'
 ]
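
With DataFrameReader and DataFrameWriter now exported, the classes can be imported straight from the package; a minimal sketch (assuming the package __init__ also imports them from pyspark.sql.readwriter, and reusing the sqlContext/df doctest fixtures):

    >>> from pyspark.sql import DataFrameReader, DataFrameWriter
    >>> isinstance(sqlContext.read, DataFrameReader)
    True
    >>> isinstance(df.write, DataFrameWriter)
    True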

python/pyspark/sql/context.py

Lines changed: 30 additions & 53 deletions
@@ -124,7 +124,10 @@ def getConf(self, key, defaultValue):
     @property
     @since("1.3.1")
     def udf(self):
-        """Returns a :class:`UDFRegistration` for UDF registration."""
+        """Returns a :class:`UDFRegistration` for UDF registration.
+
+        :return: :class:`UDFRegistration`
+        """
         return UDFRegistration(self)
 
     @since(1.4)
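
For context, the UDFRegistration object returned here is used roughly as follows (the strLen UDF and the query are illustrative only, not part of this commit; sqlContext is the usual doctest fixture):

    >>> from pyspark.sql.types import IntegerType
    >>> sqlContext.udf.register("strLen", lambda s: len(s), IntegerType())
    >>> rows = sqlContext.sql("SELECT strLen('test') AS n").collect()
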
@@ -138,7 +141,7 @@ def range(self, start, end, step=1, numPartitions=None):
         :param end: the end value (exclusive)
         :param step: the incremental step (default: 1)
         :param numPartitions: the number of partitions of the DataFrame
-        :return: A new DataFrame
+        :return: :class:`DataFrame`
 
         >>> sqlContext.range(1, 7, 2).collect()
         [Row(id=1), Row(id=3), Row(id=5)]
@@ -196,7 +199,7 @@ def _inferSchema(self, rdd, samplingRatio=None):
                              "can not infer schema")
         if type(first) is dict:
             warnings.warn("Using RDD of dict to inferSchema is deprecated,"
-                          "please use pyspark.sql.Row instead")
+                          "please use pyspark.sql.Row instead", DeprecationWarning)
 
         if samplingRatio is None:
             schema = _infer_schema(first)
@@ -219,7 +222,8 @@ def inferSchema(self, rdd, samplingRatio=None):
         """
         .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
         """
-        warnings.warn("inferSchema is deprecated, please use createDataFrame instead")
+        warnings.warn(
+            "inferSchema is deprecated, please use createDataFrame instead.", DeprecationWarning)
 
         if isinstance(rdd, DataFrame):
             raise TypeError("Cannot apply schema to DataFrame")
@@ -231,7 +235,8 @@ def applySchema(self, rdd, schema):
         """
         .. note:: Deprecated in 1.3, use :func:`createDataFrame` instead.
         """
-        warnings.warn("applySchema is deprecated, please use createDataFrame instead")
+        warnings.warn(
+            "applySchema is deprecated, please use createDataFrame instead", DeprecationWarning)
 
         if isinstance(rdd, DataFrame):
             raise TypeError("Cannot apply schema to DataFrame")
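
A sketch of the migration these deprecation warnings point to, assuming an existing RDD of dicts named rdd and the usual sqlContext fixture:

    >>> from pyspark.sql import Row
    >>> rows = rdd.map(lambda d: Row(**d))      # an RDD of Rows instead of an RDD of dicts
    >>> df2 = sqlContext.createDataFrame(rows)  # replaces inferSchema(rdd) / applySchema(rdd, schema)
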
@@ -262,6 +267,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             :class:`list`, or :class:`pandas.DataFrame`.
         :param schema: a :class:`StructType` or list of column names. default None.
         :param samplingRatio: the sample ratio of rows used for inferring
+        :return: :class:`DataFrame`
 
         >>> l = [('Alice', 1)]
         >>> sqlContext.createDataFrame(l).collect()
@@ -359,58 +365,25 @@ def registerDataFrameAsTable(self, df, tableName):
         else:
             raise ValueError("Can only register DataFrame as table")
 
-    @since(1.0)
     def parquetFile(self, *paths):
         """Loads a Parquet file, returning the result as a :class:`DataFrame`.
 
-        >>> import tempfile, shutil
-        >>> parquetFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(parquetFile)
-        >>> df.saveAsParquetFile(parquetFile)
-        >>> df2 = sqlContext.parquetFile(parquetFile)
-        >>> sorted(df.collect()) == sorted(df2.collect())
-        True
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.parquet` instead.
         """
+        warnings.warn("parquetFile is deprecated. Use read.parquet() instead.", DeprecationWarning)
         gateway = self._sc._gateway
         jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths))
         for i in range(0, len(paths)):
             jpaths[i] = paths[i]
         jdf = self._ssql_ctx.parquetFile(jpaths)
         return DataFrame(jdf, self)
 
-    @since(1.0)
     def jsonFile(self, path, schema=None, samplingRatio=1.0):
         """Loads a text file storing one JSON object per line as a :class:`DataFrame`.
 
-        If the schema is provided, applies the given schema to this JSON dataset.
-        Otherwise, it samples the dataset with ratio ``samplingRatio`` to determine the schema.
-
-        >>> import tempfile, shutil
-        >>> jsonFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(jsonFile)
-        >>> with open(jsonFile, 'w') as f:
-        ...     f.writelines(jsonStrings)
-        >>> df1 = sqlContext.jsonFile(jsonFile)
-        >>> df1.printSchema()
-        root
-         |-- field1: long (nullable = true)
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field4: long (nullable = true)
-
-        >>> from pyspark.sql.types import *
-        >>> schema = StructType([
-        ...     StructField("field2", StringType()),
-        ...     StructField("field3",
-        ...         StructType([StructField("field5", ArrayType(IntegerType()))]))])
-        >>> df2 = sqlContext.jsonFile(jsonFile, schema)
-        >>> df2.printSchema()
-        root
-         |-- field2: string (nullable = true)
-         |-- field3: struct (nullable = true)
-         |    |-- field5: array (nullable = true)
-         |    |    |-- element: integer (containsNull = true)
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.json` instead.
         """
+        warnings.warn("jsonFile is deprecated. Use read.json() instead.", DeprecationWarning)
         if schema is None:
             df = self._ssql_ctx.jsonFile(path, samplingRatio)
         else:
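
The reader-based replacements named in the notes above look roughly like this (paths are placeholders; schema would be a StructType built beforehand):

    >>> df1 = sqlContext.read.parquet("/path/to/data.parquet")
    >>> df2 = sqlContext.read.json("/path/to/data.json")
    >>> df3 = sqlContext.read.json("/path/to/data.json", schema=schema)
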
@@ -462,21 +435,16 @@ def func(iterator):
             df = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype)
         return DataFrame(df, self)
 
-    @since(1.3)
     def load(self, path=None, source=None, schema=None, **options):
         """Returns the dataset in a data source as a :class:`DataFrame`.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Optionally, a schema can be provided as the schema of the returned DataFrame.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameReader.load` instead.
         """
+        warnings.warn("load is deprecated. Use read.load() instead.", DeprecationWarning)
         return self.read.load(path, source, schema, **options)
 
     @since(1.3)
-    def createExternalTable(self, tableName, path=None, source=None,
-                            schema=None, **options):
+    def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):
         """Creates an external table based on the dataset in a data source.
 
         It returns the DataFrame associated with the external table.
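
Rough DataFrameReader equivalents of the deprecated load() call (paths and formats are placeholders):

    >>> df2 = sqlContext.read.load("/path/to/data", format="parquet")
    >>> df3 = sqlContext.read.format("json").load("/path/to/data.json")
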
@@ -487,6 +455,8 @@ def createExternalTable(self, tableName, path=None, source=None,
 
         Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and
         created external table.
+
+        :return: :class:`DataFrame`
         """
         if path is not None:
             options["path"] = path
@@ -508,6 +478,8 @@ def createExternalTable(self, tableName, path=None, source=None,
     def sql(self, sqlQuery):
         """Returns a :class:`DataFrame` representing the result of the given query.
 
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.sql("SELECT field1 AS f1, field2 as f2 from table1")
         >>> df2.collect()
@@ -519,6 +491,8 @@ def sql(self, sqlQuery):
     def table(self, tableName):
         """Returns the specified table as a :class:`DataFrame`.
 
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.table("table1")
         >>> sorted(df.collect()) == sorted(df2.collect())
@@ -536,6 +510,9 @@ def tables(self, dbName=None):
         The returned DataFrame has two columns: ``tableName`` and ``isTemporary``
         (a column with :class:`BooleanType` indicating if a table is a temporary one or not).
 
+        :param dbName: string, name of the database to use.
+        :return: :class:`DataFrame`
+
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> df2 = sqlContext.tables()
         >>> df2.filter("tableName = 'table1'").first()
@@ -550,7 +527,8 @@ def tables(self, dbName=None):
     def tableNames(self, dbName=None):
         """Returns a list of names of tables in the database ``dbName``.
 
-        If ``dbName`` is not specified, the current database will be used.
+        :param dbName: string, name of the database to use. Default to the current database.
+        :return: list of table names, in string
 
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> "table1" in sqlContext.tableNames()
@@ -585,8 +563,7 @@ def read(self):
         Returns a :class:`DataFrameReader` that can be used to read data
         in as a :class:`DataFrame`.
 
-        >>> sqlContext.read
-        <pyspark.sql.readwriter.DataFrameReader object at ...>
+        :return: :class:`DataFrameReader`
         """
         return DataFrameReader(self)

python/pyspark/sql/dataframe.py

Lines changed: 20 additions & 49 deletions
@@ -44,7 +44,7 @@ class DataFrame(object):
     A :class:`DataFrame` is equivalent to a relational table in Spark SQL,
     and can be created using various functions in :class:`SQLContext`::
 
-        people = sqlContext.parquetFile("...")
+        people = sqlContext.read.parquet("...")
 
     Once created, it can be manipulated using the various domain-specific-language
     (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.
@@ -56,8 +56,8 @@ class DataFrame(object):
     A more concrete example::
 
         # To create DataFrame using SQLContext
-        people = sqlContext.parquetFile("...")
-        department = sqlContext.parquetFile("...")
+        people = sqlContext.read.parquet("...")
+        department = sqlContext.read.parquet("...")
 
         people.filter(people.age > 30).join(department, people.deptId == department.id)) \
           .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
@@ -120,21 +120,13 @@ def toJSON(self, use_unicode=True):
         rdd = self._jdf.toJSON()
         return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))
 
-    @since(1.3)
     def saveAsParquetFile(self, path):
         """Saves the contents as a Parquet file, preserving the schema.
 
-        Files that are written out using this method can be read back in as
-        a :class:`DataFrame` using :func:`SQLContext.parquetFile`.
-
-        >>> import tempfile, shutil
-        >>> parquetFile = tempfile.mkdtemp()
-        >>> shutil.rmtree(parquetFile)
-        >>> df.saveAsParquetFile(parquetFile)
-        >>> df2 = sqlContext.parquetFile(parquetFile)
-        >>> sorted(df2.collect()) == sorted(df.collect())
-        True
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.parquet` instead.
         """
+        warnings.warn(
+            "saveAsParquetFile is deprecated. Use write.parquet() instead.", DeprecationWarning)
         self._jdf.saveAsParquetFile(path)
 
     @since(1.3)
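
The writer-based replacement, sketched with a placeholder output path:

    >>> df.write.parquet("/path/to/output")
    >>> df.write.mode("overwrite").parquet("/path/to/output")
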
@@ -151,69 +143,48 @@ def registerTempTable(self, name):
         """
         self._jdf.registerTempTable(name)
 
-    @since(1.3)
     def registerAsTable(self, name):
-        """DEPRECATED: use :func:`registerTempTable` instead"""
+        """
+        .. note:: Deprecated in 1.4, use :func:`registerTempTable` instead.
+        """
         warnings.warn("Use registerTempTable instead of registerAsTable.", DeprecationWarning)
         self.registerTempTable(name)
 
-    @since(1.3)
     def insertInto(self, tableName, overwrite=False):
         """Inserts the contents of this :class:`DataFrame` into the specified table.
 
-        Optionally overwriting any existing data.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.insertInto` instead.
         """
+        warnings.warn(
+            "insertInto is deprecated. Use write.insertInto() instead.", DeprecationWarning)
         self.write.insertInto(tableName, overwrite)
 
-    @since(1.3)
     def saveAsTable(self, tableName, source=None, mode="error", **options):
         """Saves the contents of this :class:`DataFrame` to a data source as a table.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Additionally, mode is used to specify the behavior of the saveAsTable operation when
-        table already exists in the data source. There are four modes:
-
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.saveAsTable` instead.
         """
+        warnings.warn(
+            "saveAsTable is deprecated. Use write.saveAsTable() instead.", DeprecationWarning)
         self.write.saveAsTable(tableName, source, mode, **options)
 
     @since(1.3)
     def save(self, path=None, source=None, mode="error", **options):
         """Saves the contents of the :class:`DataFrame` to a data source.
 
-        The data source is specified by the ``source`` and a set of ``options``.
-        If ``source`` is not specified, the default data source configured by
-        ``spark.sql.sources.default`` will be used.
-
-        Additionally, mode is used to specify the behavior of the save operation when
-        data already exists in the data source. There are four modes:
-
-        * `append`: Append contents of this :class:`DataFrame` to existing data.
-        * `overwrite`: Overwrite existing data.
-        * `error`: Throw an exception if data already exists.
-        * `ignore`: Silently ignore this operation if data already exists.
+        .. note:: Deprecated in 1.4, use :func:`DataFrameWriter.save` instead.
         """
+        warnings.warn(
+            "save is deprecated. Use write.save() instead.", DeprecationWarning)
         return self.write.save(path, source, mode, **options)
 
     @property
     @since(1.4)
     def write(self):
         """
-        Interface for saving the content of the :class:`DataFrame` out
-        into external storage.
-
-        :return :class:`DataFrameWriter`
-
-        .. note:: Experimental
+        Interface for saving the content of the :class:`DataFrame` out into external storage.
 
-        >>> df.write
-        <pyspark.sql.readwriter.DataFrameWriter object at ...>
+        :return: :class:`DataFrameWriter`
         """
         return DataFrameWriter(self)
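
For reference, rough DataFrameWriter equivalents of the methods deprecated above (table names, paths, and formats are placeholders):

    >>> df.write.insertInto("target_table", overwrite=False)
    >>> df.write.saveAsTable("people_table", format="parquet", mode="overwrite")
    >>> df.write.save("/path/to/output", format="json", mode="ignore")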
