Skip to content

Commit c285951

Browse files
committed
Reset deprecation warning.
1 parent 8c1031e commit c285951

File tree

1 file changed

+48
-5
lines changed

1 file changed

+48
-5
lines changed

python/pyspark/sql/context.py

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,31 @@ def _inferSchema(self, rdd, samplingRatio=None):
173173
return schema
174174

175175
def inferSchema(self, rdd, samplingRatio=None):
176-
"""DEPRECATED: use :func:`createDataFrame` instead"""
177-
warnings.warn("Use createDataFrame instead of inferSchema.", DeprecationWarning)
176+
"""Infer and apply a schema to an RDD of L{Row}.
177+
178+
.. note::
179+
Deprecated in 1.3, use :func:`createDataFrame` instead
180+
181+
When samplingRatio is specified, the schema is inferred by looking
182+
at the types of each row in the sampled dataset. Otherwise, the
183+
first 100 rows of the RDD are inspected. Nested collections are
184+
supported, which can include array, dict, list, Row, tuple,
185+
namedtuple, or object.
186+
187+
Each row could be L{pyspark.sql.Row} object or namedtuple or objects.
188+
Using top level dicts is deprecated, as dict is used to represent Maps.
189+
190+
If a single column has multiple distinct inferred types, it may cause
191+
runtime exceptions.
192+
193+
>>> rdd = sc.parallelize(
194+
... [Row(field1=1, field2="row1"),
195+
... Row(field1=2, field2="row2"),
196+
... Row(field1=3, field2="row3")])
197+
>>> df = sqlCtx.inferSchema(rdd)
198+
>>> df.collect()[0]
199+
Row(field1=1, field2=u'row1')
200+
"""
178201

179202
if isinstance(rdd, DataFrame):
180203
raise TypeError("Cannot apply schema to DataFrame")
@@ -185,8 +208,28 @@ def inferSchema(self, rdd, samplingRatio=None):
185208
return self.applySchema(rdd, schema)
186209

187210
def applySchema(self, rdd, schema):
188-
"""DEPRECATED: use :func:`createDataFrame` instead"""
189-
warnings.warn("Use createDataFrame instead of applySchema.", DeprecationWarning)
211+
"""
212+
Applies the given schema to the given RDD of L{tuple} or L{list}.
213+
214+
.. note::
215+
Deprecated in 1.3, use :func:`createDataFrame` instead
216+
217+
These tuples or lists can contain complex nested structures like
218+
lists, maps or nested rows.
219+
220+
The schema should be a StructType.
221+
222+
It is important that the schema matches the types of the objects
223+
in each row or exceptions could be thrown at runtime.
224+
225+
>>> from pyspark.sql.types import *
226+
>>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")])
227+
>>> schema = StructType([StructField("field1", IntegerType(), False),
228+
... StructField("field2", StringType(), False)])
229+
>>> df = sqlCtx.applySchema(rdd2, schema)
230+
>>> df.collect()
231+
[Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')]
232+
"""
190233

191234
if isinstance(rdd, DataFrame):
192235
raise TypeError("Cannot apply schema to DataFrame")
@@ -285,7 +328,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
285328
raise ValueError("cannot create an RDD from type: %s" % type(data))
286329

287330
if schema is None:
288-
return self._inferSchema(data, samplingRatio)
331+
return self.inferSchema(data, samplingRatio)
289332

290333
if isinstance(schema, (list, tuple)):
291334
first = data.first()

0 commit comments

Comments
 (0)