@@ -173,8 +173,31 @@ def _inferSchema(self, rdd, samplingRatio=None):
         return schema
 
     def inferSchema(self, rdd, samplingRatio=None):
-        """DEPRECATED: use :func:`createDataFrame` instead"""
-        warnings.warn("Use createDataFrame instead of inferSchema.", DeprecationWarning)
+        """Infer and apply a schema to an RDD of L{Row}.
+
+        .. note::
+            Deprecated in 1.3, use :func:`createDataFrame` instead.
+
+        When samplingRatio is specified, the schema is inferred by looking
+        at the types of each row in the sampled dataset. Otherwise, the
+        first 100 rows of the RDD are inspected. Nested collections are
+        supported, and can include array, dict, list, Row, tuple,
+        namedtuple, or object.
+
+        Each row could be a L{pyspark.sql.Row} object, a namedtuple, or a plain object.
+        Using top-level dicts is deprecated, as dict is used to represent Maps.
+
+        If a single column has multiple distinct inferred types, it may cause
+        runtime exceptions.
+
+        >>> rdd = sc.parallelize(
+        ...     [Row(field1=1, field2="row1"),
+        ...      Row(field1=2, field2="row2"),
+        ...      Row(field1=3, field2="row3")])
+        >>> df = sqlCtx.inferSchema(rdd)
+        >>> df.collect()[0]
+        Row(field1=1, field2=u'row1')
+        """
 
         if isinstance(rdd, DataFrame):
             raise TypeError("Cannot apply schema to DataFrame")
@@ -185,8 +208,28 @@ def inferSchema(self, rdd, samplingRatio=None):
         return self.applySchema(rdd, schema)
 
     def applySchema(self, rdd, schema):
-        """DEPRECATED: use :func:`createDataFrame` instead"""
-        warnings.warn("Use createDataFrame instead of applySchema.", DeprecationWarning)
+        """
+        Applies the given schema to the given RDD of L{tuple} or L{list}.
+
+        .. note::
+            Deprecated in 1.3, use :func:`createDataFrame` instead.
+
+        These tuples or lists can contain complex nested structures like
+        lists, maps, or nested rows.
+
+        The schema should be a StructType.
+
+        It is important that the schema matches the types of the objects
+        in each row; otherwise exceptions may be thrown at runtime.
+
+        >>> from pyspark.sql.types import *
+        >>> rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")])
+        >>> schema = StructType([StructField("field1", IntegerType(), False),
+        ...     StructField("field2", StringType(), False)])
+        >>> df = sqlCtx.applySchema(rdd2, schema)
+        >>> df.collect()
+        [Row(field1=1, field2=u'row1'),..., Row(field1=3, field2=u'row3')]
+        """
 
         if isinstance(rdd, DataFrame):
             raise TypeError("Cannot apply schema to DataFrame")
@@ -285,7 +328,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):
             raise ValueError("cannot create an RDD from type: %s" % type(data))
 
         if schema is None:
-            return self._inferSchema(data, samplingRatio)
+            return self.inferSchema(data, samplingRatio)
 
         if isinstance(schema, (list, tuple)):
             first = data.first()
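For migration context: a minimal, hypothetical sketch of how caller code would move from the deprecated inferSchema/applySchema to createDataFrame, reusing the data from the doctests above. The local SparkContext setup and app name are assumptions for illustration, not part of this change; it presumes Spark 1.3+, where createDataFrame accepts either an optional samplingRatio or an explicit StructType.

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType

    sc = SparkContext("local", "migration-sketch")  # assumed local setup; app name is illustrative
    sqlCtx = SQLContext(sc)

    # Inferred schema: replaces sqlCtx.inferSchema(rdd, samplingRatio)
    rdd = sc.parallelize([Row(field1=1, field2="row1"),
                          Row(field1=2, field2="row2"),
                          Row(field1=3, field2="row3")])
    df = sqlCtx.createDataFrame(rdd)

    # Explicit schema: replaces sqlCtx.applySchema(rdd2, schema)
    rdd2 = sc.parallelize([(1, "row1"), (2, "row2"), (3, "row3")])
    schema = StructType([StructField("field1", IntegerType(), False),
                         StructField("field2", StringType(), False)])
    df2 = sqlCtx.createDataFrame(rdd2, schema)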