@@ -317,33 +317,26 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable)
 
 
 @inherit_doc
-class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols,
-                 HasHandleInvalid, JavaMLReadable, JavaMLWritable):
-    """
-    Maps a column of continuous features to a column of feature buckets. Since 2.3.0,
-    :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`
-    parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
-    are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single
-    column usage, and :py:attr:`splitsArray` is for multiple columns.
-
-    >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
-    ...     (float("nan"), 1.0), (float("nan"), 0.0)]
-    >>> df = spark.createDataFrame(values, ["values1", "values2"])
+class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid,
+                 JavaMLReadable, JavaMLWritable):
+    """
+    Maps a column of continuous features to a column of feature buckets.
+
+    >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
+    >>> df = spark.createDataFrame(values, ["values"])
     >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
-    ...     inputCol="values1", outputCol="buckets")
-    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1"))
-    >>> bucketed.show(truncate=False)
-    +-------+-------+
-    |values1|buckets|
-    +-------+-------+
-    |0.1    |0.0    |
-    |0.4    |0.0    |
-    |1.2    |1.0    |
-    |1.5    |2.0    |
-    |NaN    |3.0    |
-    |NaN    |3.0    |
-    +-------+-------+
-    ...
+    ...     inputCol="values", outputCol="buckets")
+    >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()
+    >>> len(bucketed)
+    6
+    >>> bucketed[0].buckets
+    0.0
+    >>> bucketed[1].buckets
+    0.0
+    >>> bucketed[2].buckets
+    1.0
+    >>> bucketed[3].buckets
+    2.0
     >>> bucketizer.setParams(outputCol="b").transform(df).head().b
     0.0
     >>> bucketizerPath = temp_path + "/bucketizer"
@@ -354,22 +347,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
     >>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect()
     >>> len(bucketed)
     4
-    >>> bucketizer2 = Bucketizer(splitsArray=
-    ...     [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]],
-    ...     inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"])
-    >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df)
-    >>> bucketed2.show(truncate=False)
-    +-------+-------+--------+--------+
-    |values1|values2|buckets1|buckets2|
-    +-------+-------+--------+--------+
-    |0.1    |0.0    |0.0     |0.0     |
-    |0.4    |1.0    |0.0     |1.0     |
-    |1.2    |1.3    |1.0     |1.0     |
-    |1.5    |NaN    |2.0     |2.0     |
-    |NaN    |1.0    |3.0     |1.0     |
-    |NaN    |0.0    |3.0     |0.0     |
-    +-------+-------+--------+--------+
-    ...
 
     .. versionadded:: 1.4.0
     """
@@ -386,30 +363,14 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
 
     handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
                           "Options are 'skip' (filter out rows with invalid values), " +
-                          "'error' (throw an error), or 'keep' (keep invalid values in a " +
-                          "special additional bucket). Note that in the multiple column " +
-                          "case, the invalid handling is applied to all columns. That said " +
-                          "for 'error' it will throw an error if any invalids are found in " +
-                          "any column, for 'skip' it will skip rows with any invalids in " +
-                          "any columns, etc.",
+                          "'error' (throw an error), or 'keep' (keep invalid values in a special " +
+                          "additional bucket).",
                           typeConverter=TypeConverters.toString)
 
-    splitsArray = Param(Params._dummy(), "splitsArray", "The array of split points for mapping " +
-                        "continuous features into buckets for multiple columns. For each input " +
-                        "column, with n+1 splits, there are n buckets. A bucket defined by " +
-                        "splits x,y holds values in the range [x,y) except the last bucket, " +
-                        "which also includes y. The splits should be of length >= 3 and " +
-                        "strictly increasing. Values at -inf, inf must be explicitly provided " +
-                        "to cover all Double values; otherwise, values outside the splits " +
-                        "specified will be treated as errors.",
-                        typeConverter=TypeConverters.toListListFloat)
-
     @keyword_only
-    def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
-                 splitsArray=None, inputCols=None, outputCols=None):
+    def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
         """
-        __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
-                 splitsArray=None, inputCols=None, outputCols=None)
+        __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
         """
         super(Bucketizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
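
The trimmed handleInvalid description still names the three modes. A short sketch of their effect on a column containing NaN, under the same `spark` assumption: "keep" routes invalid values to one extra bucket with index n (here 3.0, matching the doctest), "skip" drops those rows, and "error" (the default) raises instead.

# handleInvalid modes sketch; assumes `spark` as above.
from pyspark.ml.feature import Bucketizer

df = spark.createDataFrame([(0.2,), (float("nan"),)], ["values"])
b = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
               inputCol="values", outputCol="buckets")
b.setHandleInvalid("keep").transform(df).count()    # 2: NaN kept as bucket 3.0
b.setHandleInvalid("skip").transform(df).count()    # 1: NaN row filtered out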
@@ -419,11 +380,9 @@ def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="er
 
     @keyword_only
     @since("1.4.0")
-    def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
-                  splitsArray=None, inputCols=None, outputCols=None):
+    def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
         """
-        setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
-                  splitsArray=None, inputCols=None, outputCols=None)
+        setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
         Sets params for this Bucketizer.
         """
         kwargs = self._input_kwargs
@@ -443,20 +402,6 @@ def getSplits(self):
         """
         return self.getOrDefault(self.splits)
 
-    @since("2.3.0")
-    def setSplitsArray(self, value):
-        """
-        Sets the value of :py:attr:`splitsArray`.
-        """
-        return self._set(splitsArray=value)
-
-    @since("2.3.0")
-    def getSplitsArray(self):
-        """
-        Gets the array of split points or its default value.
-        """
-        return self.getOrDefault(self.splitsArray)
-
 
 @inherit_doc
 class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):