1717 Timestamp ,
1818 lib ,
1919)
20- from pandas ._libs .lib import infer_dtype
2120
2221from pandas .core .dtypes .common import (
23- DT64NS_DTYPE ,
2422 ensure_platform_int ,
2523 is_bool_dtype ,
2624 is_integer ,
@@ -243,7 +241,7 @@ def cut(
243241
244242 original = x
245243 x_idx = _preprocess_for_cut (x )
246- x_idx , dtype = _coerce_to_type (x_idx )
244+ x_idx , _ = _coerce_to_type (x_idx )
247245
248246 if not np .iterable (bins ):
249247 bins = _nbins_to_bins (x_idx , bins , right )
@@ -254,16 +252,8 @@ def cut(
254252
255253 else :
256254 bins = Index (bins )
257- if isinstance (getattr (bins , "dtype" , None ), DatetimeTZDtype ):
258- bins = np .asarray (bins , dtype = DT64NS_DTYPE )
259- else :
260- bins = np .asarray (bins )
261- bins = _convert_bin_to_numeric_type (bins , dtype )
262-
263- # GH 26045: cast to float64 to avoid an overflow
264- if (np .diff (bins .astype ("float64" )) < 0 ).any ():
255+ if not bins .is_monotonic_increasing :
265256 raise ValueError ("bins must increase monotonically." )
266- bins = Index (bins )
267257
268258 fac , bins = _bins_to_cuts (
269259 x_idx ,
@@ -272,12 +262,11 @@ def cut(
272262 labels = labels ,
273263 precision = precision ,
274264 include_lowest = include_lowest ,
275- dtype = dtype ,
276265 duplicates = duplicates ,
277266 ordered = ordered ,
278267 )
279268
280- return _postprocess_for_cut (fac , bins , retbins , dtype , original )
269+ return _postprocess_for_cut (fac , bins , retbins , original )
281270
282271
283272def qcut (
@@ -343,25 +332,22 @@ def qcut(
343332 """
344333 original = x
345334 x_idx = _preprocess_for_cut (x )
346- x_idx , dtype = _coerce_to_type (x_idx )
335+ x_idx , _ = _coerce_to_type (x_idx )
347336
348337 quantiles = np .linspace (0 , 1 , q + 1 ) if is_integer (q ) else q
349338
350- x_np = np .asarray (x_idx )
351- x_np = x_np [~ np .isnan (x_np )]
352- bins = np .quantile (x_np , quantiles )
339+ bins = x_idx .to_series ().dropna ().quantile (quantiles )
353340
354341 fac , bins = _bins_to_cuts (
355342 x_idx ,
356343 Index (bins ),
357344 labels = labels ,
358345 precision = precision ,
359346 include_lowest = True ,
360- dtype = dtype ,
361347 duplicates = duplicates ,
362348 )
363349
364- return _postprocess_for_cut (fac , bins , retbins , dtype , original )
350+ return _postprocess_for_cut (fac , bins , retbins , original )
365351
366352
367353def _nbins_to_bins (x_idx : Index , nbins : int , right : bool ) -> Index :
@@ -378,18 +364,41 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
378364 rng = (x_idx .min (), x_idx .max ())
379365 mn , mx = rng
380366
381- if np .isinf (mn ) or np .isinf (mx ):
367+ is_dt_or_td = lib .is_np_dtype (x_idx .dtype , "mM" ) or isinstance (
368+ x_idx .dtype , DatetimeTZDtype
369+ )
370+
371+ if is_numeric_dtype (x_idx .dtype ) and (np .isinf (mn ) or np .isinf (mx )):
382372 # GH#24314
383373 raise ValueError (
384374 "cannot specify integer `bins` when input data contains infinity"
385375 )
386376
387377 if mn == mx : # adjust end points before binning
388- mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
389- mx += 0.001 * abs (mx ) if mx != 0 else 0.001
390- bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
378+ if is_dt_or_td :
379+ # using seconds=1 is pretty arbitrary here
380+ td = Timedelta (seconds = 1 )
381+ # Use DatetimeArray/TimedeltaArray method instead of linspace
382+ # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
383+ # has no attribute "_generate_range"
384+ bins = x_idx ._values ._generate_range ( # type: ignore[union-attr]
385+ start = mn - td , end = mx + td , periods = nbins + 1 , freq = None
386+ )
387+ else :
388+ mn -= 0.001 * abs (mn ) if mn != 0 else 0.001
389+ mx += 0.001 * abs (mx ) if mx != 0 else 0.001
390+
391+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
391392 else : # adjust end points after binning
392- bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
393+ if is_dt_or_td :
394+ # Use DatetimeArray/TimedeltaArray method instead of linspace
395+ # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]"
396+ # has no attribute "_generate_range"
397+ bins = x_idx ._values ._generate_range ( # type: ignore[union-attr]
398+ start = mn , end = mx , periods = nbins + 1 , freq = None
399+ )
400+ else :
401+ bins = np .linspace (mn , mx , nbins + 1 , endpoint = True )
393402 adj = (mx - mn ) * 0.001 # 0.1% of the range
394403 if right :
395404 bins [0 ] -= adj
@@ -400,13 +409,12 @@ def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index:
400409
401410
402411def _bins_to_cuts (
403- x : Index ,
412+ x_idx : Index ,
404413 bins : Index ,
405414 right : bool = True ,
406415 labels = None ,
407416 precision : int = 3 ,
408417 include_lowest : bool = False ,
409- dtype : DtypeObj | None = None ,
410418 duplicates : str = "raise" ,
411419 ordered : bool = True ,
412420):
@@ -422,7 +430,7 @@ def _bins_to_cuts(
422430
423431 if isinstance (bins , IntervalIndex ):
424432 # we have a fast-path here
425- ids = bins .get_indexer (x )
433+ ids = bins .get_indexer (x_idx )
426434 cat_dtype = CategoricalDtype (bins , ordered = True )
427435 result = Categorical .from_codes (ids , dtype = cat_dtype , validate = False )
428436 return result , bins
@@ -437,12 +445,29 @@ def _bins_to_cuts(
437445 bins = unique_bins
438446
439447 side : Literal ["left" , "right" ] = "left" if right else "right"
440- ids = ensure_platform_int (bins .searchsorted (x , side = side ))
448+
449+ try :
450+ ids = bins .searchsorted (x_idx , side = side )
451+ except TypeError as err :
452+ # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx
453+ # is integers
454+ if x_idx .dtype .kind == "m" :
455+ raise ValueError ("bins must be of timedelta64 dtype" ) from err
456+ elif x_idx .dtype .kind == bins .dtype .kind == "M" :
457+ raise ValueError (
458+ "Cannot use timezone-naive bins with timezone-aware values, "
459+ "or vice-versa"
460+ ) from err
461+ elif x_idx .dtype .kind == "M" :
462+ raise ValueError ("bins must be of datetime64 dtype" ) from err
463+ else :
464+ raise
465+ ids = ensure_platform_int (ids )
441466
442467 if include_lowest :
443- ids [np . asarray ( x ) == bins [0 ]] = 1
468+ ids [x_idx == bins [0 ]] = 1
444469
445- na_mask = isna (x ) | (ids == len (bins )) | (ids == 0 )
470+ na_mask = isna (x_idx ) | (ids == len (bins )) | (ids == 0 )
446471 has_nas = na_mask .any ()
447472
448473 if labels is not False :
@@ -454,7 +479,7 @@ def _bins_to_cuts(
454479
455480 if labels is None :
456481 labels = _format_labels (
457- bins , precision , right = right , include_lowest = include_lowest , dtype = dtype
482+ bins , precision , right = right , include_lowest = include_lowest
458483 )
459484 elif ordered and len (set (labels )) != len (labels ):
460485 raise ValueError (
@@ -513,90 +538,28 @@ def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]:
513538 x_arr = x .to_numpy (dtype = np .float64 , na_value = np .nan )
514539 x = Index (x_arr )
515540
516- if dtype is not None :
517- # GH 19768: force NaT to NaN during integer conversion
518- x_arr = np .where (x .notna (), x .view (np .int64 ), np .nan )
519- x = Index (x_arr )
520-
521- return x , dtype
522-
523-
524- def _convert_bin_to_numeric_type (bins , dtype : DtypeObj | None ):
525- """
526- if the passed bin is of datetime/timedelta type,
527- this method converts it to integer
528-
529- Parameters
530- ----------
531- bins : list-like of bins
532- dtype : dtype of data
533-
534- Raises
535- ------
536- ValueError if bins are not of a compat dtype to dtype
537- """
538- bins_dtype = infer_dtype (bins , skipna = False )
539- if lib .is_np_dtype (dtype , "m" ):
540- if bins_dtype in ["timedelta" , "timedelta64" ]:
541- bins = to_timedelta (bins ).view (np .int64 )
542- else :
543- raise ValueError ("bins must be of timedelta64 dtype" )
544- elif lib .is_np_dtype (dtype , "M" ) or isinstance (dtype , DatetimeTZDtype ):
545- if bins_dtype in ["datetime" , "datetime64" ]:
546- bins = to_datetime (bins )
547- if lib .is_np_dtype (bins .dtype , "M" ):
548- # As of 2.0, to_datetime may give non-nano, so we need to convert
549- # here until the rest of this file recognizes non-nano
550- bins = bins .astype ("datetime64[ns]" , copy = False )
551- bins = bins .view (np .int64 )
552- else :
553- raise ValueError ("bins must be of datetime64 dtype" )
554-
555- return bins
556-
557-
558- def _convert_bin_to_datelike_type (bins , dtype : DtypeObj | None ):
559- """
560- Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
561- datelike
562-
563- Parameters
564- ----------
565- bins : list-like of bins
566- dtype : dtype of data
567-
568- Returns
569- -------
570- bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
571- datelike
572- """
573- if isinstance (dtype , DatetimeTZDtype ):
574- bins = to_datetime (bins .astype (np .int64 ), utc = True ).tz_convert (dtype .tz )
575- elif lib .is_np_dtype (dtype , "mM" ):
576- bins = Index (bins .astype (np .int64 ), dtype = dtype )
577- return bins
541+ return Index (x ), dtype
578542
579543
580544def _format_labels (
581545 bins : Index ,
582546 precision : int ,
583547 right : bool = True ,
584548 include_lowest : bool = False ,
585- dtype : DtypeObj | None = None ,
586549):
587550 """based on the dtype, return our labels"""
588551 closed : IntervalLeftRight = "right" if right else "left"
589552
590553 formatter : Callable [[Any ], Timestamp ] | Callable [[Any ], Timedelta ]
591554
592- if isinstance (dtype , DatetimeTZDtype ):
593- formatter = lambda x : Timestamp ( x , tz = dtype . tz )
555+ if isinstance (bins . dtype , DatetimeTZDtype ):
556+ formatter = lambda x : x
594557 adjust = lambda x : x - Timedelta ("1ns" )
595- elif lib .is_np_dtype (dtype , "M" ):
596- formatter = Timestamp
558+ elif lib .is_np_dtype (bins . dtype , "M" ):
559+ formatter = lambda x : x
597560 adjust = lambda x : x - Timedelta ("1ns" )
598- elif lib .is_np_dtype (dtype , "m" ):
599- formatter = Timedelta
561+ elif lib .is_np_dtype (bins . dtype , "m" ):
562+ formatter = lambda x : x
600563 adjust = lambda x : x - Timedelta ("1ns" )
601564 else :
602565 precision = _infer_precision (precision , bins )
@@ -628,7 +591,7 @@ def _preprocess_for_cut(x) -> Index:
628591 return Index (x )
629592
630593
631- def _postprocess_for_cut (fac , bins , retbins : bool , dtype : DtypeObj | None , original ):
594+ def _postprocess_for_cut (fac , bins , retbins : bool , original ):
632595 """
633596 handles post processing for the cut method where
634597 we combine the index information if the originally passed
@@ -640,7 +603,6 @@ def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, origi
640603 if not retbins :
641604 return fac
642605
643- bins = _convert_bin_to_datelike_type (bins , dtype )
644606 if isinstance (bins , Index ) and is_numeric_dtype (bins .dtype ):
645607 bins = bins ._values
646608
0 commit comments