@@ -421,24 +421,39 @@ def str_extract(arr, pat, flags=0):
421
421
Pattern or regular expression
422
422
flags : int, default 0 (no flags)
423
423
re module flags, e.g. re.IGNORECASE
424
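+ expand : bool or None, default None (legacy behavior: Series/Index for one group, with a warning; DataFrame for several groups)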
425
+ * If True, return DataFrame/MultiIndex expanding dimensionality.
426
+ * If False, return Series/Index.
424
427
425
428
Returns
426
429
-------
427
- extracted groups : Series (one group) or DataFrame (multiple groups)
430
+ extracted groups : Deprecated: Series (one group) or DataFrame (multiple groups)
428
431
Note that dtype of the result is always object, even when no match is
429
432
found and the result is a Series or DataFrame containing only NaN
430
433
values.
431
434
435
+ In a future version this will return a Series/Index or a DataFrame/MultiIndex
436
+ of objects, as selected by the ``expand`` option.
437
+
432
438
Examples
433
439
--------
434
- A pattern with one group will return a Series. Non-matches will be NaN.
440
+ Deprecated: A pattern with one group returns a Series. Non-matches will be NaN.
441
+ In a future version this will return a DataFrame by default.
435
442
436
443
>>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
437
444
0 1
438
445
1 2
439
446
2 NaN
440
447
dtype: object
441
448
449
+ Specify ``expand=False`` to return Series.
450
+
451
+ >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)', expand=False)
452
+ 0 1
453
+ 1 2
454
+ 2 NaN
455
+ dtype: object
456
+
442
457
A pattern with more than one group will return a DataFrame.
443
458
444
459
>>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
@@ -462,12 +477,7 @@ def str_extract(arr, pat, flags=0):
462
477
0 a 1
463
478
1 b 2
464
479
2 NaN NaN
465
-
466
480
"""
467
- from pandas .core .series import Series
468
- from pandas .core .frame import DataFrame
469
- from pandas .core .index import Index
470
-
471
481
regex = re .compile (pat , flags = flags )
472
482
# just to be safe, check this
473
483
if regex .groups == 0 :
@@ -487,18 +497,9 @@ def f(x):
487
497
result = np .array ([f (val )[0 ] for val in arr ], dtype = object )
488
498
name = _get_single_group_name (regex )
489
499
else :
490
- if isinstance (arr , Index ):
491
- raise ValueError ("only one regex group is supported with Index" )
492
- name = None
493
500
names = dict (zip (regex .groupindex .values (), regex .groupindex .keys ()))
494
- columns = [names .get (1 + i , i ) for i in range (regex .groups )]
495
- if arr .empty :
496
- result = DataFrame (columns = columns , dtype = object )
497
- else :
498
- result = DataFrame ([f (val ) for val in arr ],
499
- columns = columns ,
500
- index = arr .index ,
501
- dtype = object )
501
+ name = [names .get (1 + i , i ) for i in range (regex .groups )]
502
+ result = np .array ([f (val ) for val in arr ], dtype = object )
502
503
return result , name
503
504
504
505
@@ -511,6 +512,9 @@ def str_get_dummies(arr, sep='|'):
511
512
----------
512
513
sep : string, default "|"
513
514
String to split on.
515
+ expand : bool, default True
516
+ * If True, return DataFrame/MultiIndex expanding dimensionality.
517
+ * If False, return Series/Index.
514
518
515
519
Returns
516
520
-------
@@ -534,15 +538,15 @@ def str_get_dummies(arr, sep='|'):
534
538
--------
535
539
pandas.get_dummies
536
540
"""
537
- from pandas .core .frame import DataFrame
538
541
from pandas .core .index import Index
539
-
540
- # GH9980, Index.str does not support get_dummies() as it returns a frame
542
+ # TODO: Add fillna GH 10089
541
543
if isinstance (arr , Index ):
542
- raise TypeError ("get_dummies is not supported for string methods on Index" )
543
-
544
- # TODO remove this hack?
545
- arr = arr .fillna ('' )
544
+ # temp hack
545
+ values = arr .values
546
+ values [isnull (values )] = ''
547
+ arr = Index (values )
548
+ else :
549
+ arr = arr .fillna ('' )
546
550
try :
547
551
arr = sep + arr + sep
548
552
except TypeError :
@@ -558,7 +562,7 @@ def str_get_dummies(arr, sep='|'):
558
562
for i , t in enumerate (tags ):
559
563
pat = sep + t + sep
560
564
dummies [:, i ] = lib .map_infer (arr .values , lambda x : pat in x )
561
- return DataFrame ( dummies , arr . index , tags )
565
+ return dummies , tags
562
566
563
567
564
568
def str_join (arr , sep ):
@@ -1043,40 +1047,19 @@ def __iter__(self):
1043
1047
i += 1
1044
1048
g = self .get (i )
1045
1049
1046
- def _wrap_result (self , result , ** kwargs ):
1047
-
1048
- # leave as it is to keep extract and get_dummies results
1049
- # can be merged to _wrap_result_expand in v0.17
1050
- from pandas .core .series import Series
1051
- from pandas .core .frame import DataFrame
1052
- from pandas .core .index import Index
1053
-
1054
- if not hasattr (result , 'ndim' ):
1055
- return result
1056
- name = kwargs .get ('name' ) or getattr (result , 'name' , None ) or self .series .name
1057
-
1058
- if result .ndim == 1 :
1059
- if isinstance (self .series , Index ):
1060
- # if result is a boolean np.array, return the np.array
1061
- # instead of wrapping it into a boolean Index (GH 8875)
1062
- if is_bool_dtype (result ):
1063
- return result
1064
- return Index (result , name = name )
1065
- return Series (result , index = self .series .index , name = name )
1066
- else :
1067
- assert result .ndim < 3
1068
- return DataFrame (result , index = self .series .index )
1050
+ def _wrap_result (self , result , expand = False , name = None ):
1051
+ from pandas .core .index import Index , MultiIndex
1069
1052
1070
- def _wrap_result_expand (self , result , expand = False ):
1071
1053
if not isinstance (expand , bool ):
1072
1054
raise ValueError ("expand must be True or False" )
1073
1055
1074
- from pandas .core .index import Index , MultiIndex
1056
+ if name is None :
1057
+ name = getattr (result , 'name' , None ) or self .series .name
1058
+
1075
1059
if not hasattr (result , 'ndim' ):
1076
1060
return result
1077
1061
1078
1062
if isinstance (self .series , Index ):
1079
- name = getattr (result , 'name' , None )
1080
1063
# if result is a boolean np.array, return the np.array
1081
1064
# instead of wrapping it into a boolean Index (GH 8875)
1082
1065
if hasattr (result , 'dtype' ) and is_bool_dtype (result ):
@@ -1086,16 +1069,19 @@ def _wrap_result_expand(self, result, expand=False):
1086
1069
result = list (result )
1087
1070
return MultiIndex .from_tuples (result , names = name )
1088
1071
else :
1072
+ print (type (name ), name )
1089
1073
return Index (result , name = name )
1090
1074
else :
1091
1075
index = self .series .index
1092
1076
if expand :
1093
1077
cons_row = self .series ._constructor
1094
1078
cons = self .series ._constructor_expanddim
1095
- data = [cons_row (x ) for x in result ]
1096
- return cons (data , index = index )
1079
+ data = [cons_row (x , index = name ) for x in result ]
1080
+ return cons (data , index = index , columns = name ,
1081
+ dtype = result .dtype )
1097
1082
else :
1098
- name = getattr (result , 'name' , None )
1083
+ if result .ndim > 1 :
1084
+ result = list (result )
1099
1085
cons = self .series ._constructor
1100
1086
return cons (result , name = name , index = index )
1101
1087
@@ -1109,7 +1095,7 @@ def cat(self, others=None, sep=None, na_rep=None):
1109
1095
@copy(str_split)
def split(self, pat=None, n=-1, expand=False):
    # The user-facing docstring comes from ``str_split`` via ``@copy``,
    # so none is written here.
    # Split the underlying values, then let ``_wrap_result`` build the
    # return object according to ``expand``.
    return self._wrap_result(str_split(self.series, pat, n=n),
                             expand=expand)
1113
1099
1114
1100
_shared_docs ['str_partition' ] = ("""
1115
1101
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1160,15 +1146,15 @@ def split(self, pat=None, n=-1, expand=False):
1160
1146
def partition(self, pat=' ', expand=True):
    # Hand ``str.partition(pat)`` to ``_na_map`` as the per-value callback
    # for ``self.series``, then wrap the raw result according to ``expand``.
    parts = _na_map(lambda x: x.partition(pat), self.series)
    return self._wrap_result(parts, expand=expand)
1164
1150
1165
1151
@Appender(_shared_docs['str_partition'] % {'side': 'last',
    'return': '3 elements containing two empty strings, followed by the string itself',
    'also': 'partition : Split the string at the first occurrence of `sep`'})
def rpartition(self, pat=' ', expand=True):
    # Hand ``str.rpartition(pat)`` to ``_na_map`` as the per-value callback
    # for ``self.series``, then wrap the raw result according to ``expand``.
    parts = _na_map(lambda x: x.rpartition(pat), self.series)
    return self._wrap_result(parts, expand=expand)
1172
1158
1173
1159
@copy (str_get )
1174
1160
def get (self , i ):
@@ -1309,9 +1295,9 @@ def wrap(self, width, **kwargs):
1309
1295
return self ._wrap_result (result )
1310
1296
1311
1297
@copy(str_get_dummies)
def get_dummies(self, sep='|', expand=True):
    # The user-facing docstring comes from ``str_get_dummies`` via ``@copy``.
    # ``str_get_dummies`` returns the dummy matrix together with the tag
    # labels; the labels are passed on as ``name`` for the wrapped result.
    dummies, tags = str_get_dummies(self.series, sep)
    return self._wrap_result(dummies, expand=expand, name=tags)
1315
1301
1316
1302
@copy (str_translate )
1317
1303
def translate (self , table , deletechars = None ):
@@ -1324,9 +1310,26 @@ def translate(self, table, deletechars=None):
1324
1310
findall = _pat_wrapper (str_findall , flags = True )
1325
1311
1326
1312
@copy(str_extract)
def extract(self, pat, flags=0, expand=None):
    # The user-facing docstring comes from ``str_extract`` via ``@copy``.
    result, name = str_extract(self.series, pat, flags=flags)

    if expand is None and hasattr(result, 'ndim'):
        # ``expand`` was not given: stay compatible with the previous
        # behavior, where the return type followed the number of groups.
        if len(result) == 0:
            # Empty input: decide from ``name``, which str_extract builds
            # as a list in its multi-group branch.
            expand = isinstance(name, list)
        else:
            # Non-empty input: a result with more than one dimension is
            # expanded.
            expand = result.ndim > 1

        if not expand:
            # Unexpanded legacy case: this is the only path that warns.
            msg = ("Extracting with single group returns DataFrame in future version. "
                   "Specify expand=False to return Series.")
            # stacklevel=2 attributes the warning to the caller of
            # ``extract`` rather than to this module.
            warnings.warn(msg, UserWarning, stacklevel=2)
    return self._wrap_result(result, name=name, expand=expand)
1330
1333
1331
1334
_shared_docs ['find' ] = ("""
1332
1335
Return %(side)s indexes in each strings in the Series/Index
0 commit comments