Skip to content

Aggregate fails with mixed types in grouping series #16916

Closed
@nreeve17

Description

@nreeve17

Code Sample, a copy-pastable example if possible

X = pd.DataFrame(data=np.random.rand(7, 3), columns=list('XYZ'), index=list('zxcvbnm'))
X['grouping'] = ['group 1', 'group 1', 'group 1', 2, 2 , 2, 'group 1']
X.groupby('grouping').aggregate(lambda x: x.tolist())

This is the exception and traceback that the code above returns:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   3482                     result = self._aggregate_multiple_funcs(
-> 3483                         [arg], _level=_level, _axis=self.axis)
   3484                     result.columns = Index(

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _level, _axis)
    690         if not len(results):
--> 691             raise ValueError("no results")
    692 

ValueError: no results

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in _aggregate_generic(self, func, *args, **kwargs)
   3508                 for name, data in self:
-> 3509                     result[name] = self._try_cast(func(data, *args, **kwargs),
   3510                                                   data)

<ipython-input-25-18b24604e98f> in <lambda>(x)
      2 X['grouping'] = ['group 1', 'group 1', 'group 1', 2, 2 , 2, 'group 1']
----> 3 X.groupby('grouping').aggregate(lambda x: x.tolist())

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/generic.py in __getattr__(self, name)
   3080                 return self[name]
-> 3081             return object.__getattribute__(self, name)
   3082 

AttributeError: 'DataFrame' object has no attribute 'tolist'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-25-18b24604e98f> in <module>()
      1 X = pd.DataFrame(data=np.random.rand(7, 3), columns=list('XYZ'), index=list('zxcvbnm'))
      2 X['grouping'] = ['group 1', 'group 1', 'group 1', 2, 2 , 2, 'group 1']
----> 3 X.groupby('grouping').aggregate(lambda x: x.tolist())

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   4034         versionadded=''))
   4035     def aggregate(self, arg, *args, **kwargs):
-> 4036         return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)
   4037 
   4038     agg = aggregate

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in aggregate(self, arg, *args, **kwargs)
   3486                         name=self._selected_obj.columns.name)
   3487                 except:
-> 3488                     result = self._aggregate_generic(arg, *args, **kwargs)
   3489 
   3490         if not self.as_index:

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in _aggregate_generic(self, func, *args, **kwargs)
   3510                                                   data)
   3511             except Exception:
-> 3512                 return self._aggregate_item_by_item(func, *args, **kwargs)
   3513         else:
   3514             for name in self.indices:

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in _aggregate_item_by_item(self, func, *args, **kwargs)
   3554             # GH6337
   3555             if not len(result_columns) and errors is not None:
-> 3556                 raise errors
   3557 
   3558         return DataFrame(result, columns=result_columns)

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in _aggregate_item_by_item(self, func, *args, **kwargs)
   3539                                      grouper=self.grouper)
   3540                 result[item] = self._try_cast(
-> 3541                     colg.aggregate(func, *args, **kwargs), data)
   3542             except ValueError:
   3543                 cannot_agg.append(item)

/Users/nicolaireeve/miniconda2/envs/skbiodev/lib/python3.4/site-packages/pandas/core/groupby.py in aggregate(self, func_or_funcs, *args, **kwargs)
   2885                 result = self._aggregate_named(func_or_funcs, *args, **kwargs)
   2886 
-> 2887             index = Index(sorted(result), name=self.grouper.names[0])
   2888             ret = Series(result, index=index)
   2889 

TypeError: unorderable types: str() < int()

Problem description

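If the grouping column contains mixed types (here str and int) and aggregate is called after groupby(...), an exception is raised. Execution reaches the line index = Index(sorted(result), name=self.grouper.names[0]) (groupby.py, line 2887 in the traceback above) and fails there because sorted() cannot order values of mixed types on Python 3.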

Expected Output

This is what we would expect to see if the exception were not raised. This output was obtained by grouping on a column that contains a single type: in this instance, the integer 2 was replaced with the string '2'.

X = pd.DataFrame(data=np.random.rand(7, 3), columns=list('XYZ'), index=list('zxcvbnm'))
X['grouping'] = ['group 1', 'group 1', 'group 1', '2', '2' , '2', 'group 1']
X.groupby('grouping').aggregate(lambda x: x.tolist())

                                                          X  \
grouping                                                      
2         [0.9219120799240533, 0.6439069401684864, 0.035...   
group 1   [0.6884732212797477, 0.326906484996646, 0.6718...   

                                                          Y  \
grouping                                                      
2         [0.7796923828539405, 0.7668459596180287, 0.868...   
group 1   [0.20259205506065203, 0.9138593138141587, 0.95...   

                                                          Z  
grouping                                                     
2         [0.9863526134877422, 0.6342347501171951, 0.873...  
group 1   [0.054465751087565906, 0.9026560581041934, 0.9...  

Output of pd.show_versions()

# Paste the output of pd.show_versions() here
INSTALLED VERSIONS
------------------
commit: None
python: 3.4.5.final.0
python-bits: 64
OS: Darwin
OS-release: 16.6.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.20.3
pytest: None
pip: 9.0.1
setuptools: 35.0.2
Cython: None
numpy: 1.13.1
scipy: 0.19.0
xarray: None
IPython: 6.0.0
sphinx: None
patsy: None
dateutil: 2.6.1
pytz: 2017.2
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.0.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: None
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: None
s3fs: None
pandas_gbq: None
pandas_datareader: None

cc @ElDeveloper

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions