Skip to content

Accept multiple lambda in groupby list #26430

Closed
@TomAugspurger

Description

@TomAugspurger

We currently don't allow duplicate function names in the list passed to .groupby().agg({'col': [aggfuncs]}). This is painful with multiple lambdas, which all have the name <lambda>.

In [1]: import pandas as pd
df
In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})

In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
---------------------------------------------------------------------------
SpecificationError                        Traceback (most recent call last)
~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    483                 try:
--> 484                     result = _agg(arg, _agg_1dim)
    485                 except SpecificationError:

~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
    434                 for fname, agg_how in arg.items():
--> 435                     result[fname] = func(fname, agg_how)
    436                 return result

~/sandbox/pandas/pandas/core/base.py in _agg_1dim(name, how, subset)
    417                                              "in aggregation")
--> 418                 return colg.aggregate(how, _level=(_level or 0) + 1)
    419

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
    771             ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772                                                  (_level or 0) + 1)
    773         else:

~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
    834                     'Function names must be unique, found multiple named '
--> 835                     '{}'.format(name))
    836

SpecificationError: Function names must be unique, found multiple named <lambda>

During handling of the above exception, another exception occurred:

SpecificationError                        Traceback (most recent call last)
<ipython-input-3-2aa02bdc2edd> in <module>
----> 1 df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
   1344     @Appender(_shared_docs['aggregate'])
   1345     def aggregate(self, arg=None, *args, **kwargs):
-> 1346         return super().aggregate(arg, *args, **kwargs)
   1347
   1348     agg = aggregate

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
    174                             "'(column, aggfunc).")
    175
--> 176         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
    177         if how is None:
    178             return result

~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    487                     # we are aggregating expecting all 1d-returns
    488                     # but we have 2d
--> 489                     result = _agg(arg, _agg_2dim)
    490
    491             # combine results

~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
    433                 result = OrderedDict()
    434                 for fname, agg_how in arg.items():
--> 435                     result[fname] = func(fname, agg_how)
    436                 return result
    437

~/sandbox/pandas/pandas/core/base.py in _agg_2dim(name, how)
    424                 colg = self._gotitem(self._selection, ndim=2,
    425                                      subset=obj)
--> 426                 return colg.aggregate(how, _level=None)
    427
    428             def _agg(arg, func):

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
   1344     @Appender(_shared_docs['aggregate'])
   1345     def aggregate(self, arg=None, *args, **kwargs):
-> 1346         return super().aggregate(arg, *args, **kwargs)
   1347
   1348     agg = aggregate

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
    174                             "'(column, aggfunc).")
    175
--> 176         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
    177         if how is None:
    178             return result

~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    542             return self._aggregate_multiple_funcs(arg,
    543                                                   _level=_level,
--> 544                                                   _axis=_axis), None
    545         else:
    546             result = None

~/sandbox/pandas/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _level, _axis)
    588                     colg = self._gotitem(col, ndim=1,
    589                                          subset=obj.iloc[:, index])
--> 590                     results.append(colg.aggregate(arg))
    591                     keys.append(col)
    592                 except (TypeError, DataError):

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
    770             # but not the class list / tuple itself.
    771             ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772                                                  (_level or 0) + 1)
    773         else:
    774             cyfunc = self._is_cython_func(func_or_funcs)

~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
    833                 raise SpecificationError(
    834                     'Function names must be unique, found multiple named '
--> 835                     '{}'.format(name))
    836
    837             # reset the cache so that we

SpecificationError: Function names must be unique, found multiple named <lambda>

I propose that we mangle the names somehow:

In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})

In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
Out[3]:
         B
  <lambda> <lambda 1>
A
a        0          1

That adds a 1, 2, ... to all subsequent lambdas in the same MultiIndex (MI) level. It doesn't change the first. Do we want <lambda 0> for the first?

As a side effect, this enables multiple lambdas per column with the new keyword aggregation:

In [4]: df.groupby("A").agg(b=('B', lambda x: 0), c=('B', lambda x: 1))
Out[4]:
   b  c
A
a  0  1

I have a WIP started. Will do for 0.25.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions