Closed
Description
We currently don't allow duplicate function names in the list passed to `.groupby().agg({'col': [aggfuncs]})`. This is painful with multiple lambdas, which all have the name `<lambda>`:
In [1]: import pandas as pd
df
In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})
In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
483 try:
--> 484 result = _agg(arg, _agg_1dim)
485 except SpecificationError:
~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
434 for fname, agg_how in arg.items():
--> 435 result[fname] = func(fname, agg_how)
436 return result
~/sandbox/pandas/pandas/core/base.py in _agg_1dim(name, how, subset)
417 "in aggregation")
--> 418 return colg.aggregate(how, _level=(_level or 0) + 1)
419
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
771 ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772 (_level or 0) + 1)
773 else:
~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
834 'Function names must be unique, found multiple named '
--> 835 '{}'.format(name))
836
SpecificationError: Function names must be unique, found multiple named <lambda>
During handling of the above exception, another exception occurred:
SpecificationError Traceback (most recent call last)
<ipython-input-3-2aa02bdc2edd> in <module>
----> 1 df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
1344 @Appender(_shared_docs['aggregate'])
1345 def aggregate(self, arg=None, *args, **kwargs):
-> 1346 return super().aggregate(arg, *args, **kwargs)
1347
1348 agg = aggregate
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
174 "'(column, aggfunc).")
175
--> 176 result, how = self._aggregate(func, _level=_level, *args, **kwargs)
177 if how is None:
178 return result
~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
487 # we are aggregating expecting all 1d-returns
488 # but we have 2d
--> 489 result = _agg(arg, _agg_2dim)
490
491 # combine results
~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
433 result = OrderedDict()
434 for fname, agg_how in arg.items():
--> 435 result[fname] = func(fname, agg_how)
436 return result
437
~/sandbox/pandas/pandas/core/base.py in _agg_2dim(name, how)
424 colg = self._gotitem(self._selection, ndim=2,
425 subset=obj)
--> 426 return colg.aggregate(how, _level=None)
427
428 def _agg(arg, func):
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
1344 @Appender(_shared_docs['aggregate'])
1345 def aggregate(self, arg=None, *args, **kwargs):
-> 1346 return super().aggregate(arg, *args, **kwargs)
1347
1348 agg = aggregate
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
174 "'(column, aggfunc).")
175
--> 176 result, how = self._aggregate(func, _level=_level, *args, **kwargs)
177 if how is None:
178 return result
~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
542 return self._aggregate_multiple_funcs(arg,
543 _level=_level,
--> 544 _axis=_axis), None
545 else:
546 result = None
~/sandbox/pandas/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _level, _axis)
588 colg = self._gotitem(col, ndim=1,
589 subset=obj.iloc[:, index])
--> 590 results.append(colg.aggregate(arg))
591 keys.append(col)
592 except (TypeError, DataError):
~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
770 # but not the class list / tuple itself.
771 ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772 (_level or 0) + 1)
773 else:
774 cyfunc = self._is_cython_func(func_or_funcs)
~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
833 raise SpecificationError(
834 'Function names must be unique, found multiple named '
--> 835 '{}'.format(name))
836
837 # reset the cache so that we
SpecificationError: Function names must be unique, found multiple named <lambda>
I propose that we mangle the names somehow:
In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})
In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
Out[3]:
B
<lambda> <lambda 1>
A
a 0 1
That adds a `1`, `2`, ... to all subsequent lambdas in the same MultiIndex level. It doesn't change the first. Do we want `<lambda 0>` for the first?
As a side effect, this enables multiple lambdas per column with the new keyword aggregation:
In [4]: df.groupby("A").agg(b=('B', lambda x: 0), c=('B', lambda x: 1))
Out[4]:
b c
A
a 0 0
I have a WIP started. Will do for 0.25.