Skip to content

Commit

Permalink
Merge pull request #7738 from seth-p/flex_binary_moment_column_order
Browse files Browse the repository at this point in the history
BUG: _flex_binary_moment() doesn't preserve column order or handle multiple columns with the same label
  • Loading branch information
jreback committed Jul 25, 2014
2 parents 6d4a686 + 34d2910 commit 99b7c8c
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 19 deletions.
7 changes: 5 additions & 2 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Bug Fixes



- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)



Expand All @@ -278,7 +278,10 @@ Bug Fixes
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
- Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`)


- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_cov``, ``ewmcov``, and ``ewmcorr``
returning results with columns sorted by name and producing an error for non-unique columns;
now handles non-unique columns and returns columns in original order
(except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`)



Expand Down
49 changes: 33 additions & 16 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,38 +259,55 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False):
isinstance(arg2, (np.ndarray,Series)):
X, Y = _prep_binary(arg1, arg2)
return f(X, Y)

elif isinstance(arg1, DataFrame):
def dataframe_from_int_dict(data, frame_template):
result = DataFrame(data, index=frame_template.index)
result.columns = frame_template.columns[result.columns]
return result

results = {}
if isinstance(arg2, DataFrame):
X, Y = arg1.align(arg2, join='outer')
if pairwise is False:
X = X + 0 * Y
Y = Y + 0 * X
res_columns = arg1.columns.union(arg2.columns)
for col in res_columns:
if col in X and col in Y:
results[col] = f(X[col], Y[col])
if arg1 is arg2:
# special case in order to handle duplicate column names
for i, col in enumerate(arg1.columns):
results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
return dataframe_from_int_dict(results, arg1)
else:
if not arg1.columns.is_unique:
raise ValueError("'arg1' columns are not unique")
if not arg2.columns.is_unique:
raise ValueError("'arg2' columns are not unique")
X, Y = arg1.align(arg2, join='outer')
X = X + 0 * Y
Y = Y + 0 * X
res_columns = arg1.columns.union(arg2.columns)
for col in res_columns:
if col in X and col in Y:
results[col] = f(X[col], Y[col])
return DataFrame(results, index=X.index, columns=res_columns)
elif pairwise is True:
results = defaultdict(dict)
for i, k1 in enumerate(arg1.columns):
for j, k2 in enumerate(arg2.columns):
if j<i and arg2 is arg1:
# Symmetric case
results[k1][k2] = results[k2][k1]
results[i][j] = results[j][i]
else:
results[k1][k2] = f(*_prep_binary(arg1[k1], arg2[k2]))
return Panel.from_dict(results).swapaxes('items', 'major')
results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]))
p = Panel.from_dict(results).swapaxes('items', 'major')
p.major_axis = arg1.columns[p.major_axis]
p.minor_axis = arg2.columns[p.minor_axis]
return p
else:
raise ValueError("'pairwise' is not True/False")
else:
res_columns = arg1.columns
X, Y = arg1.align(arg2, axis=0, join='outer')
results = {}
for i, col in enumerate(arg1.columns):
results[i] = f(*_prep_binary(arg1.iloc[:, i], arg2))
return dataframe_from_int_dict(results, arg1)

for col in res_columns:
results[col] = f(X[col], Y)

return DataFrame(results, index=X.index, columns=res_columns)
else:
return _flex_binary_moment(arg2, arg1, f)

Expand Down
115 changes: 114 additions & 1 deletion pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
from pandas.util.testing import (
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
)
import pandas.core.datetools as datetools
import pandas.stats.moments as mom
Expand Down Expand Up @@ -970,6 +970,119 @@ def test_expanding_corr_pairwise_diff_length(self):
assert_frame_equal(result2, expected)
assert_frame_equal(result3, expected)
assert_frame_equal(result4, expected)

def test_pairwise_stats_column_names_order(self):
# GH 7738
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,0]),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1,1]),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C','C']),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[1.,0]),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0.,1]),
DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=['C',1]),
DataFrame([[2.,4.],[1.,2.],[5.,2.],[8.,1.]], columns=[1,0.]),
DataFrame([[2,4.],[1,2.],[5,2.],[8,1.]], columns=[0,1.]),
DataFrame([[2,4],[1,2],[5,2],[8,1.]], columns=[1.,'X']),
]
df2 = DataFrame([[None,1,1],[None,1,2],[None,3,2],[None,8,1]], columns=['Y','Z','X'])
s = Series([1,1,3,8])

# DataFrame methods (which do not call _flex_binary_moment())
for f in [lambda x: x.cov(),
lambda x: x.corr(),
]:
results = [f(df) for df in df1s]
for (df, result) in zip(df1s, results):
assert_index_equal(result.index, df.columns)
assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.assert_numpy_array_equivalent(result, results[0])

# DataFrame with itself, pairwise=True
for f in [lambda x: mom.expanding_cov(x, pairwise=True),
lambda x: mom.expanding_corr(x, pairwise=True),
lambda x: mom.rolling_cov(x, window=3, pairwise=True),
lambda x: mom.rolling_corr(x, window=3, pairwise=True),
lambda x: mom.ewmcov(x, com=3, pairwise=True),
lambda x: mom.ewmcorr(x, com=3, pairwise=True),
]:
results = [f(df) for df in df1s]
for (df, result) in zip(df1s, results):
assert_index_equal(result.items, df.index)
assert_index_equal(result.major_axis, df.columns)
assert_index_equal(result.minor_axis, df.columns)
for i, result in enumerate(results):
if i > 0:
self.assert_numpy_array_equivalent(result, results[0])

# DataFrame with itself, pairwise=False
for f in [lambda x: mom.expanding_cov(x, pairwise=False),
lambda x: mom.expanding_corr(x, pairwise=False),
lambda x: mom.rolling_cov(x, window=3, pairwise=False),
lambda x: mom.rolling_corr(x, window=3, pairwise=False),
lambda x: mom.ewmcov(x, com=3, pairwise=False),
lambda x: mom.ewmcorr(x, com=3, pairwise=False),
]:
results = [f(df) for df in df1s]
for (df, result) in zip(df1s, results):
assert_index_equal(result.index, df.index)
assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.assert_numpy_array_equivalent(result, results[0])

# DataFrame with another DataFrame, pairwise=True
for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=True),
lambda x, y: mom.expanding_corr(x, y, pairwise=True),
lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True),
lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True),
lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True),
lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True),
]:
results = [f(df, df2) for df in df1s]
for (df, result) in zip(df1s, results):
assert_index_equal(result.items, df.index)
assert_index_equal(result.major_axis, df.columns)
assert_index_equal(result.minor_axis, df2.columns)
for i, result in enumerate(results):
if i > 0:
self.assert_numpy_array_equivalent(result, results[0])

# DataFrame with another DataFrame, pairwise=False
for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=False),
lambda x, y: mom.expanding_corr(x, y, pairwise=False),
lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False),
lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False),
lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False),
lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False),
]:
results = [f(df, df2) if df.columns.is_unique else None for df in df1s]
for (df, result) in zip(df1s, results):
if result is not None:
expected_index = df.index.union(df2.index)
expected_columns = df.columns.union(df2.columns)
assert_index_equal(result.index, expected_index)
assert_index_equal(result.columns, expected_columns)
else:
tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2)
tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df)

# DataFrame with a Series
for f in [lambda x, y: mom.expanding_cov(x, y),
lambda x, y: mom.expanding_corr(x, y),
lambda x, y: mom.rolling_cov(x, y, window=3),
lambda x, y: mom.rolling_corr(x, y, window=3),
lambda x, y: mom.ewmcov(x, y, com=3),
lambda x, y: mom.ewmcorr(x, y, com=3),
]:
results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s]
for (df, result) in zip(df1s, results):
assert_index_equal(result.index, df.index)
assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.assert_numpy_array_equivalent(result, results[0])

def test_rolling_skew_edge_cases(self):

Expand Down

0 comments on commit 99b7c8c

Please sign in to comment.