From 34d2910f0f789fface33b43c0bb6b6e3ec7ae903 Mon Sep 17 00:00:00 2001 From: seth-p Date: Thu, 24 Jul 2014 14:16:41 -0400 Subject: [PATCH] BUG: _flex_binary_moment() doesn't preserve column order or handle non-unique columns --- doc/source/v0.15.0.txt | 7 +- pandas/stats/moments.py | 49 ++++++++---- pandas/stats/tests/test_moments.py | 115 ++++++++++++++++++++++++++++- 3 files changed, 152 insertions(+), 19 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 1c05c01633b15..da96d1e359454 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -265,7 +265,7 @@ Bug Fixes -- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) +- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`) @@ -278,7 +278,10 @@ Bug Fixes - Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`) - Bug in ``StataReader`` which did not read variable labels in 117 files due to difference between Stata documentation and implementation (:issue:`7816`) - +- Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, ``rolling_corr``, ``ewmcov``, and ``ewmcorr`` + returning results with columns sorted by name and producing an error for non-unique columns; + now handles non-unique columns and returns columns in original order + (except for the case of two DataFrames with ``pairwise=False``, where behavior is unchanged) (:issue:`7542`) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 6f06255c7262d..a62d8178385cc 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -259,38 +259,55 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): isinstance(arg2, (np.ndarray,Series)): X, Y = _prep_binary(arg1, arg2) return f(X, Y) + elif isinstance(arg1, DataFrame): + def dataframe_from_int_dict(data, frame_template): + result = DataFrame(data, index=frame_template.index) 
+ result.columns = frame_template.columns[result.columns] + return result + results = {} if isinstance(arg2, DataFrame): - X, Y = arg1.align(arg2, join='outer') if pairwise is False: - X = X + 0 * Y - Y = Y + 0 * X - res_columns = arg1.columns.union(arg2.columns) - for col in res_columns: - if col in X and col in Y: - results[col] = f(X[col], Y[col]) + if arg1 is arg2: + # special case in order to handle duplicate column names + for i, col in enumerate(arg1.columns): + results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) + return dataframe_from_int_dict(results, arg1) + else: + if not arg1.columns.is_unique: + raise ValueError("'arg1' columns are not unique") + if not arg2.columns.is_unique: + raise ValueError("'arg2' columns are not unique") + X, Y = arg1.align(arg2, join='outer') + X = X + 0 * Y + Y = Y + 0 * X + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): for j, k2 in enumerate(arg2.columns): if j 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with itself, pairwise=True + for f in [lambda x: mom.expanding_cov(x, pairwise=True), + lambda x: mom.expanding_corr(x, pairwise=True), + lambda x: mom.rolling_cov(x, window=3, pairwise=True), + lambda x: mom.rolling_corr(x, window=3, pairwise=True), + lambda x: mom.ewmcov(x, com=3, pairwise=True), + lambda x: mom.ewmcorr(x, com=3, pairwise=True), + ]: + results = [f(df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.items, df.index) + assert_index_equal(result.major_axis, df.columns) + assert_index_equal(result.minor_axis, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with itself, pairwise=False + for f in [lambda x: 
mom.expanding_cov(x, pairwise=False), + lambda x: mom.expanding_corr(x, pairwise=False), + lambda x: mom.rolling_cov(x, window=3, pairwise=False), + lambda x: mom.rolling_corr(x, window=3, pairwise=False), + lambda x: mom.ewmcov(x, com=3, pairwise=False), + lambda x: mom.ewmcorr(x, com=3, pairwise=False), + ]: + results = [f(df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.index, df.index) + assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with another DataFrame, pairwise=True + for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=True), + lambda x, y: mom.expanding_corr(x, y, pairwise=True), + lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=True), + lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=True), + lambda x, y: mom.ewmcov(x, y, com=3, pairwise=True), + lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=True), + ]: + results = [f(df, df2) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.items, df.index) + assert_index_equal(result.major_axis, df.columns) + assert_index_equal(result.minor_axis, df2.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) + + # DataFrame with another DataFrame, pairwise=False + for f in [lambda x, y: mom.expanding_cov(x, y, pairwise=False), + lambda x, y: mom.expanding_corr(x, y, pairwise=False), + lambda x, y: mom.rolling_cov(x, y, window=3, pairwise=False), + lambda x, y: mom.rolling_corr(x, y, window=3, pairwise=False), + lambda x, y: mom.ewmcov(x, y, com=3, pairwise=False), + lambda x, y: mom.ewmcorr(x, y, com=3, pairwise=False), + ]: + results = [f(df, df2) if df.columns.is_unique else None for df in df1s] + for (df, result) in zip(df1s, results): + if result is not None: + expected_index = df.index.union(df2.index) + expected_columns = 
df.columns.union(df2.columns) + assert_index_equal(result.index, expected_index) + assert_index_equal(result.columns, expected_columns) + else: + tm.assertRaisesRegexp(ValueError, "'arg1' columns are not unique", f, df, df2) + tm.assertRaisesRegexp(ValueError, "'arg2' columns are not unique", f, df2, df) + + # DataFrame with a Series + for f in [lambda x, y: mom.expanding_cov(x, y), + lambda x, y: mom.expanding_corr(x, y), + lambda x, y: mom.rolling_cov(x, y, window=3), + lambda x, y: mom.rolling_corr(x, y, window=3), + lambda x, y: mom.ewmcov(x, y, com=3), + lambda x, y: mom.ewmcorr(x, y, com=3), + ]: + results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] + for (df, result) in zip(df1s, results): + assert_index_equal(result.index, df.index) + assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.assert_numpy_array_equivalent(result, results[0]) def test_rolling_skew_edge_cases(self):