ENH: Ensure that rolling_var of identical values is exactly zero

jaimefrio · jaimefrio · commit 79f699a85982 · 2014-09-15T18:46:02.000-07:00
Add a check to rolling_var for repeated observations, in order to produce an exactly zero value of the variance when all entries are identical. Related to the discussion in #7900
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -1087,7 +1087,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
     sum_wt = 1.
     sum_wt2 = 1.
     old_wt = 1.
-    
+
     for i from 1 <= i < N:
         cur_x = input_x[i]
         cur_y = input_y[i]
@@ -1117,7 +1117,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
         elif is_observation:
             mean_x = cur_x
             mean_y = cur_y
-        
+
         if nobs >= minp:
             if not bias:
                 numerator = sum_wt * sum_wt
@@ -1259,84 +1259,72 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
     """
     Numerically stable implementation using Welford's method.
     """
-    cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
-    cdef Py_ssize_t i
-    cdef Py_ssize_t N = len(input)
+    cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN, out
+    cdef Py_ssize_t nobs = 0, nrep = 0, i, N = len(input)
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
     minp = _check_minp(win, minp, N)
 
-    # Check for windows larger than array, addresses #7297
-    win = min(win, N)
-
-    # Over the first window, observations can only be added, never removed
-    for i from 0 <= i < win:
+    for i from 0 <= i < N:
         val = input[i]
-
-        # Not NaN
+        prev = NaN if i < win else input[i - win]
+
+        # First, count the number of observations and consecutive repeats
+        if prev == prev:
+            # prev is not NaN, removing an observation...
+            if nobs == nrep:
+                # ...and removing a repeat
+                nrep -= 1
+                if nrep == 0:
+                    rep = NaN
+            nobs -= 1
         if val == val:
-            nobs += 1
-            delta = (val - mean_x)
-            mean_x += delta / nobs
-            ssqdm_x += delta * (val - mean_x)
-
-        if (nobs >= minp) and (nobs > ddof):
-            #pathological case
-            if nobs == 1:
-                val = 0
+            # next is not NaN, adding an observation...
+            if val == rep:
+                # ...and adding a repeat
+                nrep += 1
             else:
-                val = ssqdm_x / (nobs - ddof)
-                if val < 0:
-                    val = 0
-        else:
-            val = NaN
-
-        output[i] = val
-
-    # After the first window, observations can both be added and removed
-    for i from win <= i < N:
-        val = input[i]
-        prev = input[i - win]
+                # ...and resetting repeats
+                nrep = 1
+                rep = val
+            nobs += 1
 
-        if val == val:
+        # Then, compute the new mean and sum of squared differences
+        if nobs == nrep:
+            # All non-NaN values in window are identical...
+            ssqdm_x = 0
+            mean_x = rep if nobs > 0 else 0
+        elif val == val:
+            # Adding one observation...
             if prev == prev:
-                # Adding one observation and removing another one
+                # ...and removing another
                 delta = val - prev
                 prev -= mean_x
                 mean_x += delta / nobs
                 val -= mean_x
                 ssqdm_x += (val + prev) * delta
             else:
-                # Adding one observation and not removing any
-                nobs += 1
-                delta = (val - mean_x)
+                # ...and not removing any
+                delta = val - mean_x
                 mean_x += delta / nobs
                 ssqdm_x += delta * (val - mean_x)
         elif prev == prev:
             # Adding no new observation, but removing one
-            nobs -= 1
-            if nobs:
-                delta = (prev - mean_x)
-                mean_x -= delta  / nobs
-                ssqdm_x -= delta * (prev - mean_x)
-            else:
-                mean_x = 0
-                ssqdm_x = 0
+            delta = prev - mean_x
+            mean_x -= delta  / nobs
+            ssqdm_x -= delta * (prev - mean_x)
         # Variance is unchanged if no observation is added or removed
 
-        if (nobs >= minp) and (nobs > ddof):
-            #pathological case
-            if nobs == 1:
-                val = 0
-            else:
-                val = ssqdm_x / (nobs - ddof)
-                if val < 0:
-                    val = 0
+        # Finally, compute and write the rolling variance to the output array
+        if nobs >= minp and nobs > ddof:
+            out = ssqdm_x / (nobs - ddof)
+            if out < 0:
+                out = 0
         else:
-            val = NaN
+            out = NaN
 
-        output[i] = val
+        output[i] = out
 
     return output
 
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -7,9 +7,9 @@
 import numpy as np
 
 from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
-from pandas.util.testing import (
-    assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
-)
+from pandas.util.testing import (assert_almost_equal, assert_series_equal,
+                                 assert_frame_equal, assert_panel_equal,
+                                 assert_index_equal,)
 import pandas.core.datetools as datetools
 import pandas.stats.moments as mom
 import pandas.util.testing as tm
@@ -524,7 +524,7 @@ def test_ewma(self):
         self.assertTrue(np.abs(result - 1) < 1e-2)
 
         s = Series([1.0, 2.0, 4.0, 8.0])
-        
+
         expected = Series([1.0, 1.6, 2.736842, 4.923077])
         for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
                   lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
@@ -750,7 +750,7 @@ def _non_null_values(x):
 
             for (std, var, cov) in [(std_biased, var_biased, cov_biased),
                                     (std_unbiased, var_unbiased, cov_unbiased)]:
-                
+
                 # check that var(x), std(x), and cov(x) are all >= 0
                 var_x = var(x)
                 std_x = std(x)
@@ -762,7 +762,7 @@ def _non_null_values(x):
 
                     # check that var(x) == cov(x, x)
                     assert_equal(var_x, cov_x_x)
-                
+
                 # check that var(x) == std(x)^2
                 assert_equal(var_x, std_x * std_x)
 
@@ -796,7 +796,7 @@ def _non_null_values(x):
                             cov_x_y = cov(x, y)
                             cov_y_x = cov(y, x)
                             assert_equal(cov_x_y, cov_y_x)
-                    
+
                             # check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2
                             var_x_plus_y = var(x + y)
                             var_y = var(y)
@@ -1007,7 +1007,7 @@ def test_rolling_consistency(self):
                                         expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j],
                                                                            window=window, min_periods=min_periods, center=center)
                                 assert_panel_equal(rolling_f_result, expected)
-    
+
     # binary moments
     def test_rolling_cov(self):
         A = self.series
@@ -1432,7 +1432,7 @@ def test_expanding_corr_pairwise_diff_length(self):
         assert_frame_equal(result2, expected)
         assert_frame_equal(result3, expected)
         assert_frame_equal(result4, expected)
-    
+
     def test_pairwise_stats_column_names_order(self):
         # GH 7738
         df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
@@ -1731,6 +1731,13 @@ def test_rolling_median_how_resample(self):
         x = mom.rolling_median(series, window=1, freq='D')
         assert_series_equal(expected, x)
 
+    def test_rolling_var_exactly_zero(self):
+        # Related to GH 7900
+        a = np.array([1, 2, 3, 3, 3, 2, 2, 2, np.nan, 2, np.nan, 2, np.nan,
+                      np.nan, np.nan, 1, 1, 1])
+        v = mom.rolling_var(a, window=3, min_periods=2)
+        self.assertTrue(np.all(v[[4, 7, 8, 9, 11, 16, 17]] == 0))
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],