Skip to content

Commit

Permalink
ENH: Ensure that rolling_var of identical values is exactly zero
Browse files Browse the repository at this point in the history
Add a check to rolling_var for repeated observations, in order to
produce an exactly zero value of the variance when all entries are
identical. Related to the discussion in #7900
  • Loading branch information
jaimefrio committed Sep 16, 2014
1 parent 35a9527 commit 79f699a
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 65 deletions.
100 changes: 44 additions & 56 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1087,7 +1087,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
sum_wt = 1.
sum_wt2 = 1.
old_wt = 1.

for i from 1 <= i < N:
cur_x = input_x[i]
cur_y = input_y[i]
Expand Down Expand Up @@ -1117,7 +1117,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
elif is_observation:
mean_x = cur_x
mean_y = cur_y

if nobs >= minp:
if not bias:
numerator = sum_wt * sum_wt
Expand Down Expand Up @@ -1259,84 +1259,72 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
"""
Numerically stable implementation using Welford's method.
"""
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
cdef Py_ssize_t i
cdef Py_ssize_t N = len(input)
cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN, out
cdef Py_ssize_t nobs = 0, nrep = 0, i, N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

minp = _check_minp(win, minp, N)

# Check for windows larger than array, addresses #7297
win = min(win, N)

# Over the first window, observations can only be added, never removed
for i from 0 <= i < win:
for i from 0 <= i < N:
val = input[i]

# Not NaN
prev = NaN if i < win else input[i - win]

# First, count the number of observations and consecutive repeats
if prev == prev:
# prev is not NaN, removing an observation...
if nobs == nrep:
# ...and removing a repeat
nrep -= 1
if nrep == 0:
rep = NaN
nobs -= 1
if val == val:
nobs += 1
delta = (val - mean_x)
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)

if (nobs >= minp) and (nobs > ddof):
#pathological case
if nobs == 1:
val = 0
# next is not NaN, adding an observation...
if val == rep:
# ...and adding a repeat
nrep += 1
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
else:
val = NaN

output[i] = val

# After the first window, observations can both be added and removed
for i from win <= i < N:
val = input[i]
prev = input[i - win]
# ...and resetting repeats
nrep = 1
rep = val
nobs += 1

if val == val:
# Then, compute the new mean and sum of squared differences
if nobs == nrep:
# All non-NaN values in window are identical...
ssqdm_x = 0
mean_x = rep if nobs > 0 else 0
elif val == val:
# Adding one observation...
if prev == prev:
# Adding one observation and removing another one
# ...and removing another
delta = val - prev
prev -= mean_x
mean_x += delta / nobs
val -= mean_x
ssqdm_x += (val + prev) * delta
else:
# Adding one observation and not removing any
nobs += 1
delta = (val - mean_x)
# ...and not removing any
delta = val - mean_x
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)
elif prev == prev:
# Adding no new observation, but removing one
nobs -= 1
if nobs:
delta = (prev - mean_x)
mean_x -= delta / nobs
ssqdm_x -= delta * (prev - mean_x)
else:
mean_x = 0
ssqdm_x = 0
delta = prev - mean_x
mean_x -= delta / nobs
ssqdm_x -= delta * (prev - mean_x)
# Variance is unchanged if no observation is added or removed

if (nobs >= minp) and (nobs > ddof):
#pathological case
if nobs == 1:
val = 0
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
# Finally, compute and write the rolling variance to the output array
if nobs >= minp and nobs > ddof:
out = ssqdm_x / (nobs - ddof)
if out < 0:
out = 0
else:
val = NaN
out = NaN

output[i] = val
output[i] = out

return output

Expand Down
25 changes: 16 additions & 9 deletions pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import numpy as np

from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
from pandas.util.testing import (
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
)
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
assert_frame_equal, assert_panel_equal,
assert_index_equal,)
import pandas.core.datetools as datetools
import pandas.stats.moments as mom
import pandas.util.testing as tm
Expand Down Expand Up @@ -524,7 +524,7 @@ def test_ewma(self):
self.assertTrue(np.abs(result - 1) < 1e-2)

s = Series([1.0, 2.0, 4.0, 8.0])

expected = Series([1.0, 1.6, 2.736842, 4.923077])
for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
Expand Down Expand Up @@ -750,7 +750,7 @@ def _non_null_values(x):

for (std, var, cov) in [(std_biased, var_biased, cov_biased),
(std_unbiased, var_unbiased, cov_unbiased)]:

# check that var(x), std(x), and cov(x) are all >= 0
var_x = var(x)
std_x = std(x)
Expand All @@ -762,7 +762,7 @@ def _non_null_values(x):

# check that var(x) == cov(x, x)
assert_equal(var_x, cov_x_x)

# check that var(x) == std(x)^2
assert_equal(var_x, std_x * std_x)

Expand Down Expand Up @@ -796,7 +796,7 @@ def _non_null_values(x):
cov_x_y = cov(x, y)
cov_y_x = cov(y, x)
assert_equal(cov_x_y, cov_y_x)

# check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2
var_x_plus_y = var(x + y)
var_y = var(y)
Expand Down Expand Up @@ -1007,7 +1007,7 @@ def test_rolling_consistency(self):
expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j],
window=window, min_periods=min_periods, center=center)
assert_panel_equal(rolling_f_result, expected)

# binary moments
def test_rolling_cov(self):
A = self.series
Expand Down Expand Up @@ -1432,7 +1432,7 @@ def test_expanding_corr_pairwise_diff_length(self):
assert_frame_equal(result2, expected)
assert_frame_equal(result3, expected)
assert_frame_equal(result4, expected)

def test_pairwise_stats_column_names_order(self):
# GH 7738
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
Expand Down Expand Up @@ -1731,6 +1731,13 @@ def test_rolling_median_how_resample(self):
x = mom.rolling_median(series, window=1, freq='D')
assert_series_equal(expected, x)

def test_rolling_var_exactly_zero(self):
# Related to GH 7900
a = np.array([1, 2, 3, 3, 3, 2, 2, 2, np.nan, 2, np.nan, 2, np.nan,
np.nan, np.nan, 1, 1, 1])
v = mom.rolling_var(a, window=3, min_periods=2)
self.assertTrue(np.all(v[[4, 7, 8, 9, 11, 16, 17]] == 0))

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down

0 comments on commit 79f699a

Please sign in to comment.