Skip to content

Commit d7d1e86

Browse files
author
Ralf Gommers
committed
Merge pull request scipy#5007 from aeklant/_chk_asarray
ENH: nan handling in functions that use `stats._chk_asarray`
2 parents 135f994 + 6540298 commit d7d1e86

File tree

6 files changed

+1312
-357
lines changed

6 files changed

+1312
-357
lines changed
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
from collections import namedtuple
2+
3+
import numpy as np
4+
5+
from . import distributions
6+
from . import futil
7+
8+
9+
__all__ = ['find_repeats', 'linregress', 'theilslopes']
10+
11+
12+
def linregress(x, y=None):
13+
"""
14+
Calculate a regression line
15+
16+
This computes a least-squares regression for two sets of measurements.
17+
18+
Parameters
19+
----------
20+
x, y : array_like
21+
Two sets of measurements. Both arrays should have the same length.
22+
If only x is given (and y=None), then it must be a two-dimensional
23+
array where one dimension has length 2. The two sets of measurements
24+
are then found by splitting the array along the length-2 dimension.
25+
26+
Returns
27+
-------
28+
slope : float
29+
slope of the regression line
30+
intercept : float
31+
intercept of the regression line
32+
rvalue : float
33+
correlation coefficient
34+
pvalue : float
35+
two-sided p-value for a hypothesis test whose null hypothesis is
36+
that the slope is zero.
37+
stderr : float
38+
Standard error of the estimate
39+
40+
Examples
41+
--------
42+
>>> from scipy import stats
43+
>>> np.random.seed(12345678)
44+
>>> x = np.random.random(10)
45+
>>> y = np.random.random(10)
46+
>>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
47+
48+
# To get coefficient of determination (r_squared)
49+
50+
>>> print("r-squared:", r_value**2)
51+
('r-squared:', 0.080402268539028335)
52+
53+
"""
54+
TINY = 1.0e-20
55+
if y is None: # x is a (2, N) or (N, 2) shaped array_like
56+
x = np.asarray(x)
57+
if x.shape[0] == 2:
58+
x, y = x
59+
elif x.shape[1] == 2:
60+
x, y = x.T
61+
else:
62+
msg = ("If only `x` is given as input, it has to be of shape "
63+
"(2, N) or (N, 2), provided shape was %s" % str(x.shape))
64+
raise ValueError(msg)
65+
else:
66+
x = np.asarray(x)
67+
y = np.asarray(y)
68+
n = len(x)
69+
xmean = np.mean(x, None)
70+
ymean = np.mean(y, None)
71+
72+
# average sum of squares:
73+
ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
74+
r_num = ssxym
75+
r_den = np.sqrt(ssxm * ssym)
76+
if r_den == 0.0:
77+
r = 0.0
78+
else:
79+
r = r_num / r_den
80+
# test for numerical error propagation
81+
if r > 1.0:
82+
r = 1.0
83+
elif r < -1.0:
84+
r = -1.0
85+
86+
df = n - 2
87+
t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
88+
prob = 2 * distributions.t.sf(np.abs(t), df)
89+
slope = r_num / ssxm
90+
intercept = ymean - slope*xmean
91+
sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
92+
93+
LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
94+
'rvalue', 'pvalue',
95+
'stderr'))
96+
return LinregressResult(slope, intercept, r, prob, sterrest)
97+
98+
99+
def theilslopes(y, x=None, alpha=0.95):
100+
r"""
101+
Computes the Theil-Sen estimator for a set of points (x, y).
102+
103+
`theilslopes` implements a method for robust linear regression. It
104+
computes the slope as the median of all slopes between paired values.
105+
106+
Parameters
107+
----------
108+
y : array_like
109+
Dependent variable.
110+
x : array_like or None, optional
111+
Independent variable. If None, use ``arange(len(y))`` instead.
112+
alpha : float, optional
113+
Confidence degree between 0 and 1. Default is 95% confidence.
114+
Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
115+
interpreted as "find the 90% confidence interval".
116+
117+
Returns
118+
-------
119+
medslope : float
120+
Theil slope.
121+
medintercept : float
122+
Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
123+
lo_slope : float
124+
Lower bound of the confidence interval on `medslope`.
125+
up_slope : float
126+
Upper bound of the confidence interval on `medslope`.
127+
128+
Notes
129+
-----
130+
The implementation of `theilslopes` follows [1]_. The intercept is
131+
not defined in [1]_, and here it is defined as ``median(y) -
132+
medslope*median(x)``, which is given in [3]_. Other definitions of
133+
the intercept exist in the literature. A confidence interval for
134+
the intercept is not given as this question is not addressed in
135+
[1]_.
136+
137+
References
138+
----------
139+
.. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau",
140+
J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
141+
.. [2] H. Theil, "A rank-invariant method of linear and polynomial
142+
regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
143+
53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
144+
.. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
145+
John Wiley and Sons, New York, pp. 493.
146+
147+
Examples
148+
--------
149+
>>> from scipy import stats
150+
>>> import matplotlib.pyplot as plt
151+
152+
>>> x = np.linspace(-5, 5, num=150)
153+
>>> y = x + np.random.normal(size=x.size)
154+
>>> y[11:15] += 10 # add outliers
155+
>>> y[-5:] -= 7
156+
157+
Compute the slope, intercept and 90% confidence interval. For comparison,
158+
also compute the least-squares fit with `linregress`:
159+
160+
>>> res = stats.theilslopes(y, x, 0.90)
161+
>>> lsq_res = stats.linregress(x, y)
162+
163+
Plot the results. The Theil-Sen regression line is shown in red, with the
164+
dashed red lines illustrating the confidence interval of the slope (note
165+
that the dashed red lines are not the confidence interval of the regression
166+
as the confidence interval of the intercept is not included). The green
167+
line shows the least-squares fit for comparison.
168+
169+
>>> fig = plt.figure()
170+
>>> ax = fig.add_subplot(111)
171+
>>> ax.plot(x, y, 'b.')
172+
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
173+
>>> ax.plot(x, res[1] + res[2] * x, 'r--')
174+
>>> ax.plot(x, res[1] + res[3] * x, 'r--')
175+
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
176+
>>> plt.show()
177+
178+
"""
179+
y = np.asarray(y).flatten()
180+
if x is None:
181+
x = np.arange(len(y), dtype=float)
182+
else:
183+
x = np.asarray(x, dtype=float).flatten()
184+
if len(x) != len(y):
185+
raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
186+
187+
# Compute sorted slopes only when deltax > 0
188+
deltax = x[:, np.newaxis] - x
189+
deltay = y[:, np.newaxis] - y
190+
slopes = deltay[deltax > 0] / deltax[deltax > 0]
191+
slopes.sort()
192+
medslope = np.median(slopes)
193+
medinter = np.median(y) - medslope * np.median(x)
194+
# Now compute confidence intervals
195+
if alpha > 0.5:
196+
alpha = 1. - alpha
197+
198+
z = distributions.norm.ppf(alpha / 2.)
199+
# This implements (2.6) from Sen (1968)
200+
_, nxreps = find_repeats(x)
201+
_, nyreps = find_repeats(y)
202+
nt = len(slopes) # N in Sen (1968)
203+
ny = len(y) # n in Sen (1968)
204+
# Equation 2.6 in Sen (1968):
205+
sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
206+
np.sum(k * (k-1) * (2*k + 5) for k in nxreps) -
207+
np.sum(k * (k-1) * (2*k + 5) for k in nyreps))
208+
# Find the confidence interval indices in `slopes`
209+
sigma = np.sqrt(sigsq)
210+
Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
211+
Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
212+
delta = slopes[[Rl, Ru]]
213+
return medslope, medinter, delta[0], delta[1]
214+
215+
216+
def find_repeats(arr):
217+
"""
218+
Find repeats and repeat counts.
219+
220+
Parameters
221+
----------
222+
arr : array_like
223+
Input array
224+
225+
Returns
226+
-------
227+
values : ndarray
228+
The unique values from the (flattened) input that are repeated.
229+
230+
counts : ndarray
231+
Number of times the corresponding 'value' is repeated.
232+
233+
Notes
234+
-----
235+
In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
236+
difference is that `find_repeats` only returns repeated values.
237+
238+
Examples
239+
--------
240+
>>> from scipy import stats
241+
>>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
242+
RepeatedResults(values=array([ 2.]), counts=array([4], dtype=int32))
243+
244+
>>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
245+
RepeatedResults(values=array([ 4., 5.]), counts=array([2, 2], dtype=int32))
246+
247+
"""
248+
RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
249+
250+
if np.asarray(arr).size == 0:
251+
return RepeatedResults([], [])
252+
253+
v1, v2, n = futil.dfreps(arr)
254+
return RepeatedResults(v1[:n], v2[:n])

scipy/stats/mstats.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,6 @@
9696

9797
from .mstats_basic import *
9898
from .mstats_extras import *
99-
from scipy.stats import gmean, hmean
99+
# Functions that support masked array input in stats but need to be kept in the
100+
# mstats namespace for backwards compatibility:
101+
from scipy.stats import gmean, hmean, zmap, zscore, chisquare

0 commit comments

Comments
 (0)