|
| 1 | +from collections import namedtuple |
| 2 | + |
| 3 | +import numpy as np |
| 4 | + |
| 5 | +from . import distributions |
| 6 | +from . import futil |
| 7 | + |
| 8 | + |
| 9 | +__all__ = ['find_repeats', 'linregress', 'theilslopes'] |
| 10 | + |
| 11 | + |
| 12 | +def linregress(x, y=None): |
| 13 | + """ |
| 14 | + Calculate a regression line |
| 15 | +
|
| 16 | + This computes a least-squares regression for two sets of measurements. |
| 17 | +
|
| 18 | + Parameters |
| 19 | + ---------- |
| 20 | + x, y : array_like |
| 21 | + Two sets of measurements. Both arrays should have the same length. |
| 22 | + If only x is given (and y=None), then it must be a two-dimensional |
| 23 | + array where one dimension has length 2. The two sets of measurements |
| 24 | + are then found by splitting the array along the length-2 dimension. |
| 25 | +
|
| 26 | + Returns |
| 27 | + ------- |
| 28 | + slope : float |
| 29 | + slope of the regression line |
| 30 | + intercept : float |
| 31 | + intercept of the regression line |
| 32 | + rvalue : float |
| 33 | + correlation coefficient |
| 34 | + pvalue : float |
| 35 | + two-sided p-value for a hypothesis test whose null hypothesis is |
| 36 | + that the slope is zero. |
| 37 | + stderr : float |
| 38 | + Standard error of the estimate |
| 39 | +
|
| 40 | + Examples |
| 41 | + -------- |
| 42 | + >>> from scipy import stats |
| 43 | + >>> np.random.seed(12345678) |
| 44 | + >>> x = np.random.random(10) |
| 45 | + >>> y = np.random.random(10) |
| 46 | + >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y) |
| 47 | +
|
| 48 | + # To get coefficient of determination (r_squared) |
| 49 | +
|
| 50 | + >>> print("r-squared:", r_value**2) |
| 51 | + ('r-squared:', 0.080402268539028335) |
| 52 | +
|
| 53 | + """ |
| 54 | + TINY = 1.0e-20 |
| 55 | + if y is None: # x is a (2, N) or (N, 2) shaped array_like |
| 56 | + x = np.asarray(x) |
| 57 | + if x.shape[0] == 2: |
| 58 | + x, y = x |
| 59 | + elif x.shape[1] == 2: |
| 60 | + x, y = x.T |
| 61 | + else: |
| 62 | + msg = ("If only `x` is given as input, it has to be of shape " |
| 63 | + "(2, N) or (N, 2), provided shape was %s" % str(x.shape)) |
| 64 | + raise ValueError(msg) |
| 65 | + else: |
| 66 | + x = np.asarray(x) |
| 67 | + y = np.asarray(y) |
| 68 | + n = len(x) |
| 69 | + xmean = np.mean(x, None) |
| 70 | + ymean = np.mean(y, None) |
| 71 | + |
| 72 | + # average sum of squares: |
| 73 | + ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat |
| 74 | + r_num = ssxym |
| 75 | + r_den = np.sqrt(ssxm * ssym) |
| 76 | + if r_den == 0.0: |
| 77 | + r = 0.0 |
| 78 | + else: |
| 79 | + r = r_num / r_den |
| 80 | + # test for numerical error propagation |
| 81 | + if r > 1.0: |
| 82 | + r = 1.0 |
| 83 | + elif r < -1.0: |
| 84 | + r = -1.0 |
| 85 | + |
| 86 | + df = n - 2 |
| 87 | + t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY))) |
| 88 | + prob = 2 * distributions.t.sf(np.abs(t), df) |
| 89 | + slope = r_num / ssxm |
| 90 | + intercept = ymean - slope*xmean |
| 91 | + sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df) |
| 92 | + |
| 93 | + LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept', |
| 94 | + 'rvalue', 'pvalue', |
| 95 | + 'stderr')) |
| 96 | + return LinregressResult(slope, intercept, r, prob, sterrest) |
| 97 | + |
| 98 | + |
| 99 | +def theilslopes(y, x=None, alpha=0.95): |
| 100 | + r""" |
| 101 | + Computes the Theil-Sen estimator for a set of points (x, y). |
| 102 | +
|
| 103 | + `theilslopes` implements a method for robust linear regression. It |
| 104 | + computes the slope as the median of all slopes between paired values. |
| 105 | +
|
| 106 | + Parameters |
| 107 | + ---------- |
| 108 | + y : array_like |
| 109 | + Dependent variable. |
| 110 | + x : array_like or None, optional |
| 111 | + Independent variable. If None, use ``arange(len(y))`` instead. |
| 112 | + alpha : float, optional |
| 113 | + Confidence degree between 0 and 1. Default is 95% confidence. |
| 114 | + Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are |
| 115 | + interpreted as "find the 90% confidence interval". |
| 116 | +
|
| 117 | + Returns |
| 118 | + ------- |
| 119 | + medslope : float |
| 120 | + Theil slope. |
| 121 | + medintercept : float |
| 122 | + Intercept of the Theil line, as ``median(y) - medslope*median(x)``. |
| 123 | + lo_slope : float |
| 124 | + Lower bound of the confidence interval on `medslope`. |
| 125 | + up_slope : float |
| 126 | + Upper bound of the confidence interval on `medslope`. |
| 127 | +
|
| 128 | + Notes |
| 129 | + ----- |
| 130 | + The implementation of `theilslopes` follows [1]_. The intercept is |
| 131 | + not defined in [1]_, and here it is defined as ``median(y) - |
| 132 | + medslope*median(x)``, which is given in [3]_. Other definitions of |
| 133 | + the intercept exist in the literature. A confidence interval for |
| 134 | + the intercept is not given as this question is not addressed in |
| 135 | + [1]_. |
| 136 | +
|
| 137 | + References |
| 138 | + ---------- |
| 139 | + .. [1] P.K. Sen, "Estimates of the regression coefficient based on Kendall's tau", |
| 140 | + J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968. |
| 141 | + .. [2] H. Theil, "A rank-invariant method of linear and polynomial |
| 142 | + regression analysis I, II and III", Nederl. Akad. Wetensch., Proc. |
| 143 | + 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950. |
| 144 | + .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed., |
| 145 | + John Wiley and Sons, New York, pp. 493. |
| 146 | +
|
| 147 | + Examples |
| 148 | + -------- |
| 149 | + >>> from scipy import stats |
| 150 | + >>> import matplotlib.pyplot as plt |
| 151 | +
|
| 152 | + >>> x = np.linspace(-5, 5, num=150) |
| 153 | + >>> y = x + np.random.normal(size=x.size) |
| 154 | + >>> y[11:15] += 10 # add outliers |
| 155 | + >>> y[-5:] -= 7 |
| 156 | +
|
| 157 | + Compute the slope, intercept and 90% confidence interval. For comparison, |
| 158 | + also compute the least-squares fit with `linregress`: |
| 159 | +
|
| 160 | + >>> res = stats.theilslopes(y, x, 0.90) |
| 161 | + >>> lsq_res = stats.linregress(x, y) |
| 162 | +
|
| 163 | + Plot the results. The Theil-Sen regression line is shown in red, with the |
| 164 | + dashed red lines illustrating the confidence interval of the slope (note |
| 165 | + that the dashed red lines are not the confidence interval of the regression |
| 166 | + as the confidence interval of the intercept is not included). The green |
| 167 | + line shows the least-squares fit for comparison. |
| 168 | +
|
| 169 | + >>> fig = plt.figure() |
| 170 | + >>> ax = fig.add_subplot(111) |
| 171 | + >>> ax.plot(x, y, 'b.') |
| 172 | + >>> ax.plot(x, res[1] + res[0] * x, 'r-') |
| 173 | + >>> ax.plot(x, res[1] + res[2] * x, 'r--') |
| 174 | + >>> ax.plot(x, res[1] + res[3] * x, 'r--') |
| 175 | + >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-') |
| 176 | + >>> plt.show() |
| 177 | +
|
| 178 | + """ |
| 179 | + y = np.asarray(y).flatten() |
| 180 | + if x is None: |
| 181 | + x = np.arange(len(y), dtype=float) |
| 182 | + else: |
| 183 | + x = np.asarray(x, dtype=float).flatten() |
| 184 | + if len(x) != len(y): |
| 185 | + raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x))) |
| 186 | + |
| 187 | + # Compute sorted slopes only when deltax > 0 |
| 188 | + deltax = x[:, np.newaxis] - x |
| 189 | + deltay = y[:, np.newaxis] - y |
| 190 | + slopes = deltay[deltax > 0] / deltax[deltax > 0] |
| 191 | + slopes.sort() |
| 192 | + medslope = np.median(slopes) |
| 193 | + medinter = np.median(y) - medslope * np.median(x) |
| 194 | + # Now compute confidence intervals |
| 195 | + if alpha > 0.5: |
| 196 | + alpha = 1. - alpha |
| 197 | + |
| 198 | + z = distributions.norm.ppf(alpha / 2.) |
| 199 | + # This implements (2.6) from Sen (1968) |
| 200 | + _, nxreps = find_repeats(x) |
| 201 | + _, nyreps = find_repeats(y) |
| 202 | + nt = len(slopes) # N in Sen (1968) |
| 203 | + ny = len(y) # n in Sen (1968) |
| 204 | + # Equation 2.6 in Sen (1968): |
| 205 | + sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) - |
| 206 | + np.sum(k * (k-1) * (2*k + 5) for k in nxreps) - |
| 207 | + np.sum(k * (k-1) * (2*k + 5) for k in nyreps)) |
| 208 | + # Find the confidence interval indices in `slopes` |
| 209 | + sigma = np.sqrt(sigsq) |
| 210 | + Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1) |
| 211 | + Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0) |
| 212 | + delta = slopes[[Rl, Ru]] |
| 213 | + return medslope, medinter, delta[0], delta[1] |
| 214 | + |
| 215 | + |
| 216 | +def find_repeats(arr): |
| 217 | + """ |
| 218 | + Find repeats and repeat counts. |
| 219 | +
|
| 220 | + Parameters |
| 221 | + ---------- |
| 222 | + arr : array_like |
| 223 | + Input array |
| 224 | +
|
| 225 | + Returns |
| 226 | + ------- |
| 227 | + values : ndarray |
| 228 | + The unique values from the (flattened) input that are repeated. |
| 229 | +
|
| 230 | + counts : ndarray |
| 231 | + Number of times the corresponding 'value' is repeated. |
| 232 | +
|
| 233 | + Notes |
| 234 | + ----- |
| 235 | + In numpy >= 1.9 `numpy.unique` provides similar functionality. The main |
| 236 | + difference is that `find_repeats` only returns repeated values. |
| 237 | +
|
| 238 | + Examples |
| 239 | + -------- |
| 240 | + >>> from scipy import stats |
| 241 | + >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5]) |
| 242 | + RepeatedResults(values=array([ 2.]), counts=array([4], dtype=int32)) |
| 243 | +
|
| 244 | + >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]]) |
| 245 | + RepeatedResults(values=array([ 4., 5.]), counts=array([2, 2], dtype=int32)) |
| 246 | +
|
| 247 | + """ |
| 248 | + RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts')) |
| 249 | + |
| 250 | + if np.asarray(arr).size == 0: |
| 251 | + return RepeatedResults([], []) |
| 252 | + |
| 253 | + v1, v2, n = futil.dfreps(arr) |
| 254 | + return RepeatedResults(v1[:n], v2[:n]) |
0 commit comments