sandepp123
diff --git a/‎scipy/stats/_stats_mstats_common.py‎
Lines changed: 254 additions & 0 deletions b/‎scipy/stats/_stats_mstats_common.py‎
Lines changed: 254 additions & 0 deletions
diff --git a/‎scipy/stats/mstats.py‎
Lines changed: 3 additions & 1 deletion b/‎scipy/stats/mstats.py‎
Lines changed: 3 additions & 1 deletion
@@ -0,0 +1,254 @@
+from collections import namedtuple
+
+import numpy as np
+
+from . import distributions
+from . import futil
+
+
+# Names exported by ``from <this module> import *``; every function defined
+# below is listed here.
+__all__ = ['find_repeats', 'linregress', 'theilslopes']
+
+
+# Built once at import time so every call returns the same result type.
+LinregressResult = namedtuple('LinregressResult', ('slope', 'intercept',
+                                                   'rvalue', 'pvalue',
+                                                   'stderr'))
+
+
+def linregress(x, y=None):
+    """
+    Calculate a least-squares regression line for two sets of measurements.
+
+    Parameters
+    ----------
+    x, y : array_like
+        Two sets of measurements.  Both arrays should have the same length.
+        If only x is given (and y=None), then it must be a two-dimensional
+        array where one dimension has length 2.  The two sets of measurements
+        are then found by splitting the array along the length-2 dimension.
+
+    Returns
+    -------
+    slope : float
+        slope of the regression line
+    intercept : float
+        intercept of the regression line
+    rvalue : float
+        correlation coefficient
+    pvalue : float
+        two-sided p-value for a hypothesis test whose null hypothesis is
+        that the slope is zero.
+    stderr : float
+        Standard error of the estimated slope.
+
+    Raises
+    ------
+    ValueError
+        If only `x` is given and its shape is not (2, N) or (N, 2), or if
+        either set of measurements is empty.
+
+    Notes
+    -----
+    When exactly two points are given the fit is exact and there are no
+    residual degrees of freedom.  In that case `stderr` is reported as 0.0
+    and `pvalue` as 1.0 if both `y` values are equal (zero slope) and 0.0
+    otherwise.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> np.random.seed(12345678)
+    >>> x = np.random.random(10)
+    >>> y = np.random.random(10)
+    >>> slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
+
+    # To get coefficient of determination (r_squared)
+
+    >>> print("r-squared: %f" % r_value**2)
+    r-squared: 0.080402
+
+    """
+    # Keeps the t-statistic denominator non-zero when |r| == 1.
+    TINY = 1.0e-20
+    if y is None:  # x is a (2, N) or (N, 2) shaped array_like
+        x = np.asarray(x)
+        # Check ndim first so 0-d and 1-d input reaches the ValueError below
+        # instead of failing with IndexError/TypeError.
+        if x.ndim >= 2 and x.shape[0] == 2:
+            x, y = x
+        elif x.ndim >= 2 and x.shape[1] == 2:
+            x, y = x.T
+        else:
+            msg = ("If only `x` is given as input, it has to be of shape "
+                   "(2, N) or (N, 2), provided shape was %s" % str(x.shape))
+            raise ValueError(msg)
+    else:
+        x = np.asarray(x)
+        y = np.asarray(y)
+
+    if x.size == 0 or y.size == 0:
+        raise ValueError("Inputs must not be empty.")
+
+    n = len(x)
+    xmean = np.mean(x, None)
+    ymean = np.mean(y, None)
+
+    # average sum of squares:
+    ssxm, ssxym, ssyxm, ssym = np.cov(x, y, bias=1).flat
+    r_num = ssxym
+    r_den = np.sqrt(ssxm * ssym)
+    if r_den == 0.0:
+        r = 0.0
+    else:
+        r = r_num / r_den
+        # test for numerical error propagation
+        if r > 1.0:
+            r = 1.0
+        elif r < -1.0:
+            r = -1.0
+
+    df = n - 2
+    slope = r_num / ssxm
+    intercept = ymean - slope*xmean
+    if n == 2:
+        # Two points: df == 0, so the general formulas below would pass an
+        # invalid df to the t distribution and divide by zero.  Report the
+        # degenerate values directly instead.
+        if y[0] == y[1]:
+            prob = 1.0
+        else:
+            prob = 0.0
+        sterrest = 0.0
+    else:
+        t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
+        prob = 2 * distributions.t.sf(np.abs(t), df)
+        sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
+
+    return LinregressResult(slope, intercept, r, prob, sterrest)
+
+
+def theilslopes(y, x=None, alpha=0.95):
+    r"""
+    Computes the Theil-Sen estimator for a set of points (x, y).
+
+    `theilslopes` implements a method for robust linear regression.  It
+    computes the slope as the median of all slopes between paired values.
+
+    Parameters
+    ----------
+    y : array_like
+        Dependent variable.
+    x : array_like or None, optional
+        Independent variable. If None, use ``arange(len(y))`` instead.
+    alpha : float, optional
+        Confidence degree between 0 and 1. Default is 95% confidence.
+        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
+        interpreted as "find the 90% confidence interval".
+
+    Returns
+    -------
+    medslope : float
+        Theil slope.
+    medintercept : float
+        Intercept of the Theil line, as ``median(y) - medslope*median(x)``.
+    lo_slope : float
+        Lower bound of the confidence interval on `medslope`.
+    up_slope : float
+        Upper bound of the confidence interval on `medslope`.
+
+    Raises
+    ------
+    ValueError
+        If `x` is given and its (flattened) length differs from that of `y`.
+
+    Notes
+    -----
+    The implementation of `theilslopes` follows [1]_. The intercept is
+    not defined in [1]_, and here it is defined as ``median(y) -
+    medslope*median(x)``, which is given in [3]_. Other definitions of
+    the intercept exist in the literature. A confidence interval for
+    the intercept is not given as this question is not addressed in
+    [1]_.
+
+    Only pairs of points with distinct `x` values contribute a slope.
+
+    References
+    ----------
+    .. [1] P.K. Sen, "Estimates of the regression coefficient based on
+           Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
+    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
+           regression analysis I, II and III",  Nederl. Akad. Wetensch., Proc.
+           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
+    .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
+           John Wiley and Sons, New York, pp. 493.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> import matplotlib.pyplot as plt
+
+    >>> x = np.linspace(-5, 5, num=150)
+    >>> y = x + np.random.normal(size=x.size)
+    >>> y[11:15] += 10  # add outliers
+    >>> y[-5:] -= 7
+
+    Compute the slope, intercept and 90% confidence interval.  For comparison,
+    also compute the least-squares fit with `linregress`:
+
+    >>> res = stats.theilslopes(y, x, 0.90)
+    >>> lsq_res = stats.linregress(x, y)
+
+    Plot the results. The Theil-Sen regression line is shown in red, with the
+    dashed red lines illustrating the confidence interval of the slope (note
+    that the dashed red lines are not the confidence interval of the regression
+    as the confidence interval of the intercept is not included). The green
+    line shows the least-squares fit for comparison.
+
+    >>> fig = plt.figure()
+    >>> ax = fig.add_subplot(111)
+    >>> ax.plot(x, y, 'b.')
+    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
+    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
+    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
+    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
+    >>> plt.show()
+
+    """
+    y = np.asarray(y).flatten()
+    if x is None:
+        x = np.arange(len(y), dtype=float)
+    else:
+        x = np.asarray(x, dtype=float).flatten()
+        if len(x) != len(y):
+            raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
+
+    # Compute sorted slopes only when deltax > 0.  Each unordered pair with
+    # distinct x values is selected exactly once by this mask.
+    deltax = x[:, np.newaxis] - x
+    deltay = y[:, np.newaxis] - y
+    increasing = deltax > 0
+    slopes = deltay[increasing] / deltax[increasing]
+    slopes.sort()
+    medslope = np.median(slopes)
+    medinter = np.median(y) - medslope * np.median(x)
+    # Now compute confidence intervals
+    if alpha > 0.5:
+        alpha = 1. - alpha
+
+    # alpha <= 0.5 here, so z is the lower-tail normal quantile (z <= 0).
+    z = distributions.norm.ppf(alpha / 2.)
+    # This implements (2.6) from Sen (1968)
+    _, nxreps = find_repeats(x)
+    _, nyreps = find_repeats(y)
+    # Evaluate the tie corrections on float arrays: a single vectorised
+    # np.sum replaces the deprecated np.sum(<generator>) form, and the cubic
+    # term cannot overflow a fixed-width integer count dtype.
+    nxreps = np.asarray(nxreps, dtype=float)
+    nyreps = np.asarray(nyreps, dtype=float)
+    nt = len(slopes)       # N in Sen (1968)
+    ny = len(y)            # n in Sen (1968)
+    # Equation 2.6 in Sen (1968):
+    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
+                     np.sum(nxreps * (nxreps-1) * (2*nxreps + 5)) -
+                     np.sum(nyreps * (nyreps-1) * (2*nyreps + 5)))
+    # Find the confidence interval indices in `slopes`, clamped to the
+    # valid index range.
+    sigma = np.sqrt(sigsq)
+    Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
+    Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
+    delta = slopes[[Rl, Ru]]
+    return medslope, medinter, delta[0], delta[1]
+
+
+# Built once at import time so every call returns the same result type.
+RepeatedResults = namedtuple('RepeatedResults', ('values', 'counts'))
+
+
+def find_repeats(arr):
+    """
+    Find repeats and repeat counts.
+
+    Parameters
+    ----------
+    arr : array_like
+        Input array
+
+    Returns
+    -------
+    values : ndarray
+        The unique values from the (flattened) input that are repeated.
+
+    counts : ndarray
+        Number of times the corresponding 'value' is repeated.
+
+    Notes
+    -----
+    In numpy >= 1.9 `numpy.unique` provides similar functionality. The main
+    difference is that `find_repeats` only returns repeated values.
+
+    For empty input, two empty arrays are returned.
+
+    Examples
+    --------
+    >>> from scipy import stats
+    >>> stats.find_repeats([2, 1, 2, 3, 2, 2, 5])
+    RepeatedResults(values=array([ 2.]), counts=array([4], dtype=int32))
+
+    >>> stats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
+    RepeatedResults(values=array([ 4.,  5.]), counts=array([2, 2], dtype=int32))
+
+    """
+    if np.asarray(arr).size == 0:
+        # Return empty arrays rather than lists so the result type matches
+        # the documented ndarray return for every input.
+        return RepeatedResults(np.array([], dtype=float),
+                               np.array([], dtype=int))
+
+    # dfreps returns two buffers plus the number of leading entries to keep.
+    v1, v2, n = futil.dfreps(arr)
+    return RepeatedResults(v1[:n], v2[:n])
@@ -96,4 +96,6 @@
 
 from .mstats_basic import *
 from .mstats_extras import *
-from scipy.stats import gmean, hmean
+# Functions that support masked array input in stats but need to be kept in the
+# mstats namespace for backwards compatibility:
+from scipy.stats import gmean, hmean, zmap, zscore, chisquare