forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_isotonic.pyx
124 lines (106 loc) · 3.96 KB
/
_isotonic.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Author: Nelle Varoquaux, Andrew Tulloch
# Uses the pool adjacent violators algorithm (PAVA), with the
# enhancement of searching for the longest decreasing subsequence to
# pool at each step.
import numpy as np
cimport numpy as np
cimport cython
ctypedef np.float64_t DOUBLE
np.import_array()
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _isotonic_regression(np.ndarray[DOUBLE, ndim=1] y,
np.ndarray[DOUBLE, ndim=1] weight,
np.ndarray[DOUBLE, ndim=1] solution):
cdef:
DOUBLE numerator, denominator, ratio
Py_ssize_t i, pooled, n, k
n = y.shape[0]
# The algorithm proceeds by iteratively updating the solution
# array.
# TODO - should we just pass in a pre-copied solution
# array and mutate that?
for i in range(n):
solution[i] = y[i]
if n <= 1:
return solution
n -= 1
while 1:
# repeat until there are no more adjacent violators.
i = 0
pooled = 0
while i < n:
k = i
while k < n and solution[k] >= solution[k + 1]:
k += 1
if solution[i] != solution[k]:
# solution[i:k + 1] is a decreasing subsequence, so
# replace each point in the subsequence with the
# weighted average of the subsequence.
# TODO: explore replacing each subsequence with a
# _single_ weighted point, and reconstruct the whole
# sequence from the sequence of collapsed points.
# Theoretically should reduce running time, though
# initial experiments weren't promising.
numerator = 0.0
denominator = 0.0
for j in range(i, k + 1):
numerator += solution[j] * weight[j]
denominator += weight[j]
ratio = numerator / denominator
for j in range(i, k + 1):
solution[j] = ratio
pooled = 1
i = k + 1
# Check for convergence
if pooled == 0:
break
return solution
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def _make_unique(np.ndarray[dtype=np.float64_t] X,
np.ndarray[dtype=np.float64_t] y,
np.ndarray[dtype=np.float64_t] sample_weights):
"""Average targets for duplicate X, drop duplicates.
Aggregates duplicate X values into a single X value where
the target y is a (sample_weighted) average of the individual
targets.
Assumes that X is ordered, so that all duplicates follow each other.
"""
unique_values = len(np.unique(X))
if unique_values == len(X):
return X, y, sample_weights
cdef np.ndarray[dtype=np.float64_t] y_out = np.empty(unique_values)
cdef np.ndarray[dtype=np.float64_t] x_out = np.empty(unique_values)
cdef np.ndarray[dtype=np.float64_t] weights_out = np.empty(unique_values)
cdef np.float64_t current_x = X[0]
cdef np.float64_t current_y = 0
cdef np.float64_t current_weight = 0
cdef np.float64_t y_old = 0
cdef int i = 0
cdef int current_count = 0
cdef int j
cdef np.float64_t x
cdef int n_samples = len(X)
for j in range(n_samples):
x = X[j]
if x != current_x:
# next unique value
x_out[i] = current_x
weights_out[i] = current_weight / current_count
y_out[i] = current_y / current_weight
i += 1
current_x = x
current_weight = sample_weights[j]
current_y = y[j] * sample_weights[j]
current_count = 1
else:
current_weight += sample_weights[j]
current_y += y[j] * sample_weights[j]
current_count += 1
x_out[i] = current_x
weights_out[i] = current_weight / current_count
y_out[i] = current_y / current_weight
return x_out, y_out, weights_out