Skip to content

Commit 9d9968f

Browse files
committed
Update Algorithms
1 parent 7aefa3e commit 9d9968f

File tree

13 files changed

+251
-132
lines changed

13 files changed

+251
-132
lines changed

NN/Basic/Layers.py

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,39 +17,47 @@ def conv_bp(n, n_filters, out_h, out_w, dx_padded,
1717
for j in range(out_h):
1818
for k in range(out_w):
1919
for h in range(dx_padded.shape[1]):
20-
dx_padded[i, h, j * sd:filter_height + j * sd, k * sd:filter_width + k * sd] += (
21-
inner_weight[f][h] * delta[i, f, j, k]
22-
)
20+
jsd, ksd = j * sd, k * sd
21+
for p in range(filter_height):
22+
for q in range(filter_width):
23+
dx_padded[i, h, jsd+p, ksd+q] += (
24+
inner_weight[f][h][p][q] * delta[i, f, j, k]
25+
)
2326

2427

2528
@numba.jit([
2629
"void(int64, int64, int64, int64, float32[:,:,:,:], float32[:,:,:,:],"
27-
"int64, int64, int64)"
30+
"int64, int64, int64, int32[:,:,:,:,:])"
2831
], nopython=True)
2932
def max_pool(n, n_channels, out_h, out_w, x, out,
             pool_height, pool_width, sd, pos_cache):
    """Max-pool `x` into `out`, recording where each maximum was found.

    For every sample `i`, channel `j` and output cell `(k, l)` the window
    starting at `(k * sd, l * sd)` with extent `pool_height x pool_width`
    is scanned.  The largest value is stored in `out[i, j, k, l]` and its
    (row, col) offset inside the window in `pos_cache[i, j, k, l, 0:2]`.
    Ties keep the first position visited (row-major order) because the
    comparison is strict.

    Both `out` and `pos_cache` are filled in place; nothing is returned.
    """
    for i in range(n):
        for j in range(n_channels):
            for k in range(out_h):
                row0 = k * sd
                for l in range(out_w):
                    col0 = l * sd
                    # Seed with the window's top-left element.
                    best = x[i, j, row0, col0]
                    best_p = 0
                    best_q = 0
                    for p in range(pool_height):
                        for q in range(pool_width):
                            candidate = x[i, j, row0 + p, col0 + q]
                            if candidate > best:
                                best = candidate
                                best_p = p
                                best_q = q
                    pos_cache[i, j, k, l, 0] = best_p
                    pos_cache[i, j, k, l, 1] = best_q
                    out[i, j, k, l] = best
3748

3849

3950
@numba.jit([
40-
"void(int64, int64, int64, int64, float32[:,:,:,:],"
41-
"int64, int64, int64, float32[:,:,:,:], float32[:,:,:,:])"
51+
"void(int64, int64, int64, int64, int64, float32[:,:,:,:], float32[:,:,:,:], int32[:,:,:,:,:])"
4252
], nopython=True)
43-
def max_pool_bp(n, n_channels, out_h, out_w, x_cache,
44-
pool_height, pool_width, sd, dx, delta):
53+
def max_pool_bp(n, n_channels, out_h, out_w, sd, dx, delta, pos_cache):
    """Back-propagate `delta` through a max-pool layer into `dx` (in place).

    `pos_cache[i, j, k, l]` holds the (row, col) offset, inside the window
    that starts at `(k * sd, l * sd)`, of the element that produced output
    cell `(k, l)`.  The gradient of that output cell is routed to exactly
    that input element.

    The gradient is *accumulated* (`+=`): when windows overlap, several
    output cells can select the same input element and every one of them
    must contribute.  `dx` is therefore expected to arrive zero-filled;
    with non-overlapping windows the result equals plain assignment.
    """
    for i in range(n):
        for j in range(n_channels):
            for k in range(out_h):
                ksd = k * sd
                for l in range(out_w):
                    lsd = l * sd
                    p = pos_cache[i, j, k, l, 0]
                    q = pos_cache[i, j, k, l, 1]
                    dx[i, j, ksd + p, lsd + q] += delta[i, j, k, l]
5361

5462

5563
# Abstract Layers
@@ -519,11 +527,13 @@ def _activate(self, x, *args):
519527
self._pool_cache["method"] = "reshape"
520528
else:
521529
out = np.zeros((n, n_channels, self.out_h, self.out_w), dtype=np.float32)
530+
pos_cache = np.zeros((n, n_channels, self.out_h, self.out_w, 2), dtype=np.int32)
522531
max_pool(
523532
n, n_channels, self.out_h, self.out_w, x, out,
524-
pool_height, pool_width, sd
533+
pool_height, pool_width, sd, pos_cache
525534
)
526535
self._pool_cache["method"] = "original"
536+
self._pool_cache["pos_cache"] = pos_cache
527537
return out
528538

529539
def _derivative(self, y, *args):
@@ -549,10 +559,8 @@ def _derivative(self, y, *args):
549559
sd = self._stride
550560
dx = np.zeros_like(self.x_cache)
551561
n, n_channels, *_ = self.x_cache.shape
552-
_, pool_height, pool_width = self._shape[1]
553562
max_pool_bp(
554-
n, n_channels, self.out_h, self.out_w, self.x_cache,
555-
pool_height, pool_width, sd, dx, delta
563+
n, n_channels, self.out_h, self.out_w, sd, dx, delta, self._pool_cache["pos_cache"]
556564
)
557565
else:
558566
raise LayerError("Undefined pooling method '{}' found".format(method))

Notebooks/numba/Basic(zh-cn).ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
"+ 注意:\n",
7676
" + `numba`不支持 list comprehension,详情可参见[这里](https://github.com/numba/numba/issues/504)\n",
7777
" + `jit`会在某种程度上“预编译”你的代码,这意味着它会在某种程度上固定住各个变量的数据类型;所以在`jit`下定义数组时,如果想要使用的是`float`数组的话,就不能像上述`wrong_add`里那样用`[0] * len(x)`定义、而应该在`0`后面加一个小数点:`[0.] * len(x)`\n",
78-
" + `jit`能够加速的不限于`for`,但一般而言加速`for`会比较常见、效果也比较显著。我在我实现的`numpy`版本的卷积神经网络(`CNN`)中用了`jit`后、可以把代码加速 **20** 倍左右。具体代码可以参见[这里](https://github.com/carefree0910/MachineLearning/blob/master/NN/Basic/Layers.py#L9),不过如果不想看源代码的话,可以参见[CNN(zh-cn).ipynb][1],我在其中做了一些相应的、比较简单的实验\n",
78+
" + `jit`能够加速的不限于`for`,但一般而言加速`for`会比较常见、效果也比较显著。我在我实现的`numpy`版本的卷积神经网络(`CNN`)中用了`jit`后、可以把代码加速 **60 多倍**。具体代码可以参见[这里](https://github.com/carefree0910/MachineLearning/blob/master/NN/Basic/Layers.py#L9),不过如果不想看源代码的话,可以参见[CNN(zh-cn).ipynb][1],我在其中做了一些相应的、比较简单的实验\n",
7979
"\n",
8080
"[1]: https://github.com/carefree0910/MachineLearning/blob/master/Notebooks/numba/CNN(zh-cn).ipynb"
8181
]

Notebooks/numba/CNN(zh-cn).ipynb

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
"name": "stdout",
1010
"output_type": "stream",
1111
"text": [
12-
"0.00113585\n",
13-
"0.000733545\n",
14-
"0.00112681\n",
15-
"3.63 s ± 194 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
16-
"300 ms ± 20.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
17-
"8.69 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
12+
"0.00116325\n",
13+
"0.000750613\n",
14+
"0.00115522\n",
15+
"3.32 s ± 115 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
16+
"300 ms ± 19.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
17+
"8.34 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
1818
]
1919
}
2020
],
@@ -29,7 +29,6 @@
2929
" window = x[i, ..., j:j+filter_height, p:p+filter_width]\n",
3030
" for q in range(n_filters):\n",
3131
" rs[i, q, j, p] += np.sum(w[q] * window)\n",
32-
" return rs\n",
3332
"\n",
3433
"@nb.jit(nopython=True)\n",
3534
"def jit_conv_kernel(x, w, rs, n, n_channels, height, width, n_filters, filter_height, filter_width, out_h, out_w):\n",
@@ -84,18 +83,49 @@
8483
},
8584
{
8685
"cell_type": "code",
87-
"execution_count": 2,
86+
"execution_count": null,
8887
"metadata": {},
8988
"outputs": [
9089
{
9190
"name": "stdout",
9291
"output_type": "stream",
9392
"text": [
94-
"592 ms ± 25.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
95-
"8.5 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
93+
"288 ms ± 33.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
94+
"64.8 ms ± 815 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
95+
"7.76 ms ± 104 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
9696
]
9797
}
9898
],
99+
"source": [
100+
"@nb.jit(nopython=True)\n",
101+
"def jit_conv_kernel2(x, w, rs, n, n_channels, height, width, n_filters, filter_height, filter_width, out_h, out_w):\n",
102+
" for i in range(n):\n",
103+
" for j in range(out_h):\n",
104+
" for p in range(out_w):\n",
105+
" for q in range(n_filters):\n",
106+
" for r in range(n_channels):\n",
107+
" for s in range(filter_height):\n",
108+
" for t in range(filter_width):\n",
109+
" rs[i, q, j, p] += x[i, r, j+s, p+t] * w[q, r, s, t]\n",
110+
" \n",
111+
"assert np.allclose(conv(x, w, jit_conv_kernel, args), conv(x, w, jit_conv_kernel2, args))\n",
112+
"%timeit conv(x, w, jit_conv_kernel, args)\n",
113+
"%timeit conv(x, w, jit_conv_kernel2, args)\n",
114+
"%timeit cs231n_conv(x, w, args)"
115+
]
116+
},
117+
{
118+
"cell_type": "markdown",
119+
"metadata": {},
120+
"source": [
121+
"+ 可以看到,使用`jit`和使用纯`numpy`进行编程的很大一点不同就是,不要畏惧用`for`;事实上一般来说,代码“长得越像 C”、速度就会越快"
122+
]
123+
},
124+
{
125+
"cell_type": "code",
126+
"execution_count": null,
127+
"metadata": {},
128+
"outputs": [],
99129
"source": [
100130
"def max_pool_kernel(x, rs, *args):\n",
101131
" n, n_channels, pool_height, pool_width, out_h, out_w = args\n",
@@ -115,6 +145,21 @@
115145
" for q in range(out_w):\n",
116146
" window = x[i, j, p:p+pool_height, q:q+pool_width]\n",
117147
" rs[i, j, p, q] += np.max(window)\n",
148+
" \n",
149+
"@nb.jit(nopython=True)\n",
150+
"def jit_max_pool_kernel2(x, rs, *args):\n",
151+
" n, n_channels, pool_height, pool_width, out_h, out_w = args\n",
152+
" for i in range(n):\n",
153+
" for j in range(n_channels):\n",
154+
" for p in range(out_h):\n",
155+
" for q in range(out_w):\n",
156+
" _max = x[i, j, p, q]\n",
157+
" for r in range(pool_height):\n",
158+
" for s in range(pool_width):\n",
159+
" _tmp = x[i, j, p+r, q+s]\n",
160+
" if _tmp > _max:\n",
161+
" _max = _tmp\n",
162+
" rs[i, j, p, q] += _max\n",
118163
"\n",
119164
"def max_pool(x, kernel, args):\n",
120165
" n, n_channels = args[:2]\n",
@@ -130,8 +175,20 @@
130175
"args = (n, n_channels, pool_height, pool_width, out_h, out_w)\n",
131176
"\n",
132177
"assert np.allclose(max_pool(x, max_pool_kernel, args), max_pool(x, jit_max_pool_kernel, args))\n",
178+
"assert np.allclose(max_pool(x, jit_max_pool_kernel, args), max_pool(x, jit_max_pool_kernel2, args))\n",
133179
"%timeit max_pool(x, max_pool_kernel, args)\n",
134-
"%timeit max_pool(x, jit_max_pool_kernel, args)"
180+
"%timeit max_pool(x, jit_max_pool_kernel, args)\n",
181+
"%timeit max_pool(x, jit_max_pool_kernel2, args)"
182+
]
183+
},
184+
{
185+
"cell_type": "code",
186+
"execution_count": null,
187+
"metadata": {},
188+
"outputs": [],
189+
"source": [
190+
"%load_ext line_profiler\n",
191+
"%lprun -f max_pool max_pool(x, jit_max_pool_kernel2, args)"
135192
]
136193
}
137194
],
@@ -156,5 +213,5 @@
156213
}
157214
},
158215
"nbformat": 4,
159-
"nbformat_minor": 2
216+
"nbformat_minor": 1
160217
}

Util/Bases.py

Lines changed: 52 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22
import cv2
33
import time
44
import math
5+
import ctypes
6+
import multiprocessing
57
import numpy as np
68
import tensorflow as tf
79
import matplotlib.pyplot as plt
810
from PIL import Image
11+
from multiprocessing import Pool
912
from mpl_toolkits.mplot3d import Axes3D
1013

1114
from Util.Util import VisUtil
@@ -214,63 +217,57 @@ def f1_score(y, y_pred):
214217

215218
# noinspection PyUnusedLocal
216219
@staticmethod
217-
def _multi_clf(x, clfs, task, kwargs, stack=np.vstack):
218-
# n_cores = kwargs.get("n_cores", 2)
219-
# n_cores = multiprocessing.cpu_count() if n_cores <= 0 else n_cores
220-
# if n_cores == 1:
221-
# matrix = np.array([clf.predict(x, n_cores=1) for clf in clfs], dtype=np.float32).T
222-
# else:
223-
# pool = Pool(max_workers=n_cores)
224-
# batch_size = int(len(clfs) / n_cores)
225-
# batch_clfs, cursor = [], 0
226-
# for i in range(n_cores):
227-
# if i == n_cores - 1:
228-
# batch_clfs.append(clfs[cursor:])
229-
# else:
230-
# batch_clfs.append(clfs[cursor:cursor + batch_size])
231-
# cursor += batch_size
232-
# x_size = np.prod(x.shape) # type: int
233-
# shared_base = multiprocessing.Array(ctypes.c_float, int(x_size))
234-
# shared_matrix = np.ctypeslib.as_array(shared_base.get_obj()).reshape(x.shape)
235-
# shared_matrix[:] = x
236-
# del x, clfs, shared_base
237-
# matrix = stack(
238-
# pool.map(task, ((shared_matrix, clfs, 1) for clfs in batch_clfs))
239-
# ).T.astype(np.float32)
240-
# return matrix
241-
return np.array([clf.predict(x) for clf in clfs], dtype=np.float32).T
220+
def _multi_clf(x, clfs, task, kwargs, stack=np.vstack, target="single"):
    """Collect the predictions of several classifiers on `x`.

    :param x: input array handed to every classifier.
    :param clfs: sequence of objects exposing `predict`.
    :param task: callable mapped by the worker pool over
        `(shared_x, clf_batch, n_cores)` tuples; only used when
        `target == "parallel"` and more than one worker is needed.
        NOTE(review): it is presumably expected to return one row per
        classifier in its batch so `stack` can combine them -- confirm.
    :param kwargs: dict; `kwargs["n_cores"]` (default 2) sets the worker
        count, and a value <= 0 means "use every CPU".
    :param stack: function that merges the per-batch results.
    :param target: anything other than "parallel" selects the serial path.
    :return: float32 array, transposed so classifiers lie along the last axis.
    """
    if target != "parallel":
        return np.array([clf.predict(x) for clf in clfs], dtype=np.float32).T

    n_cores = kwargs.get("n_cores", 2)
    if n_cores <= 0:
        n_cores = multiprocessing.cpu_count()
    # More workers than classifiers would only produce empty batches.
    n_cores = min(n_cores, len(clfs))
    if n_cores <= 1:
        return np.array(
            [clf.predict(x, n_cores=1) for clf in clfs], dtype=np.float32
        ).T

    # Split the classifiers into n_cores contiguous batches whose sizes
    # differ by at most one, so that no classifier is dropped when
    # len(clfs) is not a multiple of n_cores.
    base, extra = divmod(len(clfs), n_cores)
    batches, start = [], 0
    for i in range(n_cores):
        stop = start + base + (1 if i < extra else 0)
        batches.append(clfs[start:stop])
        start = stop

    # Copy x into a float32 buffer allocated through multiprocessing.
    x_size = np.prod(x.shape)  # type: int
    shared_base = multiprocessing.Array(ctypes.c_float, int(x_size))
    shared_matrix = np.ctypeslib.as_array(shared_base.get_obj()).reshape(x.shape)
    shared_matrix[:] = x

    # The context manager tears the pool down once map() has returned,
    # so worker processes are not leaked.
    with Pool(processes=n_cores) as pool:
        results = pool.map(
            task, [(shared_matrix, batch, n_cores) for batch in batches]
        )
    return stack(results).T.astype(np.float32)
242239

243240
# noinspection PyUnusedLocal
244-
def _multi_data(self, x, task, kwargs, stack=np.hstack):
245-
# n_cores = kwargs.get("n_cores", 2)
246-
# n_cores = multiprocessing.cpu_count() if n_cores <= 0 else n_cores
247-
# if n_cores == 1:
248-
# matrix = task((x, self, n_cores))
249-
# else:
250-
# pool = Pool(max_workers=n_cores)
251-
# batch_size = int(len(x) / n_cores)
252-
# batch_base, batch_data, cursor = [], [], 0
253-
# x_dim = x.shape[1]
254-
# for i in range(n_cores):
255-
# if i == n_cores - 1:
256-
# batch_data.append(x[cursor:])
257-
# batch_base.append(multiprocessing.Array(ctypes.c_float, (len(x) - cursor) * x_dim))
258-
# else:
259-
# batch_data.append(x[cursor:cursor + batch_size])
260-
# batch_base.append(multiprocessing.Array(ctypes.c_float, batch_size * x_dim))
261-
# cursor += batch_size
262-
# shared_arrays = [
263-
# np.ctypeslib.as_array(shared_base.get_obj()).reshape(-1, x_dim)
264-
# for shared_base in batch_base
265-
# ]
266-
# for i, data in enumerate(batch_data):
267-
# shared_arrays[i][:] = data
268-
# del x, batch_base, batch_data
269-
# matrix = stack(
270-
# pool.map(task, ((x, self, n_cores) for x in shared_arrays))
271-
# )
272-
# return matrix.astype(np.float32)
273-
return task((x, self, 1))
241+
def _multi_data(self, x, task, kwargs, stack=np.hstack, target="single"):
    """Run `task` over `x`, optionally splitting the rows across processes.

    :param x: input array; in the parallel path it is split along axis 0.
    :param task: callable receiving a `(data, self, n_cores)` tuple.
    :param kwargs: dict; `kwargs["n_cores"]` (default 2) sets the worker
        count, and a value <= 0 means "use every CPU".
    :param stack: function that merges the per-batch results.
    :param target: anything other than "parallel" selects the serial path,
        which returns `task((x, self, 1))` unchanged.
    :return: the task result; cast to float32 in the parallel path.
    """
    if target != "parallel":
        return task((x, self, 1))

    n_cores = kwargs.get("n_cores", 2)
    if n_cores <= 0:
        n_cores = multiprocessing.cpu_count()
    # Never use more workers than rows: that would create empty batches.
    n_cores = max(1, min(n_cores, len(x)))
    if n_cores == 1:
        return task((x, self, n_cores)).astype(np.float32)

    # Equal-sized contiguous batches; the last one absorbs the remainder.
    batch_size = len(x) // n_cores
    bounds = [i * batch_size for i in range(n_cores)] + [len(x)]
    shared_arrays = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        batch = x[start:stop]
        # Copy each batch into its own float32 buffer allocated through
        # multiprocessing, keeping the batch's own shape (not only 2-D).
        base = multiprocessing.Array(ctypes.c_float, int(batch.size))
        shared = np.ctypeslib.as_array(base.get_obj()).reshape(batch.shape)
        shared[:] = batch
        shared_arrays.append(shared)

    # The context manager tears the pool down once map() has returned,
    # so worker processes are not leaked.
    with Pool(processes=n_cores) as pool:
        results = pool.map(
            task, [(chunk, self, n_cores) for chunk in shared_arrays]
        )
    return stack(results).astype(np.float32)
274271

275272
@staticmethod
276273
def _get_train_repeat(x, batch_size):

0 commit comments

Comments
 (0)