
Commit 251634f

clcarwin authored and piiswrong committed
add NNPACK support for high convolution inference perf (apache#3666)
* add NNPACK support for high convolution inference perf
* set USE_NNPACK to 0
* Fix header declaration
* Fix input_size init value:
  1. data's shape is BxCxHxW, input_size is {width,height}
  2. improve algorithm selection policy
* Fix lint error
1 parent 0071324 commit 251634f

File tree

4 files changed (+142 -0)

Makefile
make/config.mk
src/operator/convolution.cc
src/operator/nnpack/nnpack_convolution-inl.h

Makefile (+6)

@@ -58,6 +58,12 @@ ifeq ($(USE_OPENMP), 1)
 	CFLAGS += -fopenmp
 endif
 
+ifeq ($(USE_NNPACK), 1)
+	CFLAGS += -DMXNET_USE_NNPACK=1
+	CFLAGS += -DMXNET_USE_NNPACK_NUM_THREADS=$(USE_NNPACK_NUM_THREADS)
+	LDFLAGS += -lnnpack
+endif
+
 ifeq ($(USE_MKL2017), 1)
 	CFLAGS += -DMXNET_USE_MKL2017=1
 	CFLAGS += -DUSE_MKL=1
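The two -D flags surface both build options to the C++ sources as the preprocessor macros MXNET_USE_NNPACK and MXNET_USE_NNPACK_NUM_THREADS, which the operator code below checks at compile time; -lnnpack links against the NNPACK library itself.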

make/config.mk (+4)

@@ -65,6 +65,10 @@ USE_MKL2017 = 0
 # whether use MKL2017 experimental feature for high performance
 USE_MKL2017_EXPERIMENTAL = 0
 
+# whether use NNPACK library
+USE_NNPACK = 0
+USE_NNPACK_NUM_THREADS = 4
+
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas
 # in default use atlas for linux while apple for osx
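With these defaults, NNPACK stays disabled unless explicitly requested. Assuming NNPACK and its pthreadpool dependency are already installed where the compiler and linker can find them, the feature can presumably be enabled by setting USE_NNPACK = 1 (and, if desired, USE_NNPACK_NUM_THREADS) in make/config.mk, or by overriding the same variables on the command line, e.g. make USE_NNPACK=1 USE_NNPACK_NUM_THREADS=4.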

src/operator/convolution.cc (+15)

@@ -11,6 +11,9 @@
 #include "./mkl/mkl_memory-inl.h"
 #include "./mkl/mkl_convolution-inl.h"
 #endif  // MXNET_USE_MKL2017
+#if MXNET_USE_NNPACK == 1
+#include "./nnpack/nnpack_convolution-inl.h"
+#endif  // MXNET_USE_NNPACK
 
 namespace mxnet {
 namespace op {
@@ -32,6 +35,18 @@ Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
       break;
     }
   }
+#endif
+#if MXNET_USE_NNPACK == 1
+  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
+      && param.kernel.ndim() == 2 && (!param.no_bias)
+      && param.num_group == 1) {
+    switch (dtype) {
+      case mshadow::kFloat32:
+        return new NNPACKConvolutionOp<cpu, float>(param);
+      default:
+        break;
+    }
+  }
 #endif
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     op = new ConvolutionOp<cpu, DType>(param);
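For readers skimming the dispatch above: NNPACK is only selected for float32, 2-D, non-dilated, single-group convolutions that carry a bias term; everything else falls through to the stock ConvolutionOp. A minimal sketch of that predicate follows; the helper name SupportsNNPACKConvolution is hypothetical, not part of this patch, and assumes the same headers as convolution.cc.

// Hypothetical helper (not in the commit) restating the CPU dispatch above.
inline bool SupportsNNPACKConvolution(const ConvolutionParam &param, int dtype) {
  // NNPACKConvolutionOp is instantiated for float32 only.
  if (dtype != mshadow::kFloat32) return false;
  // 2-D, non-dilated, single-group convolution with a bias term.
  return param.kernel.ndim() == 2 &&
         param.dilate[0] == 1 && param.dilate[1] == 1 &&
         !param.no_bias && param.num_group == 1;
}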
src/operator/nnpack/nnpack_convolution-inl.h (+117)

@@ -0,0 +1,117 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file nnpack_convolution-inl.h
+ * \brief
+ * \author Carwin
+ */
+#ifndef MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../convolution-inl.h"
+#include "nnpack.h"
+
+namespace mxnet {
+namespace op {
+
+class NNPACKInitialize {
+ public:
+  pthreadpool_t threadpool;
+
+ public:
+  NNPACKInitialize() {
+    nnp_status status = nnp_initialize();
+    if (nnp_status_success != status) {
+      LOG(FATAL) << "nnp_initialize failed status=" << status;
+    }
+    int num_threads = MXNET_USE_NNPACK_NUM_THREADS;
+    this->threadpool = pthreadpool_create(num_threads);
+  }
+  virtual ~NNPACKInitialize() {
+    nnp_status status = nnp_deinitialize();
+    if (nnp_status_success != status) {
+      LOG(FATAL) << "nnp_deinitialize failed status=" << status;
+    }
+    pthreadpool_destroy(threadpool);
+  }
+};
+
+static NNPACKInitialize nnpackinitialize;
+
+template <typename xpu, typename DType>
+class NNPACKConvolutionOp : public ConvolutionOp<xpu, DType> {
+ private:
+  ConvolutionParam param_;
+
+ public:
+  explicit NNPACKConvolutionOp(ConvolutionParam p)
+      : ConvolutionOp<xpu, DType>(p) {
+    this->param_ = p;
+  }
+
+ public:
+  virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = in_data[conv::kData].get<xpu, 4, DType>(s);
+    Shape<3> wmat_shape =
+        Shape3(param_.num_group, param_.num_filter / param_.num_group,
+               data.shape_[1] / param_.num_group * param_.kernel[0] *
+                   param_.kernel[1]);
+    Tensor<xpu, 3, DType> wmat =
+        in_data[conv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
+    Tensor<xpu, 4, DType> out = out_data[conv::kOut].get<xpu, 4, DType>(s);
+
+    // nnp_convolution_inference optimize for batch_size==1
+    // when W or H less than 16, ConvolutionOp fast than nnpack's convolution
+    if ((data.shape_[0] != 1) || (data.shape_[2] < 16) ||
+        (data.shape_[3] < 16)) {
+      ConvolutionOp<xpu, DType>::Forward(ctx, in_data, req, out_data, aux_args);
+    } else {
+      nnp_size input_size = {data.shape_[3], data.shape_[2]};
+      nnp_padding input_padding = {param_.pad[0], param_.pad[1], param_.pad[0],
+                                   param_.pad[1]};
+      nnp_size kernel_size = {param_.kernel[1], param_.kernel[0]};
+      nnp_size output_subsampling = {param_.stride[1], param_.stride[0]};
+      Tensor<xpu, 1, DType> bias = in_data[conv::kBias].get<xpu, 1, DType>(s);
+
+      nnp_convolution_algorithm algorithm = nnp_convolution_algorithm_auto;
+      if ((data.shape_[2] < 32) || (data.shape_[3] < 32)) {
+        algorithm = nnp_convolution_algorithm_implicit_gemm;
+      }
+
+      nnp_status status = nnp_convolution_inference(
+          algorithm,            // enum nnp_convolution_algorithm algorithm,
+          nnp_convolution_transform_strategy_tuple_based,
+          data.shape_[1],       // size_t input_channels,
+          param_.num_filter,    // size_t output_channels,
+          input_size,           // struct nnp_size input_size,
+          input_padding,        // struct nnp_padding input_padding,
+          kernel_size,          // struct nnp_size kernel_size,
+          output_subsampling,   // struct nnp_size output_subsampling,
+          data.dptr_,           // const float input[],
+          wmat.dptr_,           // const float kernel[],
+          bias.dptr_,           // const float bias[],
+          out.dptr_,            // float output[],
+          nnpackinitialize.threadpool,  // pthreadpool_t threadpool,
+          nullptr);
+      if (nnp_status_success != status) {
+        LOG(FATAL) << "nnp_convolution_inference failed status=" << status;
+      }
+    }
+  }
+};  // class NNPACKConvolutionOp
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NNPACK_NNPACK_CONVOLUTION_INL_H_
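The Forward implementation above encodes a small runtime policy: nnp_convolution_inference is optimized for batch size 1, the stock ConvolutionOp is faster when either spatial dimension is below 16, and inputs smaller than 32x32 use the implicit-GEMM algorithm rather than NNPACK's automatic selection. A minimal sketch of that policy follows; the helper name ChooseNNPACKPath and its out-parameter are illustrative only, not part of the patch.

// Hypothetical restatement (not in the commit) of the fallback and
// algorithm-selection logic in NNPACKConvolutionOp::Forward; assumes
// nnpack.h for the nnp_convolution_algorithm enum.
inline nnp_convolution_algorithm ChooseNNPACKPath(size_t batch, size_t height,
                                                  size_t width, bool *use_nnpack) {
  // NNPACK's inference entry point is tuned for batch size 1, and the
  // default ConvolutionOp wins below 16 pixels in either dimension.
  *use_nnpack = (batch == 1) && (height >= 16) && (width >= 16);
  // Below 32x32, implicit GEMM beats NNPACK's automatic choice.
  return (height < 32 || width < 32) ? nnp_convolution_algorithm_implicit_gemm
                                     : nnp_convolution_algorithm_auto;
}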
