Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize ERF with MKL math function #5

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
add functions
  • Loading branch information
TaoLv committed Apr 18, 2019
commit 672be6a715b59a6c01f745a039d758c8ecf325fe
151 changes: 122 additions & 29 deletions src/operator/mkl_functions-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,36 +44,31 @@ static bool check_type(const int t) {
return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
}

#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
struct name : public mxnet_op::tunable { \
template <typename DType> \
MSHADOW_XINLINE static void Map(const index_t n, const DType *src, float *dst) { \
vs##func(static_cast<MKL_INT>(n), reinterpret_cast<const float *>(src), dst); \
} \
MSHADOW_XINLINE static void Map(const index_t n, const double *src, double *dst) { \
vd##func(static_cast<MKL_INT>(n), src, dst); \
} \
}
// Generates `struct name` exposing static Vectorize(n, src, dst) overloads that
// forward to the MKL VML vector functions vs<func> (float) and vd<func> (double).
// `n` is narrowed from index_t to MKL_INT; src and dst must be arrays of length n.
// NOTE(review): whether src == dst (in-place) is legal depends on the specific
// VML function — confirm per call site.
#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \
vs##func(static_cast<MKL_INT>(n), src, dst); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
vd##func(static_cast<MKL_INT>(n), src, dst); \
} \
};

#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
struct name : public mxnet_op::tunable { \
template <typename DType> \
MSHADOW_XINLINE static void Map(const index_t n, \
const DType *a, \
const DType *b, \
float *c) { \
vs##func(static_cast<MKL_INT>(n), \
reinterpret_cast<const float *>(a), \
reinterpret_cast<const float *>(b), \
c); \
} \
MSHADOW_XINLINE static void Map(const index_t n, \
const double *a, \
const double *b, \
double *c) { \
vd##func(static_cast<MKL_INT>(n), a, b, c); \
} \
}
// Generates `struct name` exposing static Vectorize(n, a, b, c) overloads that
// forward to the binary MKL VML vector functions vs<func> (float) and
// vd<func> (double): c[i] = func(a[i], b[i]) for i in [0, n).
#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
struct name { \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const float *a, \
const float *b, \
float *c) { \
vs##func(static_cast<MKL_INT>(n), a, b, c); \
} \
MSHADOW_XINLINE static void Vectorize(const index_t n, \
const double *a, \
const double *b, \
double *c) { \
vd##func(static_cast<MKL_INT>(n), a, b, c); \
} \
};

MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
Expand Down Expand Up @@ -118,6 +113,104 @@ MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);


// Elementwise scalar subtraction: dst[i] = in[i] - b for i in [0, n).
// `in` and `dst` may be the same buffer (in-place is safe).
// Input is now const-qualified — the routine only reads from `in`.
template <typename DType>
MSHADOW_XINLINE static void sub_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] - b;
}

// Elementwise scalar division: dst[i] = in[i] / b for i in [0, n).
// `in` and `dst` may be the same buffer (in-place is safe).
// No check for b == 0 — callers guarantee a non-zero divisor.
// Input is now const-qualified — the routine only reads from `in`.
template <typename DType>
MSHADOW_XINLINE static void div_(index_t n, const DType *in, DType b, DType *dst) {
  for (index_t i = 0; i < n; i++)
    dst[i] = in[i] / b;
}

// Serial reduction: dst[0] = sum of in[0..n). Accumulates in DType, so the
// double instantiation keeps full precision. Returns 0 for n == 0.
// Input is now const-qualified — the routine only reads from `in`.
template <typename DType>
MSHADOW_XINLINE static void sum_(index_t n, const DType *in, DType *dst) {
  DType sum = DType(0);
  for (index_t i = 0; i < n; i++)
    sum += in[i];

  dst[0] = sum;
}

// dst[0] = maximum element of in[0..n). Requires n >= 1 (reads in[0]
// unconditionally). `n` widened from int to index_t to match the sibling
// helpers, whose callers pass index_t. Accumulating in a local instead of
// re-reading dst[0] each iteration is safe: `__restrict__` already promises
// `in` and `dst` do not alias.
template <typename DType>
MSHADOW_XINLINE static void max_(index_t n, const DType * __restrict__ in, DType *dst) {
  DType best = in[0];
  for (index_t i = 1; i < n; i++)
    best = (best < in[i]) ? in[i] : best;
  dst[0] = best;
}

// LayerNorm on the last dimension
// LayerNorm over the last dimension: for each of the m rows of length n,
//   b = (a - mean) / sqrt(var/n + eps) * gamma + beta
// @param a      input,  m x n (read-only)
// @param b      output, m x n
// @param ws     scratch workspace, m x n (holds squared deviations)
// @param gamma  scale,  length n
// @param beta   shift,  length n
// @param mean   output, length m: per-row mean
// @param var    output, length m: NOTE(review) holds sqrt(var/n + eps), i.e. the
//               std-dev-like scale, not the raw variance — confirm callers expect that.
// Outputs b/ws/mean/var were const-qualified, which cannot compile since they
// are written through; they are now mutable. `square`, `mul` and `add` are the
// MKL-backed structs generated by the macros above, so their static Vectorize
// entry points must be called (a bare `square(...)` is not callable).
template <typename DType>
MSHADOW_XINLINE static void LayerNormLastDim(const index_t m,
                                             const index_t n,
                                             const DType *a,
                                             DType *b,
                                             DType *ws,
                                             const DType *gamma,
                                             const DType *beta,
                                             DType *mean,
                                             DType *var,
                                             const DType eps) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    // The scalar helpers are not const-correct; the cast is safe because they
    // only read from in_offset.
    DType* in_offset = const_cast<DType *>(a) + i * n;
    DType* out_offset = b + i * n;
    DType* ws_offset = ws + i * n;

    sum_(n, in_offset, &(mean[i]));
    mean[i] /= n;
    sub_(n, in_offset, mean[i], out_offset);      // center the row
    square::Vectorize(n, out_offset, ws_offset);  // squared deviations
    sum_(n, ws_offset, &(var[i]));
    var[i] = sqrt(var[i] / n + eps);              // biased (1/n) std-dev

    mul::Vectorize(n, out_offset, gamma, out_offset);
    div_(n, out_offset, var[i], out_offset);
    add::Vectorize(n, out_offset, beta, out_offset);
  }
}

// softmax on the last dimension
// Softmax over the last dimension: each of the m rows of length n in `a` is
// exponentiated and normalized to sum to 1, written to `b`.
// NOTE(review): no max-subtraction is done, so large inputs can overflow
// exp() — confirm inputs are bounded, or use LogSoftmaxLastDim's max trick.
// Fixes vs. the original: "#pragma omp paralle for" was misspelled (the
// pragma was ignored, so the loop ran serially); the output `b` was
// const-qualified yet written through; the row sum accumulated in `float`,
// losing precision for the double instantiation; and `exp` is the
// macro-generated struct, so its static Vectorize entry must be called.
template <typename DType>
MSHADOW_XINLINE static void SoftmaxLastDim(const index_t m,
                                           const index_t n,
                                           const DType *a,
                                           DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    // Scalar helpers are not const-correct; cast is safe (row is only read).
    DType* in_offset = const_cast<DType *>(a) + i * n;
    DType* out_offset = b + i * n;

    exp::Vectorize(n, in_offset, out_offset);
    DType sum = DType(0);
    sum_(n, out_offset, &sum);
    div_(n, out_offset, sum, out_offset);
  }
}

// Log-softmax over the last dimension using the numerically stable
// max-subtraction form: out = (a - max) - log(sum(exp(a - max))).
// Fixes vs. the original: "#pragma parallel for" was missing `omp` (an
// unknown pragma is silently ignored, so the loop ran serially); the output
// `b` was const-qualified yet written through; the local `DType b` shadowed
// the output parameter; and logf() truncated the double instantiation's sum
// to float precision (log() promotes and stays exact).
template <typename DType>
MSHADOW_XINLINE static void LogSoftmaxLastDim(const index_t m,
                                              const index_t n,
                                              const DType *a,
                                              DType *b) {
#pragma omp parallel for
  for (index_t i = 0; i < m; i++) {
    // Scalar helpers are not const-correct; cast is safe (row is only read).
    DType* in_offset = const_cast<DType *>(a) + i * n;
    DType* out_offset = b + i * n;

    DType row_max, logsum;
    max_(n, in_offset, &row_max);
    sub_(n, in_offset, row_max, out_offset);
    // exp is the macro-generated MKL struct; call its static Vectorize entry.
    exp::Vectorize(n, out_offset, out_offset);
    sum_(n, out_offset, &logsum);
    logsum = row_max + log(logsum);
    sub_(n, in_offset, logsum, out_offset);
  }
}

} // namespace mkl_func
} // namespace op
} // namespace mxnet
Expand Down
48 changes: 24 additions & 24 deletions src/operator/tensor/elemwise_unary_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ class UnaryOp : public OpBase {
mkl_func::check_type(type_flag)) {
// set DType as float or double according to type_flag
MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
MKL_OP::Map(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
});
} else {
Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
Expand Down Expand Up @@ -562,7 +562,7 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
NNVM_REGISTER_OP(__name$) \
.set_num_inputs(1) \
.set_num_outputs(1) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_attr<nnvm::FInplaceOption>("FInplaceOption", \
[](const NodeAttrs& attrs){ \
Expand All @@ -578,7 +578,7 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
NNVM_REGISTER_OP(__name$) \
.set_num_inputs(1) \
.set_num_outputs(1) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_attr<nnvm::FInplaceOption>("FInplaceOption", \
[](const NodeAttrs& attrs){ \
Expand All @@ -591,27 +591,27 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
 * * This macro means mxnet was compiled with MKL, so the math function is accelerated by MKL.
 * * It registers FCompute with UnaryOp::MKL_Compute() to complete the math function.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL unary compute.
 * * This macro means mxnet was compiled with MKL, so the math function is accelerated by MKL.
 * * It registers FCompute with UnaryOp::MKL_Compute() to complete the math function.
*/
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kerbel$>)

#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
// Registers an MKL-accelerated unary op that supports dense, row_sparse AND
// csr storage: dense FCompute goes through UnaryOp::MKL_Compute and sparse
// FComputeEx through UnaryOp::MKL_ComputeEx, both falling back to __kernel$
// when MKL cannot be used.
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

/*! \brief MKL unary compute.
 * * This macro means mxnet was compiled with MKL, so the math function is accelerated by MKL.
 * * It registers FCompute with UnaryOp::MKL_Compute() to complete the math function.
*/
// Registers an MKL-accelerated unary op that supports dense and row_sparse
// (but not csr) storage. Fixes the typo `__mkl_kerbel$` -> `__mkl_kernel$` in
// the FComputeEx line: the misspelled parameter does not exist, so any use of
// this macro expanded to an undefined identifier and failed to compile.
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, false>)\
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
.set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__kernel$, __mkl_kernel$>)

// Registers an MKL-accelerated unary op with dense storage only (no
// FComputeEx / sparse path is attached).
#define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, __mkl_kernel$)\
MXNET_MKL_OPERATOR_REGISTER_UNARY(__name$) \
.set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
#endif

/*! \brief Unary compute, with FComputeEx for csr and rsp available */
Expand Down