
Latest optimizations #10


Closed
wants to merge 26 commits

Commits (26)
daa76e1
enhancement with relu primitive reuse
gzmkl May 18, 2018
2bcd873
revert mkl_conv_ops.cc to avoid PR review confusion
gzmkl May 21, 2018
f369de2
code refactoring per Rasmus's suggestions on PR 19754
gzmkl Jun 12, 2018
c299b9c
[Intel MKL] Optimized implementation of GatherND using OpenMP
nhasabni Jun 22, 2018
3ee53e6
Merge branch 'master' into primreuse_relu
agramesh1 Jul 5, 2018
f814e24
Replace to use fast reorder path in MklRelu op.
yiqianglee Jul 15, 2018
f0f9a61
Merge branch 'master' into primreuse_relu
rmlarsen Jul 31, 2018
5ab0c8a
Fusing convolution and relu
agramesh1 Aug 1, 2018
478c416
Code changes based on Rasmus's code review suggestions on PR19403 and…
gzmkl Aug 1, 2018
8c23e1a
Conv3d enhancement
jzhoulon Aug 2, 2018
82e6460
rename CopyAttrsConv2D->CopyAttrsConv
jzhoulon Aug 6, 2018
b18dc21
Merge branch 'master' into conv3d
agramesh1 Aug 7, 2018
ffc12e1
rebase mkl_util.h with master branch
gzmkl Aug 7, 2018
0263310
Change to match clang formatting
nhasabni Aug 10, 2018
62191da
Merge branch 'master' into conv3d
agramesh1 Aug 11, 2018
9d652aa
Merge branch 'master' into agramesh/relu_fusion
agramesh1 Aug 11, 2018
83d85b0
Merge conflicts
agramesh1 Aug 11, 2018
135ac89
enable pooling3D op
gzmkl Aug 13, 2018
b2d8ad9
Merge branch 'master' into primreuse_relu
agramesh1 Aug 14, 2018
7b35aac
Replaced INTEL_MKL_ML with new macro INTEL_MKL_ML_ONLY
agramesh1 Aug 14, 2018
aaf3ddd
Merge branch 'conv3d' into 3dconv_fusion
aramesh1 Aug 15, 2018
de5d75c
Merge branch 'agramesh/relu_fusion' into 3dconv_fusion
aramesh1 Aug 15, 2018
e99cfa2
Merge branch 'primreuse_relu' into latest_optimizations
aramesh1 Aug 15, 2018
d69db87
Merge branch 'pooling3d' into latest_optimizations
aramesh1 Aug 15, 2018
b957d18
Merge branch 'nhasabni/gathernd' into latest_optimizations
aramesh1 Aug 15, 2018
4596ba1
Merge branch '3dconv_fusion' into latest_optimizations
aramesh1 Aug 15, 2018
479 changes: 436 additions & 43 deletions tensorflow/core/graph/mkl_layout_pass.cc

Large diffs are not rendered by default.
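The mkl_layout_pass.cc changes themselves are collapsed here, but the new tests in mkl_layout_pass_test.cc below exercise a Conv2D + BiasAdd + Relu merge (plus a bias-less Conv2D + Relu variant and a negative case where Conv2D feeds two consumers). As a minimal standalone sketch, assuming simplified stand-in types rather than TensorFlow's real Graph/Node API (Node and IsFusableConvBiasRelu are hypothetical, not the actual pass code), the pattern check boils down to walking the chain backwards and requiring single fan-out on the intermediate nodes:

#include <string>
#include <vector>

// Simplified stand-in for a graph node: an op name, ordered data inputs, and
// a count of how many downstream nodes consume this node's output.
struct Node {
  std::string op;
  std::vector<Node*> inputs;
  int out_degree = 0;
};

// True if `relu` terminates a fusable Conv2D -> BiasAdd -> Relu chain. Each
// intermediate result must have exactly one consumer; otherwise fusing would
// remove a tensor that some other node (e.g. the Zeta node in the negative
// test) still needs.
bool IsFusableConvBiasRelu(const Node* relu) {
  if (relu->op != "Relu" || relu->inputs.size() != 1) return false;
  const Node* bias_add = relu->inputs[0];
  if (bias_add->op != "BiasAdd" || bias_add->inputs.size() != 2) return false;
  if (bias_add->out_degree != 1) return false;
  const Node* conv = bias_add->inputs[0];
  return conv->op == "Conv2D" && conv->out_degree == 1;
}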

116 changes: 115 additions & 1 deletion tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -2147,6 +2147,28 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
"DMT/_1->C:3");
}

// BiasAdd is preceded by a MatMul rather than a Conv2D, so the
// Conv2DWithBias merge should not be done in such a case.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoConv2D) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'MatMul'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NHCW' } }"
" input: ['C', 'D'] }");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(MatMul);D(Input);"
"E(BiasAdd)|A->C;"
"B->C:1;C->E;D->E:1");
}

// Test set 2: BiasAddGrad + Conv2DBackpropFilter fusion tests

TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
@@ -2174,6 +2196,97 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
"DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
}

// Test set 3: Conv2DWithBias + Relu -> Conv2DWithBiasAndRelu merge tests
// C=Conv2D(A,B); E=BiasAdd(C,D); F=Relu(E); Z=Zeta(F,Y)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBiasAndRelu_Positive) {
CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" input: ['C', 'D'] }"
"node { name: 'F' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['E'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['F', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
"DMT/_2(Const);F(_MklConv2DWithBiasAndRelu);Y(Input);Z(Zeta)|A->F;"
"A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;B->F:1;D->F:2;DMT/_0->F:3;DMT/_1->F:4;"
"DMT/_2->F:5;F->Z;Y->Z:1");
}

// Test set 5:
// C=Conv2D(A,B); D=Relu(C); Z=Zeta(D,Y)
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithRelu_Positive) {
CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" attr { key: 'dilations' value { list: {i: 1, i:2, i:2, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C'] }"
"node { name: 'Y' op: 'Input'}"
"node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['D', 'Y']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);D(_MklConv2DWithRelu);DMT/_0(Const);"
"DMT/_1(Const);Y(Input);Z(Zeta)|A->D;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->D:1;D->Z;DMT/_0->D:2;DMT/_1->D:3;"
"Y->Z:1");
}

// Conv2D+ReLU fusion negative test - Conv2D feeds multiple nodes.
TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithRelu_Negative1) {
CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" attr { key: 'data_format' value { s: 'NCHW' } }"
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" attr { key: 'dilations' value { list: {i: 1, i:2, i:2, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C'] }"
"node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklConv2D);D(_MklRelu);DMT/_0(Const);"
"DMT/_1(Const);Z(Zeta)|A->C;A:control->DMT/_0:control;"
"A:control->DMT/_1:control;B->C:1;C->D;C->Z;C:2->D:1;D->Z:1;"
"DMT/_0->C:2;DMT/_1->C:3");
}

// BiasAddGrad fusion in the presence of BackpropFilter. But nodes do not match
// criteria for rewrite. So rewrite should not happen. 3rd input of
// Conv2DBackpropFilter is different than input to BiasAddGrad.
@@ -2357,7 +2470,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'dilations' value { list: {i: 1, i:2, i:2, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -3493,6 +3606,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
" attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'M', 'N']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
12 changes: 10 additions & 2 deletions tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -175,7 +175,11 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
.Finalize(&**g, &conversion_node));

CHECK_NOTNULL(conversion_node);
if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK()) {
// TODO(Intel-tf) MklToTf accepts only NHWC or NCHW, but doesn't seem to be
// using data_format. This code might be redundant.
if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK() &&
(data_format == ToString(FORMAT_NHWC) ||
data_format == ToString(FORMAT_NCHW))) {
conversion_node->AddAttr("data_format", data_format);
}

@@ -254,9 +258,13 @@ Status MklToTfConversionPass::InsertInputConversionNode(
}
}

// TODO(Intel-tf) MklInputConversion accepts only NHWC or NCHW, but doesn't
// seem to be using data_format. This code might be redundant.
string data_format;
if (GetNodeAttr(edges[0]->src()->def(), "data_format", &data_format) ==
Status::OK()) {
Status::OK() &&
(data_format == ToString(FORMAT_NHWC) ||
data_format == ToString(FORMAT_NCHW))) {
conversion_node->AddAttr("data_format", data_format);
}

15 changes: 15 additions & 0 deletions tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -113,10 +113,25 @@ struct GatherNdSlice<CPUDevice, T, Index, IXDIM> {
#endif
generator::GatherNdSliceGenerator<T, Index, IXDIM> gather_nd_generator(
slice_size, Tindices, Tparams, Tout, &error_loc);

#ifdef INTEL_MKL
// The Eigen implementation below is not highly performant: gather_nd_generator
// does not seem to be called in parallel, leading to very poor performance.
// Additionally, since it uses a scalar (Tscratch) to invoke 'generate', it
// needs to go through redundant operations like 'reshape', 'broadcast' and
// 'sum'. The OpenMP loop below does essentially the same thing as the Eigen
// code, but is considerably more efficient.
#pragma omp parallel for
for (Eigen::DenseIndex i = 0; i < batch_size; i++) {
const Eigen::array<Eigen::DenseIndex, 1> loc = i;
gather_nd_generator(loc);
}
#else
Tscratch.device(d) = Tscratch.reshape(reshape_dims)
.broadcast(broadcast_dims)
.generate(gather_nd_generator)
.sum();
#endif

// error_loc() returns -1 if there's no out-of-bounds index,
// otherwise it returns the location of an OOB index in Tindices.
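A self-contained illustration of the same idea, assuming a flat float parameter matrix and hypothetical names (GatherRows is not part of the TensorFlow kernel): the per-slice copies are independent, so a plain OpenMP parallel-for over the batch avoids the reshape/broadcast/generate/sum detour that the Eigen path takes.

#include <omp.h>
#include <cstring>

// Copies `slice_size` contiguous floats per batch entry: out row i takes row
// indices[i] of params. An out-of-bounds row is reported through error_loc
// (-1 means no error), loosely mirroring the error_loc() convention noted
// above.
void GatherRows(const float* params, int num_rows, int slice_size,
                const int* indices, int batch_size, float* out,
                long long* error_loc) {
  *error_loc = -1;
#pragma omp parallel for
  for (int i = 0; i < batch_size; ++i) {
    const int row = indices[i];
    if (row < 0 || row >= num_rows) {
#pragma omp critical
      { if (*error_loc < 0) *error_loc = i; }
      continue;
    }
    std::memcpy(out + static_cast<long long>(i) * slice_size,
                params + static_cast<long long>(row) * slice_size,
                sizeof(float) * slice_size);
  }
}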
20 changes: 16 additions & 4 deletions tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -392,16 +392,28 @@ class MklAddNOp : public OpKernel {
memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
auto src1_tf_data_format =
MklDnnDataFormatToTFDataFormat(src1_mkl_data_format);
auto src2_dims =
TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), src1_tf_data_format);
memory::dims src2_dims;
if (src2_tensor.dims() == 4) {
src2_dims = TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(),
src1_tf_data_format);
} else {
src2_dims = TFShapeToMklDnnDimsInNCDHW(src2_tensor.shape(),
src1_tf_data_format);
}
md2 = memory::desc(src2_dims, MklDnnType<T>(), src1_mkl_data_format);
} else if (input2_in_mkl_format && !input1_in_mkl_format) {
// Same comment as above.
memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
auto src2_tf_data_format =
MklDnnDataFormatToTFDataFormat(src2_mkl_data_format);
auto src1_dims =
TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), src2_tf_data_format);
memory::dims src1_dims;
if (src1_tensor.dims() == 4) {
src1_dims = TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(),
src2_tf_data_format);
} else {
src1_dims = TFShapeToMklDnnDimsInNCDHW(src1_tensor.shape(),
src2_tf_data_format);
}
md1 = memory::desc(src1_dims, MklDnnType<T>(), src2_mkl_data_format);

md2 = src2_mkl_shape.GetMklLayout();
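A note on the 4-D vs. 5-D branches above: TFShapeToMklDnnDimsInNCHW and TFShapeToMklDnnDimsInNCDHW reorder a TensorFlow shape into MKL-DNN's channels-after-batch layout. A rough illustration for the channels-last case only (NhwcToNchw and NdhwcToNcdhw are hypothetical stand-ins, not the real utility functions, which take the data_format into account):

#include <cstdint>
#include <vector>

// 4-D: NHWC -> NCHW
std::vector<int64_t> NhwcToNchw(const std::vector<int64_t>& nhwc) {
  return {nhwc[0], nhwc[3], nhwc[1], nhwc[2]};
}

// 5-D: NDHWC -> NCDHW (the layout used by the 3-D ops this PR enables)
std::vector<int64_t> NdhwcToNcdhw(const std::vector<int64_t>& ndhwc) {
  return {ndhwc[0], ndhwc[4], ndhwc[1], ndhwc[2], ndhwc[3]};
}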
51 changes: 35 additions & 16 deletions tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -453,6 +453,8 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {

// initialize variables for the pooling op
MklPoolParameters pool_params;
// check whether pooling is 2D or 3D
bool isPool2D = (this->ksize_.size() == 4);
// Get the input tensor and initialize the pooling parameters
TensorShape input_tensor_shape = input_tensor.shape();
this->InitMklPoolParameters(context, &pool_params, dnn_shape_input,
@@ -473,23 +475,22 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
}

memory::dims filter_dims, strides, padding_left, padding_right;
// Get src/filter/stride/padding information
this->PoolParamsToDims(&pool_params, &filter_dims, &strides,
&padding_left, &padding_right);
&padding_left, &padding_right, isPool2D);

// Get the input memory descriptor
memory::desc input_md =
dnn_shape_input.IsMklTensor()
? dnn_shape_input.GetMklLayout()
: memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
this->data_format_tf_),
MklDnnType<T>(), this->data_format_mkldnn_);

// Get src/filter/stride/padding information
memory::dims src_dims =
dnn_shape_input.IsMklTensor()
? dnn_shape_input.GetSizesAsMklDnnDims()
: TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
this->data_format_tf_);
: isPool2D ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
this->data_format_tf_)
: TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
this->data_format_tf_);
memory::desc input_md = dnn_shape_input.IsMklTensor()
? dnn_shape_input.GetMklLayout()
: memory::desc(src_dims, MklDnnType<T>(),
this->data_format_mkldnn_);

// Get an average pooling primitive from the op pool
MklPoolingFwdPrimitive<T>* pooling_fwd = nullptr;
@@ -562,24 +563,30 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
for (int i = 0; i < orig_input_tensor.NumElements(); i++) {
orig_input_shape.AddDim(shape_vec(i));
}

bool isPool2D = (this->ksize_.size() == 4);
this->InitMklPoolParameters(context, &pool_params, orig_input_mkl_shape,
orig_input_shape);

memory::dims filter_dims, strides, padding_left, padding_right;
this->PoolParamsToDims(&pool_params, &filter_dims, &strides,
&padding_left, &padding_right);
&padding_left, &padding_right, isPool2D);

memory::dims orig_input_dims_mkl_order =
orig_input_mkl_shape.IsMklTensor()
? orig_input_mkl_shape.GetSizesAsMklDnnDims()
: TFShapeToMklDnnDimsInNCHW(orig_input_shape,
this->data_format_tf_);
: isPool2D ? TFShapeToMklDnnDimsInNCHW(orig_input_shape,
this->data_format_tf_)
: TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
this->data_format_tf_);

memory::dims diff_dst_dims =
grad_mkl_shape.IsMklTensor()
? grad_mkl_shape.GetSizesAsMklDnnDims()
: TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
this->data_format_tf_);
: isPool2D ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
this->data_format_tf_)
: TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
this->data_format_tf_);
memory::dims output_dims_mkl_order;
this->GetOutputDims(pool_params, &output_dims_mkl_order);

@@ -664,6 +671,18 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
}
}; // MklAvgPoolingGradOp

REGISTER_KERNEL_BUILDER(Name("_MklAvgPool3D")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_op_registry::kMklOpLabel),
MklAvgPoolingOp<CPUDevice, float>);

REGISTER_KERNEL_BUILDER(Name("_MklAvgPool3DGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_op_registry::kMklOpLabel),
MklAvgPoolingGradOp<CPUDevice, float>);

#endif // INTEL_MKL_ML_ONLY

REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")