Skip to content

Support DepthwiseConv2D fusion for bfloat16 type #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 105 additions & 86 deletions tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ limitations under the License.
#include "tensorflow/core/grappler/utils/grappler_test.h"
#include "tensorflow/core/platform/test.h"

// Instantiates a parameterized test (defined via REGISTER_TEST below) for
// the float32 data type.
#define REGISTER_TEST_FLOAT32(TEST) REGISTER_TEST(TEST, DT_FLOAT, Float32Input);

#ifdef ENABLE_INTEL_MKL_BFLOAT16
// bfloat16 instantiations are only available when MKL is built with
// bfloat16 support.
#define REGISTER_TEST_BFLOAT16(TEST) \
REGISTER_TEST(TEST, DT_BFLOAT16, BFloat16Input);

// Instantiates the test for every data type supported by this build:
// float32 always, bfloat16 only under ENABLE_INTEL_MKL_BFLOAT16.
#define REGISTER_TEST_ALL_TYPES(TEST) \
REGISTER_TEST_FLOAT32(TEST); \
REGISTER_TEST_BFLOAT16(TEST);
#else
#define REGISTER_TEST_ALL_TYPES(TEST) REGISTER_TEST_FLOAT32(TEST);
#endif  // ENABLE_INTEL_MKL_BFLOAT16

namespace tensorflow {
namespace grappler {

Expand Down Expand Up @@ -206,93 +219,99 @@ CREATE_CONV2DFUSION_ADD_BCAST_TEST(AddV2);
#undef CREATE_CONV2DFUSION_ADD_BCAST_TEST
#undef CREATE_CONV2DFUSION_TEST

TEST_F(MklRemapperTest, FuseDepthwiseConv2DWithBiasAndActivation) {
using ::tensorflow::ops::Placeholder;

for (const string& activation : {"Relu", "Relu6", "Elu", "None"}) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();

auto input_shape = Placeholder::Shape({8, 32, 32, 3});
auto filter_shape = Placeholder::Shape({1, 1, 3, 1});
auto bias_shape = Placeholder::Shape({3});

auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);

std::vector<int> strides = {1, 1, 1, 1};
auto conv = ops::DepthwiseConv2dNative(s.WithOpName("depthwise_conv"),
input, filter, strides, "SAME");
auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);

ops::Identity fetch = [&]() -> ops::Identity {
auto activate = s.WithOpName("activation");
auto fetch = s.WithOpName("fetch");

if (activation == "Relu") {
return ops::Identity(fetch, ops::Relu(activate, bias_add));
} else if (activation == "Relu6") {
return ops::Identity(fetch, ops::Relu6(activate, bias_add));
} else if (activation == "Elu") {
return ops::Identity(fetch, ops::Elu(activate, bias_add));
}

DCHECK(activation == "None");
return ops::Identity(fetch, bias_add);
}();

auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 1});
auto bias_t = GenerateRandomTensor<DT_FLOAT>({3});

GrapplerItem item;
item.fetch = {"fetch"};
item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
TF_CHECK_OK(s.ToGraphDef(&item.graph));

// Place all nodes on CPU.
for (int i = 0; i < item.graph.node_size(); ++i) {
item.graph.mutable_node(i)->set_device("/device:CPU:0");
}

Remapper optimizer(RewriterConfig::ON);
GraphDef output;
TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));

int found = 0;
for (const NodeDef& node : output.node()) {
if (node.name() != "bias_add" && node.name() != "activation") continue;

EXPECT_EQ(node.op(), "_FusedDepthwiseConv2dNative");
ASSERT_EQ(node.input_size(), 3);
EXPECT_EQ(node.input(0), "input");
EXPECT_EQ(node.input(1), "filter");

EXPECT_EQ(node.attr().at("num_args").i(), 1);
EXPECT_EQ(node.input(2), "bias");

const auto fused_ops = node.attr().at("fused_ops").list().s();
if (node.name() == "bias_add") {
ASSERT_EQ(fused_ops.size(), 1);
EXPECT_EQ(fused_ops[0], "BiasAdd");
found++;
}
if (node.name() == "activation") {
ASSERT_EQ(fused_ops.size(), 2);
EXPECT_EQ(fused_ops[0], "BiasAdd");
EXPECT_EQ(fused_ops[1], activation);
found++;
}
}
EXPECT_EQ(found, 1);

auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
ASSERT_EQ(tensors_expected.size(), 1);
auto tensors = EvaluateNodes(output, item.fetch, item.feed);
ASSERT_EQ(tensors.size(), 1);
test::ExpectTensorNear<float>(tensors[0], tensors_expected[0], 1e-6);
// Builds a DepthwiseConv2dNative + BiasAdd (+ optional Relu/Relu6/Elu) graph,
// runs the Remapper optimizer, and verifies that the pattern is rewritten to
// a single _FusedDepthwiseConv2dNative node with the expected "fused_ops",
// then checks numerical equivalence of the optimized graph.
//
// NAME  - base test name; the data type token is appended to it.
// T     - DataType enum (DT_FLOAT or DT_BFLOAT16) used for all tensors.
//         NOTE: previously the body hard-coded DT_FLOAT, so the bfloat16
//         instantiation silently tested float32; it now honors T.
// INPUT - input-helper tag; currently unused in this test but kept so the
//         macro signature matches other REGISTER_TEST users.
#define REGISTER_TEST(NAME, T, INPUT)                                         \
  TEST_F(MklRemapperTest, NAME##_##T) {                                       \
    using ::tensorflow::ops::Placeholder;                                     \
                                                                              \
    for (const string& activation : {"Relu", "Relu6", "Elu", "None"}) {       \
      tensorflow::Scope s = tensorflow::Scope::NewRootScope();                \
                                                                              \
      auto input_shape = Placeholder::Shape({8, 32, 32, 3});                  \
      auto filter_shape = Placeholder::Shape({1, 1, 3, 1});                   \
      auto bias_shape = Placeholder::Shape({3});                              \
                                                                              \
      auto input = Placeholder(s.WithOpName("input"), T, input_shape);        \
      auto filter = Placeholder(s.WithOpName("filter"), T, filter_shape);     \
      auto bias = Placeholder(s.WithOpName("bias"), T, bias_shape);           \
                                                                              \
      std::vector<int> strides = {1, 1, 1, 1};                                \
      auto conv = ops::DepthwiseConv2dNative(s.WithOpName("depthwise_conv"),  \
                                             input, filter, strides, "SAME"); \
      auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);     \
                                                                              \
      /* Append the requested activation (or none) and fetch the result. */   \
      ops::Identity fetch = [&]() -> ops::Identity {                          \
        auto activate = s.WithOpName("activation");                           \
        auto fetch = s.WithOpName("fetch");                                   \
                                                                              \
        if (activation == "Relu") {                                           \
          return ops::Identity(fetch, ops::Relu(activate, bias_add));         \
        } else if (activation == "Relu6") {                                   \
          return ops::Identity(fetch, ops::Relu6(activate, bias_add));        \
        } else if (activation == "Elu") {                                     \
          return ops::Identity(fetch, ops::Elu(activate, bias_add));          \
        }                                                                     \
                                                                              \
        DCHECK(activation == "None");                                         \
        return ops::Identity(fetch, bias_add);                                \
      }();                                                                    \
                                                                              \
      auto input_t = GenerateRandomTensor<T>({8, 32, 32, 3});                 \
      auto filter_t = GenerateRandomTensor<T>({1, 1, 3, 1});                  \
      auto bias_t = GenerateRandomTensor<T>({3});                             \
                                                                              \
      GrapplerItem item;                                                      \
      item.fetch = {"fetch"};                                                 \
      item.feed = {                                                           \
          {"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};        \
      TF_CHECK_OK(s.ToGraphDef(&item.graph));                                 \
                                                                              \
      /* Place all nodes on CPU so the MKL remapper can rewrite them. */      \
      for (int i = 0; i < item.graph.node_size(); ++i) {                      \
        item.graph.mutable_node(i)->set_device("/device:CPU:0");              \
      }                                                                       \
                                                                              \
      Remapper optimizer(RewriterConfig::ON);                                 \
      GraphDef output;                                                        \
      TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));                \
                                                                              \
      /* Exactly one of bias_add/activation must have been fused. */          \
      int found = 0;                                                          \
      for (const NodeDef& node : output.node()) {                             \
        if (node.name() != "bias_add" && node.name() != "activation")         \
          continue;                                                           \
                                                                              \
        EXPECT_EQ(node.op(), "_FusedDepthwiseConv2dNative");                  \
        ASSERT_EQ(node.input_size(), 3);                                      \
        EXPECT_EQ(node.input(0), "input");                                    \
        EXPECT_EQ(node.input(1), "filter");                                   \
                                                                              \
        EXPECT_EQ(node.attr().at("num_args").i(), 1);                         \
        EXPECT_EQ(node.input(2), "bias");                                     \
                                                                              \
        const auto fused_ops = node.attr().at("fused_ops").list().s();        \
        if (node.name() == "bias_add") {                                      \
          ASSERT_EQ(fused_ops.size(), 1);                                     \
          EXPECT_EQ(fused_ops[0], "BiasAdd");                                 \
          found++;                                                            \
        }                                                                     \
        if (node.name() == "activation") {                                    \
          ASSERT_EQ(fused_ops.size(), 2);                                     \
          EXPECT_EQ(fused_ops[0], "BiasAdd");                                 \
          EXPECT_EQ(fused_ops[1], activation);                                \
          found++;                                                            \
        }                                                                     \
      }                                                                       \
      EXPECT_EQ(found, 1);                                                    \
                                                                              \
      auto tensors_expected =                                                 \
          EvaluateNodes(item.graph, item.fetch, item.feed);                   \
      ASSERT_EQ(tensors_expected.size(), 1);                                  \
      auto tensors = EvaluateNodes(output, item.fetch, item.feed);            \
      ASSERT_EQ(tensors.size(), 1);                                           \
      /* bfloat16 has ~3 decimal digits of precision; use a dtype-aware */    \
      /* comparison with a loosened tolerance for bfloat16. */                \
      if (DT_BFLOAT16 == T)                                                   \
        test::ExpectClose(tensors[0], tensors_expected[0], 1e-2, 1e-2);       \
      else                                                                    \
        test::ExpectClose(tensors[0], tensors_expected[0], 1e-6);             \
    }                                                                         \
  }
REGISTER_TEST_ALL_TYPES(FuseDepthwiseConv2DWithBiasAndActivation);
#undef REGISTER_TEST

class MklFuseMatMulWithBiasAddGrad : public MklRemapperTest {
public:
Expand Down
17 changes: 10 additions & 7 deletions tensorflow/core/grappler/optimizers/remapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -231,19 +231,22 @@ bool HasDataType(const NodeDef* node, const DataType& expected,
bool IsCpuCompatibleDataType(const NodeDef* contraction,
const string& type_attr = "T") {
DataType dtype = GetDataTypeFromAttr(*contraction, type_attr);
#if defined(INTEL_MKL) && defined(ENABLE_INTEL_MKL_BFLOAT16)
if (IsConv2D(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
} else if (IsDepthwiseConv2dNative(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
} else if (IsMatMul(*contraction)) {
#if defined(INTEL_MKL)
#if defined(ENABLE_INTEL_MKL_BFLOAT16)
if (IsConv2D(*contraction) || IsDepthwiseConv2dNative(*contraction) ||
IsMatMul(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
#else
if (IsConv2D(*contraction) || IsDepthwiseConv2dNative(*contraction) ||
IsMatMul(*contraction)) {
return dtype == DT_FLOAT;
#endif // ENABLE_INTEL_MKL_BFLOAT16
#else
if (IsConv2D(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_DOUBLE;
} else if (IsMatMul(*contraction)) {
return dtype == DT_FLOAT;
#endif // INTEL_MKL && ENABLE_INTEL_MKL_BFLOAT16
#endif // INTEL_MKL
} else {
return false;
}
Expand Down
44 changes: 26 additions & 18 deletions tensorflow/core/kernels/mkl_conv_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ limitations under the License.
#include <unordered_map>
#include <vector>

#include "mkldnn.hpp"
#include "absl/strings/str_join.h"
#include "mkldnn.hpp"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
Expand Down Expand Up @@ -2309,11 +2309,20 @@ REGISTER_KERNEL_BUILDER(
.TypeConstraint<quint8>("out_type"),
NoOp);

REGISTER_KERNEL_BUILDER(Name("_FusedDepthwiseConv2dNative")
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
.TypeConstraint<bfloat16>("T"),
NoOp);

// Registers a NoOp placeholder CPU kernel for _FusedDepthwiseConv2dNative
// for data type T. The actual computation is performed by the MKL
// layout-dependent kernel registered elsewhere in this file; this stub only
// makes the op name/type combination resolvable on plain CPU.
#define REGISTER_NO_OP_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER(Name("_FusedDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
NoOp);

// Instantiate the stub for float32 and bfloat16.
TF_CALL_float(REGISTER_NO_OP_CPU_2D_DEPTHWISE);
TF_CALL_bfloat16(REGISTER_NO_OP_CPU_2D_DEPTHWISE);

// Register templatized MKL kernels for non-fused and fused-versions of
// QuantizedDepthwiseConv2D.
REGISTER_KERNEL_BUILDER(Name("_MklQuantizedDepthwiseConv2D")
Expand Down Expand Up @@ -2367,14 +2376,6 @@ REGISTER_KERNEL_BUILDER(
MklQuantizedConv2DReluOp<CPUDevice, quint8, qint32, quint8, quint8, true,
true>);

REGISTER_KERNEL_BUILDER(
Name("_MklFusedDepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_op_registry::kMklLayoutDependentOpLabel),
MklFusedDepthwiseConvOp<CPUDevice, float, float, float, float, float, int32,
false, true, true>);

// Register 2D operations
#define REGISTER_MKL_CPU_2D(T) \
REGISTER_KERNEL_BUILDER( \
Expand Down Expand Up @@ -2426,13 +2427,20 @@ REGISTER_KERNEL_BUILDER(
TF_CALL_float(REGISTER_MKL_CPU_2D);
TF_CALL_bfloat16(REGISTER_MKL_CPU_2D);

#define REGISTER_MKL_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER( \
Name("_MklDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, true, false>);
// Registers the MKL layout-dependent CPU kernels for data type T:
// the plain _MklDepthwiseConv2dNative and the fused variant
// _MklFusedDepthwiseConv2dNative (depthwise conv + fused bias/activation).
// The boolean template arguments select the op variant — see the
// MklConvOp/MklFusedDepthwiseConvOp template parameter lists for their
// meaning.
#define REGISTER_MKL_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER( \
Name("_MklDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, true, false>); \
REGISTER_KERNEL_BUILDER( \
Name("_MklFusedDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklFusedDepthwiseConvOp<CPUDevice, T, T, T, T, T, int32, false, true, \
true>);

// Instantiate both kernels for float32 and bfloat16.
TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE);
TF_CALL_bfloat16(REGISTER_MKL_CPU_2D_DEPTHWISE);
Expand Down