Skip to content

Support DepthwiseConv2D fusion for bfloat16 type #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 105 additions & 86 deletions tensorflow/core/grappler/optimizers/mkl_remapper_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ limitations under the License.
#include "tensorflow/core/grappler/utils/grappler_test.h"
#include "tensorflow/core/platform/test.h"

// Instantiates a parameterized test (defined via REGISTER_TEST below) for
// the float32 data type.
#define REGISTER_TEST_FLOAT32(TEST) REGISTER_TEST(TEST, DT_FLOAT, Float32Input);

#ifdef ENABLE_INTEL_MKL_BFLOAT16
// bfloat16 instantiations are only available when MKL is built with
// bfloat16 support.
#define REGISTER_TEST_BFLOAT16(TEST) \
REGISTER_TEST(TEST, DT_BFLOAT16, BFloat16Input);

// Instantiates the test for every data type supported by this build:
// float32 always, bfloat16 only under ENABLE_INTEL_MKL_BFLOAT16.
#define REGISTER_TEST_ALL_TYPES(TEST) \
REGISTER_TEST_FLOAT32(TEST); \
REGISTER_TEST_BFLOAT16(TEST);
#else
#define REGISTER_TEST_ALL_TYPES(TEST) REGISTER_TEST_FLOAT32(TEST);
#endif  // ENABLE_INTEL_MKL_BFLOAT16

namespace tensorflow {
namespace grappler {

Expand Down Expand Up @@ -206,93 +219,99 @@ CREATE_CONV2DFUSION_ADD_BCAST_TEST(AddV2);
#undef CREATE_CONV2DFUSION_ADD_BCAST_TEST
#undef CREATE_CONV2DFUSION_TEST

TEST_F(MklRemapperTest, FuseDepthwiseConv2DWithBiasAndActivation) {
using ::tensorflow::ops::Placeholder;

for (const string& activation : {"Relu", "Relu6", "Elu", "None"}) {
tensorflow::Scope s = tensorflow::Scope::NewRootScope();

auto input_shape = Placeholder::Shape({8, 32, 32, 3});
auto filter_shape = Placeholder::Shape({1, 1, 3, 1});
auto bias_shape = Placeholder::Shape({3});

auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);

std::vector<int> strides = {1, 1, 1, 1};
auto conv = ops::DepthwiseConv2dNative(s.WithOpName("depthwise_conv"),
input, filter, strides, "SAME");
auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);

ops::Identity fetch = [&]() -> ops::Identity {
auto activate = s.WithOpName("activation");
auto fetch = s.WithOpName("fetch");

if (activation == "Relu") {
return ops::Identity(fetch, ops::Relu(activate, bias_add));
} else if (activation == "Relu6") {
return ops::Identity(fetch, ops::Relu6(activate, bias_add));
} else if (activation == "Elu") {
return ops::Identity(fetch, ops::Elu(activate, bias_add));
}

DCHECK(activation == "None");
return ops::Identity(fetch, bias_add);
}();

auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 1});
auto bias_t = GenerateRandomTensor<DT_FLOAT>({3});

GrapplerItem item;
item.fetch = {"fetch"};
item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
TF_CHECK_OK(s.ToGraphDef(&item.graph));

// Place all nodes on CPU.
for (int i = 0; i < item.graph.node_size(); ++i) {
item.graph.mutable_node(i)->set_device("/device:CPU:0");
}

Remapper optimizer(RewriterConfig::ON);
GraphDef output;
TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));

int found = 0;
for (const NodeDef& node : output.node()) {
if (node.name() != "bias_add" && node.name() != "activation") continue;

EXPECT_EQ(node.op(), "_FusedDepthwiseConv2dNative");
ASSERT_EQ(node.input_size(), 3);
EXPECT_EQ(node.input(0), "input");
EXPECT_EQ(node.input(1), "filter");

EXPECT_EQ(node.attr().at("num_args").i(), 1);
EXPECT_EQ(node.input(2), "bias");

const auto fused_ops = node.attr().at("fused_ops").list().s();
if (node.name() == "bias_add") {
ASSERT_EQ(fused_ops.size(), 1);
EXPECT_EQ(fused_ops[0], "BiasAdd");
found++;
}
if (node.name() == "activation") {
ASSERT_EQ(fused_ops.size(), 2);
EXPECT_EQ(fused_ops[0], "BiasAdd");
EXPECT_EQ(fused_ops[1], activation);
found++;
}
}
EXPECT_EQ(found, 1);

auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
ASSERT_EQ(tensors_expected.size(), 1);
auto tensors = EvaluateNodes(output, item.fetch, item.feed);
ASSERT_EQ(tensors.size(), 1);
test::ExpectTensorNear<float>(tensors[0], tensors_expected[0], 1e-6);
// Builds a DepthwiseConv2dNative + BiasAdd (+ optional Relu/Relu6/Elu) graph,
// runs the Remapper optimizer, and verifies that the pattern is rewritten to
// a single _FusedDepthwiseConv2dNative node with the expected "fused_ops",
// then checks numerical equivalence of the optimized graph.
//
// NAME  - base test name; the data type token is appended to it.
// T     - DataType enum (DT_FLOAT or DT_BFLOAT16) used for all tensors.
//         NOTE: previously the body hard-coded DT_FLOAT, so the bfloat16
//         instantiation silently tested float32; it now honors T.
// INPUT - input-helper tag; currently unused in this test but kept so the
//         macro signature matches other REGISTER_TEST users.
#define REGISTER_TEST(NAME, T, INPUT)                                         \
  TEST_F(MklRemapperTest, NAME##_##T) {                                       \
    using ::tensorflow::ops::Placeholder;                                     \
                                                                              \
    for (const string& activation : {"Relu", "Relu6", "Elu", "None"}) {       \
      tensorflow::Scope s = tensorflow::Scope::NewRootScope();                \
                                                                              \
      auto input_shape = Placeholder::Shape({8, 32, 32, 3});                  \
      auto filter_shape = Placeholder::Shape({1, 1, 3, 1});                   \
      auto bias_shape = Placeholder::Shape({3});                              \
                                                                              \
      auto input = Placeholder(s.WithOpName("input"), T, input_shape);        \
      auto filter = Placeholder(s.WithOpName("filter"), T, filter_shape);     \
      auto bias = Placeholder(s.WithOpName("bias"), T, bias_shape);           \
                                                                              \
      std::vector<int> strides = {1, 1, 1, 1};                                \
      auto conv = ops::DepthwiseConv2dNative(s.WithOpName("depthwise_conv"),  \
                                             input, filter, strides, "SAME"); \
      auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);     \
                                                                              \
      /* Append the requested activation (or none) and fetch the result. */   \
      ops::Identity fetch = [&]() -> ops::Identity {                          \
        auto activate = s.WithOpName("activation");                           \
        auto fetch = s.WithOpName("fetch");                                   \
                                                                              \
        if (activation == "Relu") {                                           \
          return ops::Identity(fetch, ops::Relu(activate, bias_add));         \
        } else if (activation == "Relu6") {                                   \
          return ops::Identity(fetch, ops::Relu6(activate, bias_add));        \
        } else if (activation == "Elu") {                                     \
          return ops::Identity(fetch, ops::Elu(activate, bias_add));          \
        }                                                                     \
                                                                              \
        DCHECK(activation == "None");                                         \
        return ops::Identity(fetch, bias_add);                                \
      }();                                                                    \
                                                                              \
      auto input_t = GenerateRandomTensor<T>({8, 32, 32, 3});                 \
      auto filter_t = GenerateRandomTensor<T>({1, 1, 3, 1});                  \
      auto bias_t = GenerateRandomTensor<T>({3});                             \
                                                                              \
      GrapplerItem item;                                                      \
      item.fetch = {"fetch"};                                                 \
      item.feed = {                                                           \
          {"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};        \
      TF_CHECK_OK(s.ToGraphDef(&item.graph));                                 \
                                                                              \
      /* Place all nodes on CPU so the MKL remapper can rewrite them. */      \
      for (int i = 0; i < item.graph.node_size(); ++i) {                      \
        item.graph.mutable_node(i)->set_device("/device:CPU:0");              \
      }                                                                       \
                                                                              \
      Remapper optimizer(RewriterConfig::ON);                                 \
      GraphDef output;                                                        \
      TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));                \
                                                                              \
      /* Exactly one of bias_add/activation must have been fused. */          \
      int found = 0;                                                          \
      for (const NodeDef& node : output.node()) {                             \
        if (node.name() != "bias_add" && node.name() != "activation")         \
          continue;                                                           \
                                                                              \
        EXPECT_EQ(node.op(), "_FusedDepthwiseConv2dNative");                  \
        ASSERT_EQ(node.input_size(), 3);                                      \
        EXPECT_EQ(node.input(0), "input");                                    \
        EXPECT_EQ(node.input(1), "filter");                                   \
                                                                              \
        EXPECT_EQ(node.attr().at("num_args").i(), 1);                         \
        EXPECT_EQ(node.input(2), "bias");                                     \
                                                                              \
        const auto fused_ops = node.attr().at("fused_ops").list().s();        \
        if (node.name() == "bias_add") {                                      \
          ASSERT_EQ(fused_ops.size(), 1);                                     \
          EXPECT_EQ(fused_ops[0], "BiasAdd");                                 \
          found++;                                                            \
        }                                                                     \
        if (node.name() == "activation") {                                    \
          ASSERT_EQ(fused_ops.size(), 2);                                     \
          EXPECT_EQ(fused_ops[0], "BiasAdd");                                 \
          EXPECT_EQ(fused_ops[1], activation);                                \
          found++;                                                            \
        }                                                                     \
      }                                                                       \
      EXPECT_EQ(found, 1);                                                    \
                                                                              \
      auto tensors_expected =                                                 \
          EvaluateNodes(item.graph, item.fetch, item.feed);                   \
      ASSERT_EQ(tensors_expected.size(), 1);                                  \
      auto tensors = EvaluateNodes(output, item.fetch, item.feed);            \
      ASSERT_EQ(tensors.size(), 1);                                           \
      /* bfloat16 has ~3 decimal digits of precision; use a dtype-aware */    \
      /* comparison with a loosened tolerance for bfloat16. */                \
      if (DT_BFLOAT16 == T)                                                   \
        test::ExpectClose(tensors[0], tensors_expected[0], 1e-2, 1e-2);       \
      else                                                                    \
        test::ExpectClose(tensors[0], tensors_expected[0], 1e-6);             \
    }                                                                         \
  }
REGISTER_TEST_ALL_TYPES(FuseDepthwiseConv2DWithBiasAndActivation);
#undef REGISTER_TEST

class MklFuseMatMulWithBiasAddGrad : public MklRemapperTest {
public:
Expand Down
17 changes: 10 additions & 7 deletions tensorflow/core/grappler/optimizers/remapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -231,19 +231,22 @@ bool HasDataType(const NodeDef* node, const DataType& expected,
bool IsCpuCompatibleDataType(const NodeDef* contraction,
const string& type_attr = "T") {
DataType dtype = GetDataTypeFromAttr(*contraction, type_attr);
#if defined(INTEL_MKL) && defined(ENABLE_INTEL_MKL_BFLOAT16)
if (IsConv2D(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
} else if (IsDepthwiseConv2dNative(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
} else if (IsMatMul(*contraction)) {
#if defined(INTEL_MKL)
#if defined(ENABLE_INTEL_MKL_BFLOAT16)
if (IsConv2D(*contraction) || IsDepthwiseConv2dNative(*contraction) ||
IsMatMul(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_BFLOAT16;
#else
if (IsConv2D(*contraction) || IsDepthwiseConv2dNative(*contraction) ||
IsMatMul(*contraction)) {
return dtype == DT_FLOAT;
#endif // ENABLE_INTEL_MKL_BFLOAT16
#else
if (IsConv2D(*contraction)) {
return dtype == DT_FLOAT || dtype == DT_DOUBLE;
} else if (IsMatMul(*contraction)) {
return dtype == DT_FLOAT;
#endif // INTEL_MKL && ENABLE_INTEL_MKL_BFLOAT16
#endif // INTEL_MKL
} else {
return false;
}
Expand Down
44 changes: 26 additions & 18 deletions tensorflow/core/kernels/mkl_conv_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ limitations under the License.
#include <unordered_map>
#include <vector>

#include "mkldnn.hpp"
#include "absl/strings/str_join.h"
#include "mkldnn.hpp"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
Expand Down Expand Up @@ -2309,11 +2309,20 @@ REGISTER_KERNEL_BUILDER(
.TypeConstraint<quint8>("out_type"),
NoOp);

REGISTER_KERNEL_BUILDER(Name("_FusedDepthwiseConv2dNative")
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
.TypeConstraint<bfloat16>("T"),
NoOp);

// Registers a NoOp placeholder CPU kernel for _FusedDepthwiseConv2dNative
// for data type T. The actual computation is performed by the MKL
// layout-dependent kernel registered elsewhere in this file; this stub only
// makes the op name/type combination resolvable on plain CPU.
#define REGISTER_NO_OP_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER(Name("_FusedDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
NoOp);

// Instantiate the stub for float32 and bfloat16.
TF_CALL_float(REGISTER_NO_OP_CPU_2D_DEPTHWISE);
TF_CALL_bfloat16(REGISTER_NO_OP_CPU_2D_DEPTHWISE);

// Register templatized MKL kernels for non-fused and fused-versions of
// QuantizedDepthwiseConv2D.
REGISTER_KERNEL_BUILDER(Name("_MklQuantizedDepthwiseConv2D")
Expand Down Expand Up @@ -2367,14 +2376,6 @@ REGISTER_KERNEL_BUILDER(
MklQuantizedConv2DReluOp<CPUDevice, quint8, qint32, quint8, quint8, true,
true>);

REGISTER_KERNEL_BUILDER(
Name("_MklFusedDepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.Label(mkl_op_registry::kMklLayoutDependentOpLabel),
MklFusedDepthwiseConvOp<CPUDevice, float, float, float, float, float, int32,
false, true, true>);

// Register 2D operations
#define REGISTER_MKL_CPU_2D(T) \
REGISTER_KERNEL_BUILDER( \
Expand Down Expand Up @@ -2426,13 +2427,20 @@ REGISTER_KERNEL_BUILDER(
TF_CALL_float(REGISTER_MKL_CPU_2D);
TF_CALL_bfloat16(REGISTER_MKL_CPU_2D);

#define REGISTER_MKL_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER( \
Name("_MklDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, true, false>);
// Registers the MKL layout-dependent CPU kernels for data type T:
// the plain _MklDepthwiseConv2dNative and the fused variant
// _MklFusedDepthwiseConv2dNative (depthwise conv + fused bias/activation).
// The boolean template arguments select the op variant — see the
// MklConvOp/MklFusedDepthwiseConvOp template parameter lists for their
// meaning.
#define REGISTER_MKL_CPU_2D_DEPTHWISE(T) \
REGISTER_KERNEL_BUILDER( \
Name("_MklDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, true, false>); \
REGISTER_KERNEL_BUILDER( \
Name("_MklFusedDepthwiseConv2dNative") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.Label(mkl_op_registry::kMklLayoutDependentOpLabel), \
MklFusedDepthwiseConvOp<CPUDevice, T, T, T, T, T, int32, false, true, \
true>);

// Instantiate both kernels for float32 and bfloat16.
TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE);
TF_CALL_bfloat16(REGISTER_MKL_CPU_2D_DEPTHWISE);
Expand Down