Sriniva2/cnms fix #41

Closed · wants to merge 8 commits
69 changes: 69 additions & 0 deletions Intel-tensorflow-build-container.md
@@ -0,0 +1,69 @@
# Steps to generate a container with Intel® Optimization for TensorFlow

This guide walks you through building a Docker container from Intel's icx-base branch of TensorFlow.

## Steps:

1. Clone the icx-base branch of the Intel-tensorflow repository:

```
$ git clone https://github.com/Intel-tensorflow/tensorflow.git --branch=icx-base --single-branch
$ cd tensorflow
$ git checkout icx-base
# Run "git log" and check for the right git hash
```
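To make the hash check in the comment above concrete, the snippet below prints the commit currently checked out (run it inside the cloned `tensorflow` directory; the expected hash is not listed in this guide, so compare it manually):

```shell
# Print the commit hash of HEAD; fall back gracefully if run outside a repo.
hash=$(git log -1 --format=%H 2>/dev/null || true)
echo "HEAD is at: ${hash:-<not inside a git repository>}"
```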

2. Go to the directory that contains the Intel MKL Dockerfiles:

```
$ cd tensorflow/tools/ci_build/linux/mkl/
```

3. Run `build-dev-container.sh` with the following environment variables set.

For ICX-SERVER containers:

```
$ env ROOT_CONTAINER=tensorflow/tensorflow \
ROOT_CONTAINER_TAG=devel \
TF_DOCKER_BUILD_DEVEL_BRANCH=icx-base \
TF_REPO=https://github.com/Intel-tensorflow/tensorflow \
BUILD_ICX_SERVER_CONTAINERS=yes \
BUILD_TF_V2_CONTAINERS=yes \
BAZEL_VERSION=3.7.2 \
ENABLE_SECURE_BUILD=yes \
ENABLE_HOROVOD=yes \
BUILD_SSH=yes \
TF_NIGHTLY_FLAG=--nightly_flag \
ENABLE_GCC8=yes \
RELEASE_CONTAINER=yes \
OPENMPI_VERSION=openmpi-4.0.5 \
OPENMPI_DOWNLOAD_URL=https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.5.tar.gz \
HOROVOD_VERSION=56183ca42c43aad7afd619f0cc8bc4842336f3ec \
INSTALL_HOROVOD_FROM_COMMIT=yes \
./build-dev-container.sh > ./container_build.log
```
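Note that `> ./container_build.log` captures only stdout; Bazel progress messages typically go to stderr. If you want both streams in the log, add `2>&1` after the redirection. A stand-in demonstration (the real command is the `build-dev-container.sh` invocation above):

```shell
# Demonstrate capturing both streams with a stand-in command that writes
# one line to stdout and one to stderr.
{ echo "to stdout"; echo "to stderr" >&2; } > ./container_build.log 2>&1
cat ./container_build.log
```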

4. Open a second terminal session at the same location and run `tail -f container_build.log` to monitor build progress, or wait until the build finishes and then inspect `container_build.log`. A successful build ends with a line like:

```
INFO: Build completed successfully, 17731 total actions.
```

The following output indicates that the container includes Intel-optimized TensorFlow:

```
PASS: MKL enabled test in <intermediate container name>
```
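The two markers above can also be checked mechanically. The sketch below is an illustration, not part of the official build scripts: it writes a fabricated sample log (standing in for `container_build.log`) and greps it for both success markers.

```shell
# check_build_log: succeed only if the log contains both success markers
# shown above (Bazel completion and the MKL-enabled test).
check_build_log() {
  grep -q "Build completed successfully" "$1" &&
  grep -q "^PASS: MKL enabled test" "$1"
}

# Illustration with a fabricated sample log; in the real flow, pass
# container_build.log from step 3 instead.
sample=$(mktemp)
printf '%s\n' \
  "INFO: Build completed successfully, 17731 total actions." \
  "PASS: MKL enabled test in tmp-container" > "$sample"
check_build_log "$sample" && echo "build OK"
rm -f "$sample"
```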

5. Check if the image was built successfully and tag it:

ICX-SERVER container:

```
$ docker images
intel-mkl/tensorflow:nightly-icx-server-devel-mkl
$ docker tag intel-mkl/tensorflow:nightly-icx-server-devel-mkl intel/intel-optimized-tensorflow:2.4.0-devel-mkl
```
11 changes: 11 additions & 0 deletions LEGAL-NOTICE
@@ -0,0 +1,11 @@
Intel® Optimized Tensorflow Container


LEGAL NOTICE: Your use of this software and any required dependent software (the “Software Package”) is subject to the terms
and conditions of the software license agreements for the Software Package, which may also include notices,
disclaimers, or license terms for third party or open source software included in or with the Software Package,
and your use indicates your acceptance of all such terms. Please refer to the “third-party-programs.txt” or other
similarly-named text file included with the Software Package for additional details.

Please refer to the https://github.com/Intel-tensorflow/tensorflow/blob/v2.4.0/third-party-programs.txt and
https://github.com/Intel-tensorflow/tensorflow/blob/v2.4.0/third-party-programs-for-docker-containers.txt for additional details.
1 change: 1 addition & 0 deletions tensorflow/core/BUILD
@@ -663,6 +663,7 @@ cc_library(
"//tensorflow/core/kernels/mkl:mkl_conv_op",
"//tensorflow/core/kernels/mkl:mkl_cwise_ops_common",
"//tensorflow/core/kernels/mkl:mkl_fused_batch_norm_op",
"//tensorflow/core/kernels/mkl:mkl_layer_norm_op",
"//tensorflow/core/kernels/mkl:mkl_identity_op",
"//tensorflow/core/kernels/mkl:mkl_input_conversion_op",
"//tensorflow/core/kernels/mkl:mkl_lrn_op",
74 changes: 68 additions & 6 deletions tensorflow/core/common_runtime/mkl_layout_pass.cc
@@ -276,6 +276,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_.fused_conv2d = "_FusedConv2D";
csinfo_.fused_depthwise_conv2d = "_FusedDepthwiseConv2dNative";
csinfo_.fused_matmul = "_FusedMatMul";
csinfo_.fused_batch_matmul = "_FusedBatchMatMul";
csinfo_.fused_batch_matmul_v2 = "_FusedBatchMatMulV2";
csinfo_.identity = "Identity";
csinfo_.leakyrelu = "LeakyRelu";
csinfo_.leakyrelu_grad = "LeakyReluGrad";
@@ -300,6 +302,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_.mkl_fused_conv2d = "_MklFusedConv2D";
csinfo_.mkl_fused_depthwise_conv2d = "_MklFusedDepthwiseConv2dNative";
csinfo_.mkl_fused_matmul = "_MklFusedMatMul";
csinfo_.mkl_fused_batch_matmul = "_MklFusedBatchMatMul";
csinfo_.mkl_fused_batch_matmul_v2 = "_MklFusedBatchMatMulV2";
csinfo_.mkl_native_conv2d_with_bias = "_MklNativeConv2DWithBias";
csinfo_.mkl_native_fused_batch_norm_ex = "_MklNativeFusedBatchNormEx";
csinfo_.mkl_native_fused_conv2d = "_MklNativeFusedConv2D";
@@ -313,6 +317,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_.pad = "Pad";
csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
csinfo_.pad_with_fused_conv2d = "__MklDummyPadWithFusedConv2D";
csinfo_.mkl_quantized_fused_matmul = "_MklQuantizedFusedMatMul";
csinfo_.mkl_quantized_fused_matmul_and_dequantize =
"_MklQuantizedFusedMatMulAndDequantize";
csinfo_.mkl_quantized_fused_matmul_and_requantize =
"_MklQuantizedFusedMatMulAndRequantize";
csinfo_.quantized_avg_pool = "QuantizedAvgPool";
csinfo_.quantized_concatv2 = "QuantizedConcatV2";
csinfo_.quantized_conv2d = "QuantizedConv2D";
@@ -351,6 +360,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
"QuantizedDepthwiseConv2DWithBiasAndRelu";
csinfo_.quantized_depthwise_conv2d_with_bias_and_relu_and_requantize =
"QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize";
csinfo_.quantized_fused_matmul = "_QuantizedFusedMatMul";
csinfo_.quantized_fused_matmul_and_dequantize =
"_QuantizedFusedMatMulAndDequantize";
csinfo_.quantized_fused_matmul_and_requantize =
"_QuantizedFusedMatMulAndRequantize";
csinfo_.quantize_v2 = "QuantizeV2";
csinfo_.relu = "Relu";
csinfo_.relu_grad = "ReluGrad";
@@ -500,7 +514,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
: csinfo_.mkl_fused_matmul,
CopyAttrsAllCheckConstFilter, FusedMatMulRewrite,
GetRewriteCause()});

rinfo_.push_back({csinfo_.fused_batch_matmul,
csinfo_.mkl_fused_batch_matmul, CopyAttrsAll,
AlwaysRewrite, kRewriteForOpNameChange});
rinfo_.push_back({csinfo_.fused_batch_matmul_v2,
csinfo_.mkl_fused_batch_matmul_v2, CopyAttrsAll,
AlwaysRewrite, kRewriteForOpNameChange});
rinfo_.push_back(
{csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity),
CopyAttrsAll, RewriteIfAtleastOneMklInput, GetRewriteCause()});
@@ -653,6 +672,17 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_
.quantized_depthwise_conv2d_with_bias_and_relu_and_requantize),
CopyAttrsQuantizedConv2D, AlwaysRewrite, GetRewriteCause()});
rinfo_.push_back(
{csinfo_.quantized_fused_matmul, csinfo_.mkl_quantized_fused_matmul,
CopyAttrsQuantizedFusedMatMul, AlwaysRewrite, GetRewriteCause()});
rinfo_.push_back({csinfo_.quantized_fused_matmul_and_dequantize,
csinfo_.mkl_quantized_fused_matmul_and_dequantize,
CopyAttrsQuantizedFusedMatMul, AlwaysRewrite,
GetRewriteCause()});
rinfo_.push_back({csinfo_.quantized_fused_matmul_and_requantize,
csinfo_.mkl_quantized_fused_matmul_and_requantize,
CopyAttrsQuantizedFusedMatMul, AlwaysRewrite,
GetRewriteCause()});
rinfo_.push_back({csinfo_.quantize_v2,
mkl_op_registry::GetMklOpName(csinfo_.quantize_v2),
CopyAttrsAll, QuantizeOpRewrite, GetRewriteCause()});
@@ -935,6 +965,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string fused_conv2d;
string fused_depthwise_conv2d;
string fused_matmul;
string fused_batch_matmul;
string fused_batch_matmul_v2;
string identity;
string leakyrelu;
string leakyrelu_grad;
@@ -957,6 +989,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string mkl_fused_conv2d;
string mkl_fused_depthwise_conv2d;
string mkl_fused_matmul;
string mkl_fused_batch_matmul;
string mkl_fused_batch_matmul_v2;
string mkl_native_conv2d_with_bias;
string mkl_native_fused_batch_norm_ex;
string mkl_native_fused_conv2d;
@@ -966,6 +1000,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string mkl_native_pad_with_fused_conv2d;
string mkl_pad_with_conv2d;
string mkl_pad_with_fused_conv2d;
string mkl_quantized_fused_matmul;
string mkl_quantized_fused_matmul_and_dequantize;
string mkl_quantized_fused_matmul_and_requantize;
string mul;
string pad;
string pad_with_conv2d;
@@ -994,6 +1031,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
string quantized_depthwise_conv2d_with_bias;
string quantized_depthwise_conv2d_with_bias_and_relu;
string quantized_depthwise_conv2d_with_bias_and_relu_and_requantize;
string quantized_fused_matmul;
string quantized_fused_matmul_and_dequantize;
string quantized_fused_matmul_and_requantize;
string quantize_v2;
string relu;
string relu_grad;
@@ -2035,6 +2075,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
bool change_format = false);
static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb,
bool change_format = false);
static void CopyAttrsQuantizedFusedMatMul(const Node* orig_node,
NodeBuilder* nb,
bool change_format = false);
static void CopyFormatAttrsConv(const Node* orig_node, NodeBuilder* nb,
const std::vector<int32>& strides,
const std::vector<int32>& dilations,
@@ -2391,7 +2434,10 @@ Status MklLayoutRewritePass::SetUpInputs(
"QuantizedDepthwiseConv2D",
"QuantizedDepthwiseConv2DWithBias",
"QuantizedDepthwiseConv2DWithBiasAndRelu",
"QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize"};
"QuantizedDepthwiseConv2DWithBiasAndReluAndRequantize",
"_QuantizedFusedMatMul",
"_QuantizedFusedMatMulAndDequantize",
"_QuantizedFusedMatMulAndRequantize"};
bool should_check_workspace =
std::find(std::begin(quant_ops), std::end(quant_ops),
old_node->type_string()) == std::end(quant_ops);
@@ -2801,6 +2847,15 @@ void MklLayoutRewritePass::CopyAttrsQuantizedMatMulWithBiasAndDequantize(
if (bias_status.ToString() == "OK") nb->Attr("Tbias", Tbias);
}

void MklLayoutRewritePass::CopyAttrsQuantizedFusedMatMul(const Node* orig_node,
NodeBuilder* nb,
bool change_format) {
DataType Toutput;
TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Toutput", &Toutput));
CopyAttrsAll(orig_node, nb, change_format);
nb->Attr("T", Toutput); // added "T" for facilitating MklToTf conversion.
}

void MklLayoutRewritePass::CopyAttrsQuantizedMatMulWithBias(
const Node* orig_node, NodeBuilder* nb, bool change_format) {
DataType T1, T2, Toutput;
@@ -3688,10 +3743,15 @@ MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
mkl_op_registry::GetMklOpName(n->type_string()), Tinput, Tfilter)) {
type_attrs_present = true;
} else if (TryGetNodeAttr(n->def(), "T1", &T1) &&
TryGetNodeAttr(n->def(), "T2", &T2) &&
mkl_op_registry::IsMklLayoutDependentOp(
mkl_op_registry::GetMklOpName(n->type_string()), T1, T2)) {
type_attrs_present = true;
TryGetNodeAttr(n->def(), "T2", &T2)) {
StringPiece op_name(n->type_string());
// Assuming that we have T1 and T2 attribute for internal ops that is
// prefixed with "_".
absl::ConsumePrefix(&op_name, "_");
if (mkl_op_registry::IsMklLayoutDependentOp(
mkl_op_registry::GetMklOpName(string(op_name)), T1, T2)) {
type_attrs_present = true;
}
}

if (type_attrs_present) {
@@ -3749,6 +3809,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
n->type_string() != csinfo_.fused_conv2d &&
n->type_string() != csinfo_.fused_depthwise_conv2d &&
n->type_string() != csinfo_.fused_matmul &&
n->type_string() != csinfo_.fused_batch_matmul &&
n->type_string() != csinfo_.fused_batch_matmul_v2 &&
!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
T)) {
return nullptr;
86 changes: 86 additions & 0 deletions tensorflow/core/common_runtime/mkl_layout_pass_test.cc
@@ -2063,6 +2063,56 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedMatMul_Negative);
REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedMatMul_LeakyRelu_Positive);
#undef REGISTER_TEST

// Test set: _FusedBatchMatMul -> MklFusedBatchMatMul rewrite tests
#define REGISTER_TEST(NAME, T, INPUT) \
TEST_F(MklLayoutPassTest, NAME##_##T) { \
InitGraph( \
"node { name: 'A' op: '" #INPUT "'}" \
"node { name: 'B' op: '" #INPUT "'}" \
"node { name: 'C' op: '" #INPUT "'}" \
"node { name: 'D' op: '_FusedBatchMatMul'" \
" attr { key: 'T' value { type:" #T "} }" \
" attr { key: 'adj_x' value { b: false } }" \
" attr { key: 'adj_y' value { b: false } }" \
" attr { key: 'num_args' value { i: 1 } }" \
" attr { key: 'fused_ops' value { list: {s: 'Scale'} } }" \
" input: ['A', 'B', 'C']}" \
"node { name: 'Z' op: 'Zeta'" \
" attr {key: 'T' value { type: " #T " } }" \
" input: ['D', 'C']}"); \
EXPECT_EQ(DoMklLayoutOptimizationPass(), \
"A(" #INPUT ");B(" #INPUT ");C(" #INPUT ");" \
"D(_MklFusedBatchMatMul);Z(Zeta)" \
"|A->D;B->D:1;C->D:2;C->Z:1;D->Z"); \
}
REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchMatMul_Positive)
#undef REGISTER_TEST

// Test set: _FusedBatchMatMulV2 -> MklFusedBatchMatMulV2 rewrite tests
#define REGISTER_TEST(NAME, T, INPUT) \
TEST_F(MklLayoutPassTest, NAME##_##T) { \
InitGraph( \
"node { name: 'A' op: '" #INPUT "'}" \
"node { name: 'B' op: '" #INPUT "'}" \
"node { name: 'C' op: '" #INPUT "'}" \
"node { name: 'D' op: '_FusedBatchMatMulV2'" \
" attr { key: 'T' value { type:" #T "} }" \
" attr { key: 'adj_x' value { b: false } }" \
" attr { key: 'adj_y' value { b: false } }" \
" attr { key: 'num_args' value { i: 1 } }" \
" attr { key: 'fused_ops' value { list: {s: 'Scale'} } }" \
" input: ['A', 'B', 'C']}" \
"node { name: 'Z' op: 'Zeta'" \
" attr {key: 'T' value { type: " #T " } }" \
" input: ['D', 'C']}"); \
EXPECT_EQ(DoMklLayoutOptimizationPass(), \
"A(" #INPUT ");B(" #INPUT ");C(" #INPUT ");" \
"D(_MklFusedBatchMatMulV2);Z(Zeta)" \
"|A->D;B->D:1;C->D:2;C->Z:1;D->Z"); \
}
REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchMatMulV2_Positive)
#undef REGISTER_TEST

// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
// padding is VALID type
// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
@@ -5194,6 +5244,42 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
// not whole graphs
}
}

TEST_F(MklLayoutPassTest, QuantizedFusedMatMul) {
InitGraph(
"node { name: 'A' op: 'QInt8Input' }"
"node { name: 'B' op: 'QInt8Input' }"
"node { name: 'C' op: 'Float32Input' }"
"node { name: 'D' op: 'Float32Input' }"
"node { name: 'E' op: 'Float32Input' }"
"node { name: 'F' op: 'Float32Input' }"
"node { name: 'G' op: 'Float32Input' }"
"node { name: 'H' op: '_QuantizedFusedMatMul'"
" attr { key: 'T1' value { type: DT_QINT8 } }"
" attr { key: 'T2' value { type: DT_QINT8 } }"
" attr { key: 'Targs' value { type: DT_FLOAT } }"
" attr { key: 'Toutput' value { type: DT_QINT32 } }"
" attr { key: 'T' value { type: DT_QINT32 } }"
" attr { key: 'num_args' value { i: 1 } }"
" attr { key: 'transpose_a' value { b: false } }"
" attr { key: 'transpose_b' value { b: false } }"
" attr { key: 'fused_ops' value { list: {s: 'BiasAdd'} } }"
" attr { key: 'is_filter_const' value { b: true } }"
" attr { key: 'is_bias_const' value { b: true } }"
" input: ['A', 'B', 'C', 'D', 'E', 'F', 'G']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(QInt8Input);B(QInt8Input);C(Float32Input);D(Float32Input);"
"DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);"
"DMT/_4(Const);DMT/_5(Const);DMT/_6(Const);E(Float32Input);"
"F(Float32Input);G(Float32Input);H(_MklQuantizedFusedMatMul)|A->H;"
"A:control->DMT/_0:control;A:control->DMT/_1:control;"
"A:control->DMT/_2:control;A:control->DMT/_3:control;"
"A:control->DMT/_4:control;A:control->DMT/_5:control;"
"A:control->DMT/_6:control;B->H:1;C->H:2;D->H:3;DMT/_0->H:7;"
"DMT/_1->H:8;DMT/_2->H:9;DMT/_3->H:10;DMT/_4->H:11;DMT/_5->H:12;"
"DMT/_6->H:13;E->H:4;F->H:5;G->H:6");
}

BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);

} // namespace
2 changes: 1 addition & 1 deletion tensorflow/core/graph/mkl_graph_util.h
@@ -202,7 +202,7 @@ static inline bool IsMklLayoutDependentOp(const string& op_name, DataType T) {

// Restrict quantized ops to QUINT8 and QINT8 for now
if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
return (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32);
return (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32 || T == DT_FLOAT);
}
#ifdef ENABLE_INTEL_MKL_BFLOAT16
// Restrict regular ops to FLOAT and BFLOAT16
1 change: 1 addition & 0 deletions tensorflow/core/grappler/optimizers/BUILD
@@ -880,6 +880,7 @@ tf_kernel_library(
"//tensorflow/core/grappler/utils:graph_view",
"//tensorflow/core/grappler/utils:symbolic_shapes",
"//tensorflow/core/grappler/utils:topological_sort",
"//tensorflow/core/grappler/utils:pattern_utils",
] + if_mkl(["//tensorflow/core/graph:mkl_graph_util"]),
)
