
Commit f39607a

Merge pull request intel#1 from otcshare/ipex_dlrm
enable automix bf16 interaction, emb fw
2 parents e904bb3 + e4afb04 commit f39607a

File tree

14 files changed: +997 −36 lines

examples/README.md

Lines changed: 112 additions & 0 deletions
# Guide to running auto-mixed precision (BF16) models with Intel Extension for PyTorch

## Verified on

| Item | Value |
| -: | :- |
| OS | Ubuntu 18.04 LTS |
| Compiler | gcc 7.5.0 |
| Memory | DDR4 3200MHz, 96GB/socket |

## Environment setup

1. Install Anaconda3
```
wget https://repo.continuum.io/archive/Anaconda3-5.0.0-Linux-x86_64.sh -O anaconda3.sh
chmod +x anaconda3.sh
./anaconda3.sh -b -p ~/anaconda3
./anaconda3/bin/conda create -n ipex python=3.7
```

2. Set up the Anaconda virtual environment for IPEX
```
export PATH=~/anaconda3/bin:$PATH
source ./anaconda3/bin/activate ipex
```

3. Install dependencies
```
conda config --append channels intel
conda install ninja pyyaml setuptools cmake cffi typing numpy
conda install mkl intel-openmp mkl-include -c intel --no-update-deps
conda install jemalloc
```

4. Clone the source code and build
```
# PyTorch
git clone https://github.com/otcshare/pytorch.git
cd pytorch
git checkout tags/v1.7.0 -b v1.7.0
git submodule sync && git submodule update --init --recursive
cd ..

# extension
git clone https://github.com/otcshare/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout dlrm
git submodule update --init --recursive
cd ..

# copy the patches into the PyTorch tree
cp {path/to/intel-pytorch-extension}/torch_patches/dlrm_fp32.patch {path/to/pytorch}/
cp {path/to/intel-pytorch-extension}/torch_patches/xpu-1.7.patch {path/to/pytorch}/

# build PyTorch
cd {path/to/pytorch}
patch -p1 < xpu-1.7.patch
patch -p1 < dlrm_fp32.patch
pip install -r requirements.txt
python setup.py install

# build the extension
cd {path/to/intel-pytorch-extension}
pip install -r requirements.txt
cd third_party/mkl-dnn
patch -p1 < ../../torch_patches/FIFO.diff
cd ../../
python setup.py install
```

## Prepare DLRM

```
git clone https://github.com/otcshare/dlrm.git
cd dlrm
pip install -r requirements.txt
```

## Run models

```
export DATASET_PATH={path/to/dlrm_dataset}
```

1. Inference with vanilla PyTorch
```
bash run_inference.sh
```

2. Inference with IPEX FP32
```
bash run_inference.sh ipex
```

3. Inference with IPEX BF16
```
bash run_inference.sh ipex bf16
```

4. Training with vanilla PyTorch
```
bash run_training.sh
```

5. Training with IPEX FP32
```
bash run_training.sh ipex
```

6. Training with IPEX BF16
```
bash run_training.sh ipex bf16
```
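For orientation, the `ipex` / `bf16` switches in the run scripts boil down to a pattern like the following at the Python level. This is a minimal sketch, assuming `ipex.DEVICE` and the `AutoDNNL`/`AutoMixPrecision` helpers from `tests/cpu/common_ipex_conf.py` in this repo; the actual `run_*.sh` scripts may wire things up differently.

```python
# Hedged sketch: move a model to the IPEX device, then enable the DNNL and
# bf16 auto-mix paths via the context managers the tests in this PR use.
import torch
import torch.nn as nn
import intel_pytorch_extension as ipex
from common_ipex_conf import AutoDNNL, AutoMixPrecision  # test-only helpers

model = nn.Linear(128, 1).to(ipex.DEVICE)       # fp32 weights on the IPEX device
x = torch.randn(2048, 128, device=ipex.DEVICE)  # fp32 activations

with AutoDNNL(True), AutoMixPrecision(True):
    y = model(x)  # executed through DNNL; storage is reordered to bf16
```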

intel_pytorch_extension_py/ops/embeddingbag.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -1,13 +1,17 @@
 import torch
 from torch import nn
 from torch.autograd import Function
+import intel_pytorch_extension as ipex
 import _torch_ipex as core
 
 # # extension for BF16 fast path only
 
-
+torch_embedding_bag = torch.embedding_bag
 def embeddingbag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset):
-    ret = torch.ops.torch_ipex.embedding_bag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset)
+    if weights.device.type in ipex.DEVICE:
+        ret = torch.ops.torch_ipex.embedding_bag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset)
+    else:
+        ret = torch_embedding_bag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset)
     if len(ret)==1:
         ret += [torch.Tensor(), torch.Tensor(), torch.Tensor()]
     return ret
```
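The new branch makes the monkey-patched `torch.embedding_bag` safe for plain CPU tensors: anything whose device is not `ipex.DEVICE` falls back to the stock implementation saved in `torch_embedding_bag`. A hedged usage sketch, mirroring `tests/cpu/test_emb.py` (it assumes importing `intel_pytorch_extension` installs the patch, as this ops module suggests):

```python
# Same module, two devices, two code paths.
import torch
import torch.nn as nn
import intel_pytorch_extension as ipex  # importing patches torch.embedding_bag

emb = nn.EmbeddingBag(10, 3, mode='sum', sparse=True)
idx = torch.LongTensor([1, 2, 4, 5])
off = torch.LongTensor([0, 2])

cpu_out = emb(idx, off)  # CPU weights -> falls back to torch_embedding_bag

emb_xpu = emb.to(ipex.DEVICE)
xpu_out = emb_xpu(idx.to(ipex.DEVICE), off.to(ipex.DEVICE))  # IPEX kernel
```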

intel_pytorch_extension_py/ops/interaction.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ def interaction(*args):
     # So we preserve python custom function while need backward
     # Since python custom function will meet GIL when run multi-thread in one process
     # We will drop python custom function after c++ are supported
-    if torch.is_grad_enabled():
+    if torch.is_grad_enabled() and core.get_train():
         return InteractionFunc.apply(*args)
     return torch.ops.torch_ipex.interaction_forward(args)
```
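With this change, the Python `InteractionFunc` (which exists only to provide backward and serializes under the GIL in multi-threaded runs) is used only when gradients are enabled and the extension is in training mode; otherwise the fused C++ forward is called directly. A minimal sketch of the inference path, assuming `ipex.DEVICE` and `ipex.interaction` as used in `tests/cpu/test_interaction.py`:

```python
# Under torch.no_grad(), torch.is_grad_enabled() is False, so the call below
# goes straight to torch.ops.torch_ipex.interaction_forward.
import torch
import intel_pytorch_extension as ipex

dense = torch.randn(32, 16, device=ipex.DEVICE)
sparse = [torch.randn(32, 16, device=ipex.DEVICE) for _ in range(3)]

with torch.no_grad():
    out = ipex.interaction(dense, *sparse)  # fused forward, no Python autograd
```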

tests/cpu/test_emb.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -5,7 +5,23 @@
 import copy
 from common_utils import TestCase
 
+from common_ipex_conf import AutoMixPrecision, AutoDNNL
+
 class TestEMB(TestCase):
+    def test_automix_emb(self):
+        EE = nn.EmbeddingBag(10, 3, mode='sum', sparse=True)
+        emb_auto_mix = copy.deepcopy(EE).to(ipex.DEVICE)
+        emb_auto_mix.weight.requires_grad = False
+        input = torch.LongTensor([1,2,4,5,4,3,2,9])
+        offsets = torch.LongTensor([0,1,2,3,4,5,6,7])
+        res_fp32 = EE(input, offsets)
+
+        with AutoDNNL(True), AutoMixPrecision(True):
+            res_auto_mix = emb_auto_mix(input.to(ipex.DEVICE), offsets.to(ipex.DEVICE))
+            self.assertEqual(res_auto_mix.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(res_auto_mix))
+            self.assertTrue(torch.allclose(res_fp32, res_auto_mix, rtol=1e-5, atol=1e-2))
+
     def test_emb(self):
         #E = nn.EmbeddingBag(10, 5, mode="sum", sparse=True)
         cpu_emb = nn.EmbeddingBag(10, 3, mode='sum', sparse=True)
```

tests/cpu/test_interaction.py

Lines changed: 53 additions & 28 deletions
```diff
@@ -21,47 +21,72 @@
     IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \
     skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf
 
+from common_ipex_conf import AutoMixPrecision, AutoDNNL
+
 class TestInteractionCases(TestCase):
-    def test_interaction(self):
-        def interact_fusion(x, ly):
-            A = [x] + ly
-            R = ipex.interaction(*A)
-            return R
+    def interact_fusion(self, x, ly):
+        A = [x] + ly
+        R = ipex.interaction(*A)
+        return R
 
-        def interact_features(x, ly):
-            (batch_size, d) = x.shape
-            T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
-            # Z = pcl_embedding_bag.bdot(T)
-            Z = torch.bmm(T, torch.transpose(T, 1, 2))
-            _, ni, nj = Z.shape
-            offset = 0
-            li = torch.tensor([i for i in range(ni) for j in range(i + offset)], device=ipex.DEVICE)
-            lj = torch.tensor([j for i in range(nj) for j in range(i + offset)], device=ipex.DEVICE)
-            Zflat = Z[:, li, lj]
-            # concatenate dense features and interactions
-            R = torch.cat([x] + [Zflat], dim=1)
-            return R
+    def interact_features(self, x, ly):
+        (batch_size, d) = x.shape
+        T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
+        # Z = pcl_embedding_bag.bdot(T)
+        Z = torch.bmm(T, torch.transpose(T, 1, 2))
+        _, ni, nj = Z.shape
+        offset = 0
+        li = torch.tensor([i for i in range(ni) for j in range(i + offset)], device=ipex.DEVICE)
+        lj = torch.tensor([j for i in range(nj) for j in range(i + offset)], device=ipex.DEVICE)
+        Zflat = Z[:, li, lj]
+        # concatenate dense features and interactions
+        R = torch.cat([x] + [Zflat], dim=1)
+        return R
 
-        dtypes=[torch.float32]
-        for dtype in dtypes:
+    def get_input(self, dtype):
         x1 = torch.randn([2048, 128], device=ipex.DEVICE).to(dtype).clone().detach().requires_grad_()
-        x2 = x1.clone().detach().requires_grad_()
+        x1_clone = x1.clone().detach().requires_grad_()
         ly1 = []
-        ly2 = []
+        ly1_clone = []
         for i in range(0, 26):
             V = torch.randn([2048, 128], device=ipex.DEVICE).to(dtype).clone().detach().requires_grad_()
             ly1.append(V)
-            ly2.append(V.clone().detach().requires_grad_())
+            ly1_clone.append(V.clone().detach().requires_grad_())
+        return x1, ly1, x1_clone, ly1_clone
 
-        A = interact_fusion(x1, ly1)
-        B = interact_features(x2, ly2)
-        self.assertEqual(A, B)
+    def test_interaction_fusion(self):
+        dtypes = [torch.float32, torch.bfloat16]
+        for dtype in dtypes:
+            x1, ly1, x1_clone, ly1_clone = self.get_input(dtype)
+            ipex.core.set_execution_mode(train=True)
+            A = self.interact_fusion(x1, ly1).to(torch.float32)
+            B = self.interact_features(x1_clone, ly1_clone).to(torch.float32)
+            self.assertTrue(torch.allclose(A, B, rtol=1e-4, atol=1e-4))
 
             A.mean().backward()
             B.mean().backward()
-            self.assertEqual(x1.grad, x2.grad)
+            self.assertTrue(torch.allclose(x1.grad.to(torch.float32), x1_clone.grad.to(torch.float32), rtol=1e-4, atol=1e-4))
             for i in range(0, 26):
-                self.assertEqual(ly1[i].grad, ly2[i].grad)
+                self.assertTrue(torch.allclose(ly1[i].grad.to(torch.float32), ly1_clone[i].grad.to(torch.float32), rtol=1e-4, atol=1e-4))
+
+    def test_automix_fused_interaction(self):
+        x1, ly1, x1_clone, ly1_clone = self.get_input(torch.float32)
+        man_bf16_x1 = x1_clone.to(torch.bfloat16)
+        man_bf16_ly1 = [y.to(torch.bfloat16) for y in ly1_clone]
+        with AutoDNNL(True), AutoMixPrecision(False):
+            self.assertEqual(man_bf16_x1.dtype, torch.bfloat16)
+            for i in range(0, 26):
+                self.assertEqual(man_bf16_ly1[i].dtype, torch.bfloat16)
+            res_man_bf16 = self.interact_fusion(man_bf16_x1, man_bf16_ly1)
+            self.assertEqual(res_man_bf16.dtype, torch.bfloat16)
+
+            with AutoMixPrecision(True):
+                res_auto_bf16 = self.interact_fusion(x1, ly1)
+                self.assertTrue(ipex.core.is_bf16_dil_tensor(x1))
+                for i in range(0, 26):
+                    self.assertTrue(ipex.core.is_bf16_dil_tensor(ly1[i]))
+                self.assertTrue(ipex.core.is_bf16_dil_tensor(res_auto_bf16))
+                self.assertEqual(res_man_bf16.to(torch.float32), res_auto_bf16)
 
 if __name__ == '__main__':
     test = unittest.main()
```

torch_ipex/csrc/cpu/CustomOPs.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -83,13 +83,13 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
             input.sizes(),
             grad_output.is_contiguous() ? grad_output
                                         : grad_output.contiguous(),
-            weight.is_contiguous() ? weight : weight.contiguous());
+            weight);
     std::tie(grad_weight, grad_bias) =
         torch_ipex::cpu::AtenIpexCPUDev::dil_linear_backward_weights(
             grad_output.is_contiguous() ? grad_output
                                         : grad_output.contiguous(),
             input.is_contiguous() ? input : input.contiguous(),
-            weight.is_contiguous() ? weight : weight.contiguous(),
+            weight,
             bias.defined());
     return {grad_input, grad_weight, grad_bias};
   }
```

torch_ipex/csrc/cpu/DevOPs.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1049,7 +1049,7 @@ at::Tensor AtenIpexCPUDev::dil_linear(
   // reshape first if input dim is greater than 2 and the reshape will cost a memory copy.
   auto self_reshaped = self.dim() > 2 ? dil_reshape(self, {-1, dil_size(self, self.dim() - 1)}) : self;
   const dil::tensor x = dbl::comm::try_gen_dil_tensor(self_reshaped);
-  if (!check_train() && check_tensor_own_whole_storage(weight)) {
+  if (!(check_auto_mix_bf16_fp32() && check_train()) && check_tensor_own_whole_storage(weight)) {
     dbl::linear::prepack_linear_weights(self_reshaped, x, weight);
   }
   const dil::tensor w = dbl::comm::try_gen_dil_tensor(weight);
```

torch_ipex/csrc/cpu/ExtendOPs.cpp

Lines changed: 39 additions & 1 deletion
```diff
@@ -12,6 +12,8 @@
 #include <algorithm>
 #include <c10/util/Exception.h>
 #include <torch/csrc/autograd/function.h>
+#include "ShadeDataContext.h"
+#include "torch_ipex/csrc/cpu/bf16/Bridge.hpp"
 
 namespace torch_ipex {
 
@@ -233,7 +235,7 @@ inline at::Tensor _interaction_forward(const std::vector<at::Tensor> &input) {
   std::vector<T *> input_data(input.size());
   for (int i = 0; i < input.size(); i++) {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].is_contiguous());
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].device().is_xpu());
+    // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].device().is_xpu());
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input[i].dim() == 2);
     feature_sizes[i] = input[i].sizes()[1];
     total_feature_size += input[i].sizes()[1];
@@ -343,8 +345,24 @@ _interaction_backward(const at::Tensor &grad_out,
   return output;
 }
 
+at::Tensor
+interaction_forward_auto_mix_dispatch(const std::vector<at::Tensor> &input) {
+  for (auto &in : input) {
+    cpu::dbl::comm::reorder_to_bf16_for_mix_prec(in);
+    IPEX_CHECK(cpu::ShadeDataContext::isTensorMixPrecisionBF16(in));
+  }
+  const std::vector<at::Tensor>& consistent_input = cpu::bf16::gen_consistent_tensorlist(input);
+  auto &&_ipex_result = _interaction_forward<at::BFloat16>(consistent_input);
+  return cpu::bf16::gen_mix_prec_tensor(_ipex_result);
+}
+
 at::Tensor
 AtenIpexTypeExt::interaction_forward(const std::vector<at::Tensor> &input) {
+  bool auto_mix_bf16 = check_auto_mix_bf16_fp32();
+  if (auto_mix_bf16) {
+    return interaction_forward_auto_mix_dispatch(input);
+  }
+  // preserve support for the original PyTorch bfloat16 path
   if (input[0].scalar_type() == at::kFloat) {
     for (auto &in : input) {
       cpu::dbl::comm::reorder_to_public(in);
@@ -360,9 +378,29 @@ AtenIpexTypeExt::interaction_forward(const std::vector<at::Tensor> &input) {
   }
 }
 
+std::vector<at::Tensor>
+interaction_backward_auto_mix_dispatch(const at::Tensor &grad_out,
+                                       const std::vector<at::Tensor> &input) {
+  for (auto &in : input) {
+    cpu::dbl::comm::reorder_to_bf16_for_mix_prec(in);
+    IPEX_CHECK(cpu::ShadeDataContext::isTensorMixPrecisionBF16(in));
+  }
+  cpu::dbl::comm::reorder_to_bf16_for_mix_prec(grad_out);
+  IPEX_CHECK(cpu::ShadeDataContext::isTensorMixPrecisionBF16(grad_out));
+  const std::vector<at::Tensor>& consistent_input = cpu::bf16::gen_consistent_tensorlist(input);
+  const at::Tensor& consistent_grad_out = cpu::bf16::gen_consistent_tensor(grad_out);
+  auto &&_ipex_result = _interaction_backward<at::BFloat16>(consistent_grad_out, consistent_input);
+  return cpu::bf16::gen_mix_prec_tensorlist(_ipex_result);
+}
+
 std::vector<at::Tensor>
 AtenIpexTypeExt::interaction_backward(const at::Tensor &grad_out,
                                       const std::vector<at::Tensor> &input) {
+  bool auto_mix_bf16 = check_auto_mix_bf16_fp32();
+  if (auto_mix_bf16) {
+    return interaction_backward_auto_mix_dispatch(grad_out, input);
+  }
+  // preserve support for the original PyTorch bfloat16 path
   if (grad_out.scalar_type() == at::kFloat) {
     cpu::dbl::comm::reorder_to_public(grad_out);
     return _interaction_backward<float>(grad_out, input);
```
torch_ipex/csrc/cpu/ShadeDataContext.h

Lines changed: 20 additions & 0 deletions
```diff
@@ -294,6 +294,26 @@ struct ShadeDataContext {
     ShadeDataContext *shade_data_context = (ShadeDataContext*)storage_context;
     shade_data_context->packed = value;
   }
+
+  static inline bool isTensorMixPrecisionBF16(const at::Tensor &tensor) {
+    auto dil_tensor_type = getDilStorage(tensor).get_data_type();
+    auto aten_tensor_type = tensor.scalar_type();
+    if (aten_tensor_type != at::kFloat) {
+      return false;
+    }
+    auto res = (dil_tensor_type == dil::data_type::bf16);
+
+    // Check mix_precision
+    void *raw_context = tensor.storage().data_ptr().get_context();
+    ShadeDataContext *shade_data_context = (ShadeDataContext*)raw_context;
+    if (shade_data_context->mix_prec_type == MIX_PREC_TYPE::MIX_BF16_FP32) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(res);
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!res);
+    }
+
+    return res;
+  }
 };
 
 } // namespace cpu
```