
Commit a7c1aad

zhenlinluo authored and piiswrong committed
add MKL2017 experimental support for high perf (apache#3589)
* add MKL2017 experimental support for high perf
* move mkl2017 experiment config into use_mkl
* unify mklmemholder
* add new MKL 1005 support
* change mkl experimental config
* change back origin interface
1 parent 1931cf5 commit a7c1aad

19 files changed, +925 -89 lines

MKL_README.md (+5 -4)

@@ -14,15 +14,16 @@ Download MKL:
 1. Enable USE_MKL2017=1 in make/config.mk
    1.1 USE_BLAS should be atlas by default
    1.2 if USE_BLAS needs to be mkl, do a full MKL installation from https://registrationcenter.intel.com/en/forms/?productid=2558&licensetype=2
+   1.3 By default, USE_MKL2017_EXPERIMENTAL=0. With USE_MKL2017_EXPERIMENTAL=1, an MKL buffer is created and transferred between layers to achieve much higher performance.
 2. Run 'make -jX'
-   2.1 Makefile executes "prepare_mkl.sh" to download MKL under the root folder, e.g. <MXNET ROOTDIR>/mklml_lnx_2017.0.0.20160801
+   2.1 Makefile executes "prepare_mkl.sh" to download MKL under the root folder, e.g. <MXNET ROOTDIR>/mklml_lnx_<MKL VERSION>
    2.2 if the download fails because of a proxy setting, do it manually before make
-   2.2.1 wget https://github.com/intel/caffe/releases/download/self_containted_MKLGOLD/mklml_lnx_2017.0.0.20160801.tgz
-   2.2.2 tar zxvf mklml_lnx_2017.0.0.20160801.tgz
+   2.2.1 wget https://github.com/dmlc/web-data/raw/master/mxnet/mklml-release/mklml_lnx_<MKL VERSION>.tgz
+   2.2.2 tar zxvf mklml_lnx_<MKL VERSION>.tgz
 
 3. Navigate into the python directory
 4. Run 'sudo python setup.py install'
 5. Before executing a python script, set LD_LIBRARY_PATH:
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<MXNET ROOTDIR>/mklml_lnx_2017.0.0.20160801/lib
+   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<MXNET ROOTDIR>/mklml_lnx_<MKL VERSION>/lib
 ```

Makefile (+6 -4)

@@ -61,8 +61,12 @@ endif
 ifeq ($(USE_MKL2017), 1)
 	CFLAGS += -DMXNET_USE_MKL2017=1
 	CFLAGS += -DUSE_MKL=1
-	ifneq ($(USE_BLAS), mkl)
-	ICC_ON=0
+	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
+	CFLAGS += -DMKL_EXPERIMENTAL=1
+	else
+	CFLAGS += -DMKL_EXPERIMENTAL=0
+	endif
+	ICC_ON=0
 	RETURN_STRING=$(shell ./prepare_mkl.sh $(ICC_ON))
 	MKLROOT=$(firstword $(RETURN_STRING))
 	MKL_LDFLAGS=-l$(word 2, $(RETURN_STRING))

@@ -73,8 +77,6 @@ ifeq ($(MKL_EXTERNAL), 1)
 	LDFLAGS += -L$(MKLROOT)/lib/ -liomp5 -lmklml_gnu -lmklml_intel
 endif
 endif
-endif
-
 
 ifeq ($(USE_CUDNN), 1)
 	CFLAGS += -DMSHADOW_USE_CUDNN=1
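
In effect the feature is a pure compile-time switch: USE_MKL2017_EXPERIMENTAL=1 in make/config.mk becomes -DMKL_EXPERIMENTAL=1, and every experimental member and hook in the headers below sits behind that macro. A minimal sketch of the guard pattern (illustrative only; ExampleBlob is a hypothetical class, not from this commit):

```cpp
#include <memory>
#include <mxnet/mkl_memory.h>

// With the default -DMKL_EXPERIMENTAL=0 the extra member compiles away
// entirely, so non-experimental builds keep the stock class layout.
class ExampleBlob {
 public:
#if MKL_EXPERIMENTAL == 1
  std::shared_ptr<mxnet::MKLMemHolder> Mkl_mem_;  // experimental-only state
#endif
  void *dptr_ = nullptr;  // the ordinary CPU data pointer is always present
};
```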

include/mxnet/mkl_memory.h (+66)

@@ -46,10 +46,76 @@ struct PrvMemDescr {
   virtual PrvDescrType get_descr_type() = 0;
 };
 
+#if MKL_EXPERIMENTAL == 1
+// Note: HEAD_AT_PRV currently does not free the CPU data.
+enum SyncedHead {
+  HEAD_AT_CPU,
+  HEAD_AT_PRV,
+};
+struct MKLMemHolder {
+  SyncedHead head_;  // where the freshest copy of the data lives
+  std::shared_ptr<PrvMemDescr> prv_descriptor_;
+  bool b_disable_prv_2_cpu;  // when set, skip the next prv-to-cpu conversion
+  void disable_prv_2_cpu(bool flag) {
+    b_disable_prv_2_cpu = flag;
+  }
+  void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
+    head_ = HEAD_AT_PRV;
+    prv_descriptor_ = descriptor;
+  }
+  std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
+    return prv_descriptor_;
+  }
+  bool head_at_prv() {
+    return head_ == HEAD_AT_PRV;
+  }
+  void* prv_data() {
+    if (head_ != HEAD_AT_PRV) {
+      return NULL;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << "prv_descriptor_ is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return reinterpret_cast<void*>(prv_descriptor_->prv_ptr());
+  }
+
+  int prv_count() {
+    if (head_ != HEAD_AT_PRV) {
+      return 0;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << "prv_descriptor_ is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return prv_descriptor_->prv_count();
+  }
+  static std::shared_ptr<MKLMemHolder> create() {
+    return std::make_shared<MKLMemHolder>();
+  }
+  void check_and_prv_to_cpu(void *dptr_) {
+    if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
+      CHECK(prv_descriptor_ != nullptr);
+      prv_descriptor_->convert_from_prv(dptr_);
+      // The operator works on (and may modify) the CPU copy, so the head
+      // moves back to CPU.
+      head_ = HEAD_AT_CPU;
+    }
+    if (b_disable_prv_2_cpu) {
+      b_disable_prv_2_cpu = false;  // the flag suppresses one conversion only
+    }
+  }
+  MKLMemHolder() :
+    head_(HEAD_AT_CPU) {
+    prv_descriptor_ = NULL;
+    b_disable_prv_2_cpu = false;
+  }
+};
+#else
 struct MKLMemHolder {
  public:
   virtual std::shared_ptr<PrvMemDescr> get_prv_descriptor() = 0;
 };
+#endif
 
 }  // namespace mxnet
 #endif  // MXNET_MKL_MEMORY_H_
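
To make the intended life cycle concrete, here is a minimal usage sketch (not part of the commit; it assumes a build with MKL_EXPERIMENTAL == 1 and some concrete PrvMemDescr implementation behind descr):

```cpp
#include <memory>
#include <mxnet/mkl_memory.h>

// Hypothetical walk-through of the two-state sync protocol above.
void lifecycle(std::shared_ptr<mxnet::PrvMemDescr> descr, float *cpu_ptr) {
  mxnet::MKLMemHolder holder;             // constructed as HEAD_AT_CPU
  holder.set_prv_descriptor(descr);       // an MKL layer wrote its output in
                                          // private layout; head moves to PRV
  if (holder.head_at_prv()) {
    void *prv = holder.prv_data();        // the next MKL layer consumes the
    (void)prv;                            // private buffer with no conversion
  }
  holder.check_and_prv_to_cpu(cpu_ptr);   // a plain CPU operator touches the
                                          // data: convert once, head is CPU again
}
```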

include/mxnet/ndarray.h (+23 -2)

@@ -18,7 +18,9 @@
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-
+#if MKL_EXPERIMENTAL == 1
+#include "./mkl_memory.h"
+#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 is required for the ndarray module"

@@ -31,7 +33,11 @@ namespace mxnet {
 class NDArray {
  public:
   /*! \brief default constructor */
-  NDArray() {}
+  NDArray() {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = MKLMemHolder::create();
+#endif
+  }
   /*!
    * \brief constructing a new dynamic NDArray
    * \param shape the shape of array

@@ -43,6 +49,9 @@
           bool delay_alloc = false, int dtype = mshadow::default_type_flag)
       : ptr_(std::make_shared<Chunk>(shape.Size(), ctx, delay_alloc, dtype)),
         shape_(shape), offset_(0), dtype_(dtype) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*!
    * \brief constructing a static NDArray that shares data with TBlob

@@ -54,6 +63,9 @@
   NDArray(const TBlob &data, int dev_id)
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_), offset_(0),
         dtype_(data.type_flag_) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*!
    * \return the shape of current NDArray

@@ -70,6 +82,9 @@
       res = TBlob(static_cast<DType*>(ptr_->shandle.dptr)
         + offset_, shape_, ptr_->shandle.ctx.dev_mask());
     });
+#if MKL_EXPERIMENTAL == 1
+    res.Mkl_mem_ = Mkl_mem_;
+#endif
     return res;
   }
   /*!

@@ -358,6 +373,10 @@
     }
   }
 };
+
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
  /*! \brief internal data of NDArray */
  std::shared_ptr<Chunk> ptr_;
  /*! \brief shape of current NDArray */

@@ -380,6 +399,8 @@
  * due to different possible convention carried by copy function.
  */
 void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
+
+
 /*!
  * \brief Perform elementwise sum over each data from source, store result into out.
  * \param source the ndarray we want to sum
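
The point of these constructor changes: one MKLMemHolder follows an NDArray for its whole lifetime, and data() copies the shared_ptr into the returned TBlob, so the array and every view taken from it agree on where the freshest copy lives. A hedged sketch (the shape and Context::CPU() call are illustrative, not from the diff):

```cpp
#include <mxnet/ndarray.h>

// Illustrative only: the NDArray and the TBlob view returned by data()
// share one MKLMemHolder, so a conversion forced through the view is
// visible to every other view of the same array.
void shared_holder() {
  mxnet::NDArray arr(mshadow::Shape2(2, 3), mxnet::Context::CPU());
  mxnet::TBlob view = arr.data();  // copies Mkl_mem_, not the tensor data
#if MKL_EXPERIMENTAL == 1
  if (view.Mkl_mem_ != nullptr && view.Mkl_mem_->head_at_prv()) {
    // an earlier MKL layer left its freshest output in the private buffer
  }
#endif
}
```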

include/mxnet/tensor_blob.h (+43 -5)

@@ -16,7 +16,9 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
-
+#if MXNET_USE_MKL2017 == 1
+#include "./mkl_memory.h"
+#endif
 namespace mxnet {
 
 /*!

@@ -551,10 +553,19 @@ class TBlob {
   int dev_mask_;
   /*! \brief type flag of the tensor blob */
   int type_flag_;
+
+  /*! \brief MKL chunk buffer blob; experimental use only */
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL), dev_mask_(cpu::kDevMask),
-        type_flag_(mshadow::DataType<real_t>::kFlag) {}
+        type_flag_(mshadow::DataType<real_t>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
   /*!
    * \brief constructor that construct TBlob from contiguous memory
    * \param dptr the pointer to the memory

@@ -568,7 +579,12 @@
       : dptr_(dptr), shape_(shape),
         stride_(shape[shape.ndim() - 1]),
         dev_mask_(dev_mask),
-        type_flag_(mshadow::DataType<DType>::kFlag) {}
+        type_flag_(mshadow::DataType<DType>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
+
   /*!
    * \brief constructor that construct TBlob from contiguous memory
    * \param dptr the pointer to the memory

@@ -583,7 +599,11 @@
       : dptr_(dptr), shape_(shape),
         stride_(shape[shape.ndim() - 1]),
         dev_mask_(dev_mask),
-        type_flag_(type_flag) {}
+        type_flag_(type_flag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
   /*!
    * \brief constructor from tensor
    * \param src source tensor

@@ -594,6 +614,9 @@
   template<typename Device, int dim, typename DType>
   TBlob(const mshadow::Tensor<Device, dim, DType> &src) {  // NOLINT(*)
     *this = src;
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
   }
   /*!
    * \brief assignment from tensor

@@ -628,12 +651,17 @@
   */
   template<typename Device, typename DType>
   inline mshadow::Tensor<Device, 2, DType> FlatTo2D(
-    mshadow::Stream<Device> *stream = NULL) const {
+      mshadow::Stream<Device> *stream = NULL) const {
     CHECK(Device::kDevMask == dev_mask_)
       << "TBlob.get: device type does not match specified type";
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
      << "TBlob.get_with_shape: data type does not match specified type."
      << " Expected: " << type_flag_ << " vs. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_),
                                              shape_.FlatTo2D(), stride_, stream);
   }

@@ -682,6 +710,11 @@
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
      << "TBlob.get_with_shape: data type does not match specified type."
      << " Expected: " << type_flag_ << " vs. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, dim, DType>(static_cast<DType*>(dptr_),
                                                shape_.get<dim>(),
                                                stride_, stream);

@@ -708,6 +741,11 @@
     CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be contiguous";
     CHECK_EQ(this->shape_.Size(), shape.Size())
      << "TBlob.get_with_shape: new and old shape do not match total elements";
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, dim, DType>(static_cast<DType*>(dptr_),
                                                shape,
                                                shape[dim - 1],
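
The same five-line hook recurs in FlatTo2D, get, get_with_shape, and get_reshape because those are the only gateways to a raw CPU tensor: operators stay unmodified and the conversion happens lazily on the first CPU access after an MKL layer. A sketch (illustrative, assumes a build with MKL_EXPERIMENTAL == 1):

```cpp
#include <mxnet/tensor_blob.h>

// Illustrative only: taking the first CPU-side view after an MKL layer runs
// check_and_prv_to_cpu(dptr_), which converts the private buffer back into
// dptr_ and flips the head to HEAD_AT_CPU; later views pay nothing.
float first_element(const mxnet::TBlob &blob) {
  mshadow::Tensor<mshadow::cpu, 2, float> flat =
      blob.FlatTo2D<mshadow::cpu, float>(nullptr);  // lazy sync happens here
  return flat[0][0];
}
```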

make/config.mk (+3)

@@ -62,6 +62,9 @@ USE_OPENMP = 1
 # whether to use the MKL2017 library
 USE_MKL2017 = 0
 
+# whether to use the MKL2017 experimental feature for high performance
+USE_MKL2017_EXPERIMENTAL = 0
+
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas
 # by default use atlas for linux and apple for osx

prepare_mkl.sh (+3 -4)

@@ -64,11 +64,10 @@ echo $VERSION_LINE # Return Version Line
 # MKL
 DST=`dirname $0`
 OMP=0
-VERSION_MATCH=20160706
-ARCHIVE_BASENAME=mklml_lnx_2017.0.0.20160801.tgz
+VERSION_MATCH=20120601
+ARCHIVE_BASENAME=mklml_lnx_2017.0.1.20161005.tgz
 MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev`
-GITHUB_RELEASE_TAG=self_containted_MKLGOLD
-MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
+MKLURL="https://github.com/dmlc/web-data/raw/master/mxnet/mklml-release/$ARCHIVE_BASENAME"
 # there are different MKL libs to be used for GCC and for ICC
 reg='^[0-9]+$'
 VERSION_LINE=`GetVersionName $MKLROOT`
