
Commit a7c1aad

zhenlinluo authored and piiswrong committed
add MKL2017 experimental support for high perf (apache#3589)
* add MKL2017 experimental support for high perf
* move mkl2017 experiment config into use_mkl
* unify mklmemholder
* add new MKL 1005 support
* change mkl experimental config
* change back origin interface
1 parent 1931cf5 commit a7c1aad

19 files changed, +925 -89 lines

MKL_README.md (+5 -4)

@@ -14,15 +14,16 @@ Download MKL:
 1. Enable USE_MKL2017=1 in make/config.mk
    1.1 USE_BLAS should be atlas by default
    1.2 if USE_BLAS needs to be mkl, do a full MKL installation from https://registrationcenter.intel.com/en/forms/?productid=2558&licensetype=2
+   1.3 By default, USE_MKL2017_EXPERIMENTAL=0. With USE_MKL2017_EXPERIMENTAL=1, an MKL buffer is created and transferred between layers to achieve much higher performance.
 2. Run 'make -jX'
-   2.1 Makefile executes "prepare_mkl.sh" to download MKL under the root folder, e.g. <MXNET ROOTDIR>/mklml_lnx_2017.0.0.20160801
+   2.1 Makefile executes "prepare_mkl.sh" to download MKL under the root folder, e.g. <MXNET ROOTDIR>/mklml_lnx_<MKL VERSION>
    2.2 if the download fails because of a proxy setting, do it manually before make
-   2.2.1 wget https://github.com/intel/caffe/releases/download/self_containted_MKLGOLD/mklml_lnx_2017.0.0.20160801.tgz
-   2.2.2 tar zxvf mklml_lnx_2017.0.0.20160801.tgz
+   2.2.1 wget https://github.com/dmlc/web-data/raw/master/mxnet/mklml-release/mklml_lnx_<MKL VERSION>.tgz
+   2.2.2 tar zxvf mklml_lnx_<MKL VERSION>.tgz
 
 3. Navigate into the python directory
 4. Run 'sudo python setup.py install'
 5. Before executing a python script, set LD_LIBRARY_PATH:
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<MXNET ROOTDIR>/mklml_lnx_2017.0.0.20160801/lib
+   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<MXNET ROOTDIR>/mklml_lnx_<MKL VERSION>/lib
 ```

Makefile (+6 -4)

@@ -61,8 +61,12 @@ endif
 ifeq ($(USE_MKL2017), 1)
 	CFLAGS += -DMXNET_USE_MKL2017=1
 	CFLAGS += -DUSE_MKL=1
-	ifneq ($(USE_BLAS), mkl)
-	ICC_ON=0
+	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
+	CFLAGS += -DMKL_EXPERIMENTAL=1
+	else
+	CFLAGS += -DMKL_EXPERIMENTAL=0
+	endif
+	ICC_ON=0
 	RETURN_STRING=$(shell ./prepare_mkl.sh $(ICC_ON))
 	MKLROOT=$(firstword $(RETURN_STRING))
 	MKL_LDFLAGS=-l$(word 2, $(RETURN_STRING))

@@ -73,8 +77,6 @@ ifeq ($(MKL_EXTERNAL), 1)
 	LDFLAGS += -L$(MKLROOT)/lib/ -liomp5 -lmklml_gnu -lmklml_intel
 endif
 endif
-endif
-
 
 ifeq ($(USE_CUDNN), 1)
 	CFLAGS += -DMSHADOW_USE_CUDNN=1
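
In effect the feature is a pure compile-time switch: USE_MKL2017_EXPERIMENTAL=1 in make/config.mk becomes -DMKL_EXPERIMENTAL=1, and every experimental member and hook in the headers below sits behind that macro. A minimal sketch of the guard pattern (illustrative only; ExampleBlob is a hypothetical class, not from this commit):

```cpp
#include <memory>
#include <mxnet/mkl_memory.h>

// With the default -DMKL_EXPERIMENTAL=0 the extra member compiles away
// entirely, so non-experimental builds keep the stock class layout.
class ExampleBlob {
 public:
#if MKL_EXPERIMENTAL == 1
  std::shared_ptr<mxnet::MKLMemHolder> Mkl_mem_;  // experimental-only state
#endif
  void *dptr_ = nullptr;  // the ordinary CPU data pointer is always present
};
```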

include/mxnet/mkl_memory.h (+66)

@@ -46,10 +46,76 @@ struct PrvMemDescr {
   virtual PrvDescrType get_descr_type() = 0;
 };
 
+#if MKL_EXPERIMENTAL == 1
+// Note: HEAD_AT_PRV currently does not free the CPU data.
+enum SyncedHead {
+  HEAD_AT_CPU,
+  HEAD_AT_PRV,
+};
+struct MKLMemHolder {
+  SyncedHead head_;  // where the freshest copy of the data lives
+  std::shared_ptr<PrvMemDescr> prv_descriptor_;
+  bool b_disable_prv_2_cpu;  // when set, skip the next prv-to-cpu conversion
+  void disable_prv_2_cpu(bool flag) {
+    b_disable_prv_2_cpu = flag;
+  }
+  void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
+    head_ = HEAD_AT_PRV;
+    prv_descriptor_ = descriptor;
+  }
+  std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
+    return prv_descriptor_;
+  }
+  bool head_at_prv() {
+    return head_ == HEAD_AT_PRV;
+  }
+  void* prv_data() {
+    if (head_ != HEAD_AT_PRV) {
+      return NULL;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << "prv_descriptor_ is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return reinterpret_cast<void*>(prv_descriptor_->prv_ptr());
+  }
+
+  int prv_count() {
+    if (head_ != HEAD_AT_PRV) {
+      return 0;
+    }
+    if (prv_descriptor_ == NULL) {
+      LOG(FATAL) << "prv_descriptor_ is NULL";
+    }
+    CHECK(prv_descriptor_.get());
+    return prv_descriptor_->prv_count();
+  }
+  static std::shared_ptr<MKLMemHolder> create() {
+    return std::make_shared<MKLMemHolder>();
+  }
+  void check_and_prv_to_cpu(void *dptr_) {
+    if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
+      CHECK(prv_descriptor_ != nullptr);
+      prv_descriptor_->convert_from_prv(dptr_);
+      // The operator works on (and may modify) the CPU copy, so the head
+      // moves back to CPU.
+      head_ = HEAD_AT_CPU;
+    }
+    if (b_disable_prv_2_cpu) {
+      b_disable_prv_2_cpu = false;  // the flag suppresses one conversion only
+    }
+  }
+  MKLMemHolder() :
+    head_(HEAD_AT_CPU) {
+    prv_descriptor_ = NULL;
+    b_disable_prv_2_cpu = false;
+  }
+};
+#else
 struct MKLMemHolder {
  public:
   virtual std::shared_ptr<PrvMemDescr> get_prv_descriptor() = 0;
 };
+#endif
 
 }  // namespace mxnet
 #endif  // MXNET_MKL_MEMORY_H_
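
To make the intended life cycle concrete, here is a minimal usage sketch (not part of the commit; it assumes a build with MKL_EXPERIMENTAL == 1 and some concrete PrvMemDescr implementation behind descr):

```cpp
#include <memory>
#include <mxnet/mkl_memory.h>

// Hypothetical walk-through of the two-state sync protocol above.
void lifecycle(std::shared_ptr<mxnet::PrvMemDescr> descr, float *cpu_ptr) {
  mxnet::MKLMemHolder holder;             // constructed as HEAD_AT_CPU
  holder.set_prv_descriptor(descr);       // an MKL layer wrote its output in
                                          // private layout; head moves to PRV
  if (holder.head_at_prv()) {
    void *prv = holder.prv_data();        // the next MKL layer consumes the
    (void)prv;                            // private buffer with no conversion
  }
  holder.check_and_prv_to_cpu(cpu_ptr);   // a plain CPU operator touches the
                                          // data: convert once, head is CPU again
}
```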

include/mxnet/ndarray.h (+23 -2)

@@ -18,7 +18,9 @@
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-
+#if MKL_EXPERIMENTAL == 1
+#include "./mkl_memory.h"
+#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 is required for the ndarray module"

@@ -31,7 +33,11 @@ namespace mxnet {
 class NDArray {
  public:
   /*! \brief default constructor */
-  NDArray() {}
+  NDArray() {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = MKLMemHolder::create();
+#endif
+  }
   /*!
    * \brief constructing a new dynamic NDArray
    * \param shape the shape of array

@@ -43,6 +49,9 @@
           bool delay_alloc = false, int dtype = mshadow::default_type_flag)
       : ptr_(std::make_shared<Chunk>(shape.Size(), ctx, delay_alloc, dtype)),
         shape_(shape), offset_(0), dtype_(dtype) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*!
    * \brief constructing a static NDArray that shares data with TBlob

@@ -54,6 +63,9 @@
   NDArray(const TBlob &data, int dev_id)
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_), offset_(0),
         dtype_(data.type_flag_) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = std::make_shared<MKLMemHolder>();
+#endif
   }
   /*!
    * \return the shape of current NDArray

@@ -70,6 +82,9 @@
       res = TBlob(static_cast<DType*>(ptr_->shandle.dptr)
         + offset_, shape_, ptr_->shandle.ctx.dev_mask());
     });
+#if MKL_EXPERIMENTAL == 1
+    res.Mkl_mem_ = Mkl_mem_;
+#endif
     return res;
   }
   /*!

@@ -358,6 +373,10 @@
     }
   }
 };
+
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
  /*! \brief internal data of NDArray */
  std::shared_ptr<Chunk> ptr_;
  /*! \brief shape of current NDArray */

@@ -380,6 +399,8 @@
  * due to different possible convention carried by copy function.
  */
 void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0);
+
+
 /*!
  * \brief Perform elementwise sum over each data from source, store result into out.
  * \param source the ndarray we want to sum
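
The point of these constructor changes: one MKLMemHolder follows an NDArray for its whole lifetime, and data() copies the shared_ptr into the returned TBlob, so the array and every view taken from it agree on where the freshest copy lives. A hedged sketch (the shape and Context::CPU() call are illustrative, not from the diff):

```cpp
#include <mxnet/ndarray.h>

// Illustrative only: the NDArray and the TBlob view returned by data()
// share one MKLMemHolder, so a conversion forced through the view is
// visible to every other view of the same array.
void shared_holder() {
  mxnet::NDArray arr(mshadow::Shape2(2, 3), mxnet::Context::CPU());
  mxnet::TBlob view = arr.data();  // copies Mkl_mem_, not the tensor data
#if MKL_EXPERIMENTAL == 1
  if (view.Mkl_mem_ != nullptr && view.Mkl_mem_->head_at_prv()) {
    // an earlier MKL layer left its freshest output in the private buffer
  }
#endif
}
```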

include/mxnet/tensor_blob.h (+43 -5)

@@ -16,7 +16,9 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
-
+#if MXNET_USE_MKL2017 == 1
+#include "./mkl_memory.h"
+#endif
 namespace mxnet {
 
 /*!

@@ -551,10 +553,19 @@ class TBlob {
   int dev_mask_;
   /*! \brief type flag of the tensor blob */
   int type_flag_;
+
+  /*! \brief MKL chunk buffer blob; experimental use only */
+#if MKL_EXPERIMENTAL == 1
+  std::shared_ptr<MKLMemHolder> Mkl_mem_;
+#endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL), dev_mask_(cpu::kDevMask),
-        type_flag_(mshadow::DataType<real_t>::kFlag) {}
+        type_flag_(mshadow::DataType<real_t>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
   /*!
    * \brief constructor that construct TBlob from contiguous memory
    * \param dptr the pointer to the memory

@@ -568,7 +579,12 @@
       : dptr_(dptr), shape_(shape),
         stride_(shape[shape.ndim() - 1]),
         dev_mask_(dev_mask),
-        type_flag_(mshadow::DataType<DType>::kFlag) {}
+        type_flag_(mshadow::DataType<DType>::kFlag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
+
   /*!
    * \brief constructor that construct TBlob from contiguous memory
    * \param dptr the pointer to the memory

@@ -583,7 +599,11 @@
       : dptr_(dptr), shape_(shape),
         stride_(shape[shape.ndim() - 1]),
         dev_mask_(dev_mask),
-        type_flag_(type_flag) {}
+        type_flag_(type_flag) {
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
+  }
   /*!
    * \brief constructor from tensor
    * \param src source tensor

@@ -594,6 +614,9 @@
   template<typename Device, int dim, typename DType>
   TBlob(const mshadow::Tensor<Device, dim, DType> &src) {  // NOLINT(*)
     *this = src;
+#if MKL_EXPERIMENTAL == 1
+    Mkl_mem_ = NULL;
+#endif
   }
   /*!
    * \brief assignment from tensor

@@ -628,12 +651,17 @@
   */
   template<typename Device, typename DType>
   inline mshadow::Tensor<Device, 2, DType> FlatTo2D(
-    mshadow::Stream<Device> *stream = NULL) const {
+      mshadow::Stream<Device> *stream = NULL) const {
     CHECK(Device::kDevMask == dev_mask_)
       << "TBlob.get: device type does not match specified type";
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
      << "TBlob.get_with_shape: data type does not match specified type."
      << " Expected: " << type_flag_ << " vs. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_),
                                              shape_.FlatTo2D(), stride_, stream);
   }

@@ -682,6 +710,11 @@
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
      << "TBlob.get_with_shape: data type does not match specified type."
      << " Expected: " << type_flag_ << " vs. given " << mshadow::DataType<DType>::kFlag;
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, dim, DType>(static_cast<DType*>(dptr_),
                                                shape_.get<dim>(),
                                                stride_, stream);

@@ -708,6 +741,11 @@
     CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be contiguous";
     CHECK_EQ(this->shape_.Size(), shape.Size())
      << "TBlob.get_with_shape: new and old shape do not match total elements";
+#if MKL_EXPERIMENTAL == 1
+    if (Mkl_mem_ != nullptr) {
+      Mkl_mem_->check_and_prv_to_cpu(dptr_);
+    }
+#endif
     return mshadow::Tensor<Device, dim, DType>(static_cast<DType*>(dptr_),
                                                shape,
                                                shape[dim - 1],
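
The same five-line hook recurs in FlatTo2D, get, get_with_shape, and get_reshape because those are the only gateways to a raw CPU tensor: operators stay unmodified and the conversion happens lazily on the first CPU access after an MKL layer. A sketch (illustrative, assumes a build with MKL_EXPERIMENTAL == 1):

```cpp
#include <mxnet/tensor_blob.h>

// Illustrative only: taking the first CPU-side view after an MKL layer runs
// check_and_prv_to_cpu(dptr_), which converts the private buffer back into
// dptr_ and flips the head to HEAD_AT_CPU; later views pay nothing.
float first_element(const mxnet::TBlob &blob) {
  mshadow::Tensor<mshadow::cpu, 2, float> flat =
      blob.FlatTo2D<mshadow::cpu, float>(nullptr);  // lazy sync happens here
  return flat[0][0];
}
```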

make/config.mk (+3)

@@ -62,6 +62,9 @@ USE_OPENMP = 1
 # whether to use the MKL2017 library
 USE_MKL2017 = 0
 
+# whether to use the MKL2017 experimental feature for high performance
+USE_MKL2017_EXPERIMENTAL = 0
+
 # choose the version of blas you want to use
 # can be: mkl, blas, atlas, openblas
 # by default use atlas for linux and apple for osx

prepare_mkl.sh (+3 -4)

@@ -64,11 +64,10 @@ echo $VERSION_LINE # Return Version Line
 # MKL
 DST=`dirname $0`
 OMP=0
-VERSION_MATCH=20160706
-ARCHIVE_BASENAME=mklml_lnx_2017.0.0.20160801.tgz
+VERSION_MATCH=20120601
+ARCHIVE_BASENAME=mklml_lnx_2017.0.1.20161005.tgz
 MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev`
-GITHUB_RELEASE_TAG=self_containted_MKLGOLD
-MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
+MKLURL="https://github.com/dmlc/web-data/raw/master/mxnet/mklml-release/$ARCHIVE_BASENAME"
 # there are different MKL libs to be used for GCC and for ICC
 reg='^[0-9]+$'
 VERSION_LINE=`GetVersionName $MKLROOT`
