Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-331] Single machine All Reduce Topology-aware Communication #11357

Closed
wants to merge 27 commits into from
Closed
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9678143
add multiroot all-reduce communication pattern
Jun 4, 2018
d5e51d6
fix bug with UpdateWeight
Jun 4, 2018
0708dbc
fix PCI-E links appearing in weight matrix bug
Jun 4, 2018
5590920
optimization to skip CopyFromTo in ReduceInner gains a bit of throughput
Jun 4, 2018
4f8f58b
remove unnecessary if statement
Jun 5, 2018
908534a
Add tests
Jun 15, 2018
25cbbdc
add more tests, 6 tests left to add
Jun 16, 2018
310ee4d
get rid of some dead code
Jun 16, 2018
9cce8ea
Add comments
Jun 18, 2018
4d2790d
Add randomized tests for backtrack and kernighan-lin
Jun 18, 2018
b5b42bc
Fix Postprocess
Jun 18, 2018
6327ceb
Add switch for first valid tree when num_gpus > 8, and for maximum we…
Jun 18, 2018
8694fe7
Kernighan-Lin seems to find better trees
Jun 18, 2018
c6cd67a
get rid of printfs
Jun 20, 2018
7466c4d
change defaults
Jun 21, 2018
153ec0b
Merge branch 'feature_multirootv9' of https://github.com/ctcyang/incu…
Jun 21, 2018
7c61b6c
Merge branch 'master' of https://github.com/apache/incubator-mxnet in…
Jun 21, 2018
cc935a2
inherit from CommDevice instead of Comm
Jun 22, 2018
ba60aaa
Fix lint errors
Jun 22, 2018
972e9c0
Add Python test using MXNET_KVSTORE_USETREE, fix CMake compilation pr…
Jun 27, 2018
6627dcf
fix lint errors
Jun 27, 2018
4de89a7
better header guard that works for tests
Jun 27, 2018
317c66b
get rid of unused variable warning
Jun 27, 2018
c364fd3
retrigger jenkins
Jun 28, 2018
3241d71
resolve 2 comments
Jun 29, 2018
bd926bf
address comment using Class to do test, get rid of extraneous test, u…
Jul 2, 2018
0e1a704
resolve merge conflicts
Jul 2, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
change defaults
Carl Yang committed Jun 21, 2018
commit 7466c4deb24e28f0ba1187a36cced34baf80fdce
10 changes: 5 additions & 5 deletions src/kvstore/comm_tree.h
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ class CommDeviceTree : public Comm {
public:
CommDeviceTree() {
inited_ = false;
bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 10000000);
gpuarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_GPUARRAY_BOUND", 10000000);
backtrack_ = dmlc::GetEnv("MXNET_KVSTORE_BACKTRACK", 0);
link_usage_penalty_ = dmlc::GetEnv("MXNET_KVSTORE_LINK_USAGE_PENALTY", 0.7);
stream_ = dmlc::GetEnv("MXNET_KVSTORE_STREAM", 1);
@@ -205,7 +205,7 @@ class CommDeviceTree : public Comm {
const NDArrayStorageType stype = src[0].storage_type();
// normal dense reduce
if (stype == kDefaultStorage) {
if (total_size > bigarray_bound_ && first_size >= devs_.size()) {
if (total_size > gpuarray_bound_ && first_size >= devs_.size()) {
// Find slice bounds
slice_scan[0] = 0;
int slice_size = (first_size + devs_.size()-1)/devs_.size();
@@ -356,7 +356,7 @@ class CommDeviceTree : public Comm {
} else {
int total_size = src.shape().Size();
unsigned first_size = src.shape()[0];
if (total_size > bigarray_bound_ && first_size >= devs_.size()) {
if (total_size > gpuarray_bound_ && first_size >= devs_.size()) {
std::vector<int> slice_scan(devs_.size()+1);
slice_scan[0] = 0;
int slice_size = (dst[0]->shape()[0]+devs_.size()-1)/devs_.size();
@@ -548,7 +548,7 @@ class CommDeviceTree : public Comm {
TShape shape_copy = shape;
int total_size = shape.Size();
unsigned first_size = shape[0];
if (total_size > bigarray_bound_ && first_size >= devs_.size()) {
if (total_size > gpuarray_bound_ && first_size >= devs_.size()) {
// Find slice bounds
int slice_size = (first_size+devs_.size()-1)/devs_.size();
int last_slice = first_size-(devs_.size()-1)*slice_size;
@@ -644,7 +644,7 @@ class CommDeviceTree : public Comm {
/// \brief Highest numbered device
int max_dev_;
int depth_;
int bigarray_bound_;
int gpuarray_bound_;
bool inited_;
bool stream_;
bool backtrack_;
2 changes: 1 addition & 1 deletion src/kvstore/kvstore_local.h
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ class KVStoreLocal : public KVStore {
*/
explicit KVStoreLocal(bool use_device_comm) : KVStore() {
if (use_device_comm) {
bool tree = dmlc::GetEnv("MXNET_KVSTORE_USETREE", 1);
bool tree = dmlc::GetEnv("MXNET_KVSTORE_USETREE", 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also have a Python GPU kvstore test with MXNET_KVSTORE_USETREE set?

if (tree) {
comm_ = new CommDeviceTree();
} else {