Changes from 1 commit
Commits
47 commits
d625394
Make mxnet build successful in CPU
apeforest Nov 30, 2018
02ab771
update required mxnet version
apeforest Dec 3, 2018
9abcc4e
remove outdated comment
apeforest Dec 3, 2018
cd096e4
remove commented line
apeforest Dec 3, 2018
f9b2083
Merge remote-tracking branch 'origin/mxnet_feature_fp16' into develop…
apeforest Dec 3, 2018
b0e2e58
fix test in CPU
apeforest Dec 4, 2018
dd4f9e2
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 4, 2018
b617e14
refactor
apeforest Dec 4, 2018
84ed58e
Merge branch 'mxnet_feature_fp16' into develop/mxnet
yuxihu Dec 4, 2018
2b902ae
link nccl to mpi_lib for mxnet
yuxihu Dec 4, 2018
ff57e51
Merge branch 'develop/mxnet' of https://github.com/ctcyang/horovod in…
apeforest Dec 4, 2018
6013957
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 4, 2018
bc47aa9
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 19, 2018
297e79a
make mxnet build process the same as tensorflow
apeforest Dec 19, 2018
f28ba01
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Dec 28, 2018
ab78201
compute allreduce average in C++ to avoid perf deg
apeforest Dec 28, 2018
dc62625
rename variable
apeforest Dec 28, 2018
c56322f
add mxnet mnist example
apeforest Jan 1, 2019
4eb787e
fix lint
apeforest Jan 1, 2019
3e5491a
reduce epoch and acc check
apeforest Jan 2, 2019
9589209
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 2, 2019
b42f0c5
broadcast initial parames
apeforest Jan 2, 2019
13adbb3
Update README
apeforest Jan 2, 2019
b4aa9f2
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 3, 2019
f9c9d73
remove unused handle manager
apeforest Jan 3, 2019
dc96acc
renaming variable type
apeforest Jan 3, 2019
aaf3d7f
return non empty op name
apeforest Jan 4, 2019
0797570
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 4, 2019
89ba103
scale learning rate by workers
apeforest Jan 4, 2019
60877b7
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 4, 2019
b3a24db
refactor test_mxnet to make it easier to read
apeforest Jan 5, 2019
6e4b845
fix a bug in building on GPU
apeforest Jan 5, 2019
710c703
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 5, 2019
0112e6a
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 5, 2019
4a1c010
polish imagenet example
apeforest Jan 6, 2019
61741e8
add handle_manager
apeforest Jan 6, 2019
c24d0bd
error handling in MXNet
apeforest Jan 7, 2019
effd043
Merge branch 'mxnet_feature_fp16' into develop/mxnet
apeforest Jan 7, 2019
1c9443f
add exception handling
apeforest Jan 8, 2019
9b9bab1
rename c_api_common
apeforest Jan 8, 2019
2d64e05
wrap MXNet C API with exception handling
apeforest Jan 8, 2019
1cd08be
remove unused function declaration
apeforest Jan 9, 2019
77cbb8b
fix a typo
apeforest Jan 9, 2019
4f1a626
fix a bug
apeforest Jan 9, 2019
c1c476c
fix build error
apeforest Jan 9, 2019
51f81d0
Merge branch 'mxnet_feature_fp16' into develop/mxnet
Jan 14, 2019
75c56f7
Merge remote-tracking branch 'origin/mxnet_feature_fp16' into develop…
Jan 14, 2019
fix test in CPU
apeforest committed Dec 4, 2018
commit b0e2e58ba4c7f2fde7d3ca0962f6aa924f2dbe9a
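
This commit lets the test suite run on CPU-only builds: instead of always creating a GPU context, each test asks the new `_is_test_for_gpu()` helper whether the current MXNet context is a GPU and otherwise stays on the default context. Below is a minimal standalone sketch of that pattern, assuming only the `mx` and `hvd` calls visible in the diff; everything else is illustrative.

```python
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()

# Same device-selection logic the commit adds to every test:
# use this worker's GPU when the tests run under a GPU context,
# otherwise fall back to the default (CPU) context.
if mx.current_context().device_type == 'gpu':
    dev = mx.gpu(hvd.local_rank())
else:
    dev = mx.current_context()

# Example of the call the tests exercise on the selected device.
tensor = mx.nd.ones((17, 17), ctx=dev)
summed = hvd.allreduce(tensor)
summed.wait_to_read()
```
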
185 changes: 51 additions & 134 deletions test/test_mxnet.py
@@ -30,14 +30,20 @@ class MXTests(unittest.TestCase):
Tests for ops in horovod.mxnet.
"""

def _is_test_for_gpu(self):
return mx.current_context().device_type == 'gpu'

def test_horovod_allreduce(self):
"""Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
hvd.init()
size = hvd.size()
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
for dtype, dim in itertools.product(dtypes, dims):
@@ -77,7 +83,10 @@ def test_horovod_allreduce_average(self):
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
for dtype, dim in itertools.product(dtypes, dims):
@@ -115,7 +124,10 @@ def test_horovod_allreduce_inplace(self):
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
for dtype, dim in itertools.product(dtypes, dims):
@@ -159,7 +171,10 @@ def test_horovod_allreduce_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -182,7 +197,10 @@ def test_horovod_allreduce_rank_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -208,7 +226,10 @@ def test_horovod_allreduce_type_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -233,7 +254,10 @@ def test_horovod_allreduce_cpu_gpu_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -252,128 +276,6 @@ def test_horovod_allreduce_cpu_gpu_error(self):
except Exception as e:
print(e)
Collaborator
remove print or keep it?
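
A hedged alternative to the try/except-with-print flagged above, assuming the goal is simply to assert that mismatched CPU/GPU placement raises; this is an illustrative sketch, not code from this PR, and the device-mismatch setup is assumed since the real test body is collapsed above.

```python
import unittest

import mxnet as mx
import horovod.mxnet as hvd


class AllreduceErrorSketch(unittest.TestCase):
    """Illustrative only: fail the test unless the mismatched allreduce raises."""

    def test_cpu_gpu_mismatch_raises(self):
        hvd.init()
        if hvd.size() == 1:
            self.skipTest('requires more than one worker')
        # Put even ranks on GPU and odd ranks on CPU so the collective
        # sees mismatched devices (assumed setup).
        if hvd.rank() % 2 == 0:
            dev = mx.gpu(hvd.local_rank())
        else:
            dev = mx.cpu(hvd.local_rank())
        tensor = mx.nd.ones((17, 17), ctx=dev)
        with self.assertRaises(Exception):
            output = hvd.allreduce(tensor)
            output.wait_to_read()  # force the async op to surface any error
```
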


# Currently, MXNet doesn't track gradient of hvd.allreduce op
#def test_horovod_allreduce_grad(self):

# Currently, MXNet doesn't track gradient of hvd.allreduce op
#def test_horovod_allreduce_grad_average(self):

@unittest.skip("")
def test_horovod_allgather(self):
"""Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
hvd.init()
rank = hvd.rank()
size = hvd.size()

dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
if torch.cuda.is_available():
dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
torch.cuda.DoubleTensor]
dims = [1, 2, 3]
for dtype, dim in itertools.product(dtypes, dims):
tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
tensor = tensor.type(dtype)
gathered = hvd.allgather(tensor)

assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

for i in range(size):
rank_tensor = gathered[i * 17:(i + 1) * 17]
assert list(rank_tensor.shape) == [17] * dim, \
'hvd.allgather produces incorrect gathered shape'
assert rank_tensor.data.min() == i, 'hvd.allgather produces incorrect gathered tensor'
assert rank_tensor.data.max() == i, 'hvd.allgather produces incorrect gathered tensor'

@unittest.skip("")
def test_horovod_allgather_variable_size(self):
"""Test that the allgather correctly gathers 1D, 2D, 3D tensors,
even if those tensors have different sizes along the first dim."""
hvd.init()
rank = hvd.rank()
size = hvd.size()

dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
if torch.cuda.is_available():
dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
torch.cuda.DoubleTensor]
dims = [1, 2, 3]
for dtype, dim in itertools.product(dtypes, dims):
# Support tests up to MPI Size of 35
if size > 35:
break

tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
tensor_sizes = tensor_sizes[:size]

tensor = torch.FloatTensor(
*([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
tensor = tensor.type(dtype)
gathered = hvd.allgather(tensor)

expected_size = sum(tensor_sizes)
assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)

for i in range(size):
rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
rank_tensor = gathered[sum(
tensor_sizes[:i]):sum(tensor_sizes[:i + 1])]
assert list(rank_tensor.shape) == rank_size
assert rank_tensor.data.min() == i
assert rank_tensor.data.max() == i

@unittest.skip("")
def test_horovod_allgather_error(self):
"""Test that the allgather returns an error if any dimension besides
the first is different among the tensors being gathered."""
hvd.init()
rank = hvd.rank()
size = hvd.size()

# This test does not apply if there is only one worker.
if size == 1:
return

tensor_size = [17] * 3
tensor_size[1] = 10 * (rank + 1)
tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)

try:
hvd.allgather(tensor)
assert False, 'hvd.allgather did not throw error'
except torch.FatalError:
pass

@unittest.skip("")
def test_horovod_allgather_type_error(self):
"""Test that the allgather returns an error if the types being gathered
differ among the processes"""
hvd.init()
rank = hvd.rank()
size = hvd.size()

# This test does not apply if there is only one worker.
if size == 1:
return

tensor_size = [17] * 3
if rank % 2 == 0:
tensor = torch.IntTensor(*tensor_size)
else:
tensor = torch.FloatTensor(*tensor_size)

try:
hvd.allgather(tensor)
assert False, 'hvd.allgather did not throw error'
except torch.FatalError:
pass

# MXNet doesn't track gradient of hvd.allgather
#def test_horovod_allgather_grad(self):
"""Test the correctness of the allgather gradient."""

def test_horovod_broadcast(self):
"""Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
@@ -388,7 +290,10 @@ def test_horovod_broadcast(self):
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
root_ranks = list(range(size))
@@ -432,7 +337,10 @@ def test_horovod_broadcast_inplace(self):
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
root_ranks = list(range(size))
@@ -491,7 +399,10 @@ def test_horovod_broadcast_type_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -515,7 +426,10 @@ def test_horovod_broadcast_rank_error(self):
hvd.init()
rank = hvd.rank()
size = hvd.size()
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()

# This test does not apply if there is only one worker.
if size == 1:
@@ -542,7 +456,10 @@ def test_horovod_broadcast_grad(self):
dtypes = ['int32', 'int64',
'float32', 'float64']
dims = [1, 2, 3]
dev = mx.gpu(hvd.local_rank())
if self._is_test_for_gpu():
dev = mx.gpu(hvd.local_rank())
else:
dev = mx.current_context()
count = 0
shapes = [(), (17), (17, 17), (17, 17, 17)]
root_rank = 1