Merged
34 commits
8aa34ac
add distributed ci
sljlp Jul 6, 2022
0938aec
remote notes
sljlp Jul 6, 2022
074a033
update
sljlp Jul 6, 2022
e9a7212
update
sljlp Jul 6, 2022
b8f1420
update cmakelists
sljlp Jul 6, 2022
7b0cc34
fix for current ci
sljlp Jul 7, 2022
3e1ea1a
update for current ci
sljlp Jul 7, 2022
3c35ba5
Update dygraph_hybrid_dp.py
sljlp Jul 7, 2022
5e5ed3f
Update test_dygraph_hybrid_dp.py
sljlp Jul 7, 2022
a22e0eb
update cmake
sljlp Jul 7, 2022
ca02352
update
sljlp Jul 7, 2022
3f2c0e6
update
sljlp Jul 7, 2022
288d7e4
rm test_dygrapg_hybrid_dp
sljlp Jul 7, 2022
739e8ee
add common.py
sljlp Jul 11, 2022
36a8d3c
add dygraph_hybrid_dpppmp
sljlp Jul 12, 2022
bd56622
add hybrid test: dp+mp+pp
sljlp Jul 13, 2022
9189f67
prettify
sljlp Jul 13, 2022
e657356
recompoute test and combine hybrid tests
sljlp Jul 13, 2022
f0f6662
update date
sljlp Jul 13, 2022
6140d0d
update
sljlp Jul 13, 2022
8b196fe
rm multinode test dpppmp
sljlp Jul 13, 2022
2df91a5
rename
sljlp Jul 13, 2022
76fc368
update sharding test
sljlp Jul 14, 2022
dbef960
change 16 to 8 cards for dp test
sljlp Jul 14, 2022
77dfc8e
Update multinode_dist_test.sh
sljlp Jul 19, 2022
f9eca4b
update test base
sljlp Jul 19, 2022
a775de7
Merge branch 'dist-ci' of https://github.com/sljlp/Paddle into dist-ci
sljlp Jul 19, 2022
b635886
update
sljlp Jul 19, 2022
86fa86c
Merge branch 'develop' into test
sljlp Jul 19, 2022
d331de2
Merge commit 'refs/pull/44085/head' of https://github.com/PaddlePaddl…
sljlp Jul 19, 2022
aa2a89d
update for 2 nodes
sljlp Jul 20, 2022
e15fdea
fix mpirun
sljlp Jul 20, 2022
21a3c7e
Merge commit 'refs/pull/44085/head' of https://github.com/PaddlePaddl…
sljlp Jul 20, 2022
1914f80
update for 2 nodes
sljlp Jul 21, 2022
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -243,6 +243,7 @@ include(simd)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_MULTINODE_TESTING "Test multinode apis and ops" OFF)
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
27 changes: 27 additions & 0 deletions python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -7,6 +7,12 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1
FLAGS_memory_fraction_of_eager_deletion=1.0)
set(dist_ENVS http_proxy="" https_proxy="")

file(
GLOB MULTINODE_DIST_TEST_OPS
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"test_multinode_*.py")
string(REPLACE ".py" "" MULTINODE_DIST_TEST_OPS "${MULTINODE_DIST_TEST_OPS}")

file(
GLOB DIST_TEST_OPS
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
@@ -78,6 +84,11 @@ list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv)
list(APPEND DIST_TEST_OPS test_collective_reduce_scatter)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.

foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS})
list(APPEND MIXED_DIST_TEST_OPS ${TEST_OP})
endforeach()

list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
@@ -135,6 +146,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_cost_model)
list(APPEND MIXED_DIST_TEST_OPS test_tcp_store)
list(APPEND MIXED_DIST_TEST_OPS test_dygraph_hybrid_dp)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@@ -957,6 +969,21 @@ if(WITH_DISTRIBUTE)
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
endif()

# add new dist test
if(WITH_DISTRIBUTE AND WITH_MULTINODE_TESTING)
foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS})
bash_test_modules(
${TEST_OP}
START_BASH
multinode_dist_test.sh
LABELS
"RUN_TYPE=EXCLUSIVE"
ENVS
"PADDLE_DIST_UT_PORT=${dist_ut_port}")
endforeach()

endif()

# port range (20000, 23000) is reserved for dist-ops
foreach(TEST_OP ${DIST_TEST_OPS})
bash_test_modules(
60 changes: 60 additions & 0 deletions python/paddle/fluid/tests/unittests/common.py
@@ -0,0 +1,60 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.distributed import fleet


def init_parallel_env(mode, global_batch_size, seed=1024):
    '''
    Initialize fleet with a hybrid-parallel strategy parsed from `mode`.

    Args:
        mode (str): parallel degrees encoded as "DP{n}-MP{n}-PP{n}-SH{n}-O{n}",
            e.g. "DP2-MP2-PP2-SH1-O1".
        global_batch_size (int): global batch size, used to derive the micro
            batch size when pipeline parallelism is enabled.
        seed (int): seed for tensor-parallel parameter initialization.
    '''

    def parse_mode(mode):
        assert "DP" == mode[:2]
        assert "-MP" in mode
        assert "-PP" in mode
        assert "-SH" in mode
        assert "-O" in mode
        modes = mode.split("-")
        DP = int(modes[0][2:])
        MP = int(modes[1][2:])
        PP = int(modes[2][2:])
        SH = int(modes[3][2:])
        Ostage = int(modes[4][1:])
        return DP, MP, PP, SH, Ostage

    DP, MP, PP, SH, Ostage = parse_mode(mode)

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": DP,
        "mp_degree": MP,
        "pp_degree": PP,
        "sharding_degree": SH
    }

    accumulate_steps = 1

    if PP > 1:
        strategy.pipeline_configs = {
            "accumulate_steps": accumulate_steps,
            "micro_batch_size": global_batch_size // DP // accumulate_steps
        }

    # fix the tensor-parallel initialization seed so all ranks agree
    strategy.tensor_parallel_configs = {"tensor_init_seed": seed}
    fleet.init(is_collective=True, strategy=strategy)

    return fleet.get_hybrid_communicate_group()
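
A minimal usage sketch for the helper above (illustration only, not part of the diff): the mode string and init_parallel_env come from common.py, while the 8-rank launch command, the chosen degrees, and the group-accessor calls are assumptions. The product of the DP/MP/PP/SH degrees must equal the number of launched ranks.

# Hypothetical sketch, assuming 8 ranks launched with
# `python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 this_script.py`,
# so that 2 (DP) * 2 (MP) * 2 (PP) * 1 (SH) = 8.
from common import init_parallel_env

hcg = init_parallel_env("DP2-MP2-PP2-SH1-O1", global_batch_size=32)

dp_group = hcg.get_data_parallel_group()   # ranks holding replicas of the model
mp_group = hcg.get_model_parallel_group()  # ranks sharding tensors within a layer
print("dp rank %d of %d" % (dp_group.rank, dp_group.nranks))
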
67 changes: 67 additions & 0 deletions python/paddle/fluid/tests/unittests/dygraph_hybrid_dp.py
@@ -0,0 +1,67 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import numpy as np
import argparse
import os
import sys
import signal
import time
import socket
from contextlib import closing
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import paddle.distributed.fleet as fleet
from paddle.fluid.incubate.fleet.base import role_maker
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_multi_nodes import TestCollectiveAPIRunnerBase, runtime_main


class TestDygrapgHybridDP(TestCollectiveAPIRunnerBase):

    def __init__(self):
        pass

    def check_pass(self, *args, **kwargs):
        from common import init_parallel_env
        import paddle
        from paddle.distributed import fleet
        hcg = init_parallel_env("DP16-MP1-PP1-SH1-O1", 2)
        import numpy as np
        dp_group = hcg.get_data_parallel_group()
        np.random.seed(1024)
        data = np.random.random((10 * dp_group.nranks, 100)).reshape(
            (dp_group.nranks, -1, 100))
        data_part = paddle.to_tensor(data[dp_group.rank])
        paddle.distributed.collective.all_reduce(data_part)
        data_reduced = data_part
        data_sumed = np.sum(data, axis=0)
        assert np.allclose(data_sumed,
                           data_reduced.numpy(),
                           rtol=1e-8,
                           atol=1e-8)


if __name__ == "__main__":
    runtime_main(TestDygrapgHybridDP, "dp")
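
The check above relies on every rank seeding numpy identically: each rank rebuilds the same full array, keeps only its own slice, and the sum all_reduce over the data-parallel group must therefore equal np.sum(data, axis=0). A single-process numpy sketch of that equivalence (nranks here is a stand-in for dp_group.nranks; no distributed launch is involved):

import numpy as np

nranks = 4  # stand-in for dp_group.nranks; the test above runs with 16
np.random.seed(1024)
data = np.random.random((10 * nranks, 100)).reshape((nranks, -1, 100))

# what a sum all_reduce over the dp group leaves on every rank
reduced = sum(data[rank] for rank in range(nranks))

assert np.allclose(reduced, np.sum(data, axis=0), rtol=1e-8, atol=1e-8)
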