Merged
34 commits
8aa34ac
add distributed ci
sljlp Jul 6, 2022
0938aec
remote notes
sljlp Jul 6, 2022
074a033
update
sljlp Jul 6, 2022
e9a7212
update
sljlp Jul 6, 2022
b8f1420
update cmakelists
sljlp Jul 6, 2022
7b0cc34
fix for current ci
sljlp Jul 7, 2022
3e1ea1a
update for current ci
sljlp Jul 7, 2022
3c35ba5
Update dygraph_hybrid_dp.py
sljlp Jul 7, 2022
5e5ed3f
Update test_dygraph_hybrid_dp.py
sljlp Jul 7, 2022
a22e0eb
update cmake
sljlp Jul 7, 2022
ca02352
update
sljlp Jul 7, 2022
3f2c0e6
update
sljlp Jul 7, 2022
288d7e4
rm test_dygrapg_hybrid_dp
sljlp Jul 7, 2022
739e8ee
add common.py
sljlp Jul 11, 2022
36a8d3c
add dygraph_hybrid_dpppmp
sljlp Jul 12, 2022
bd56622
add hybrid test: dp+mp+pp
sljlp Jul 13, 2022
9189f67
prettify
sljlp Jul 13, 2022
e657356
recompoute test and combine hybrid tests
sljlp Jul 13, 2022
f0f6662
update date
sljlp Jul 13, 2022
6140d0d
update
sljlp Jul 13, 2022
8b196fe
rm multinode test dpppmp
sljlp Jul 13, 2022
2df91a5
rename
sljlp Jul 13, 2022
76fc368
update sharding test
sljlp Jul 14, 2022
dbef960
change 16 to 8 cards for dp test
sljlp Jul 14, 2022
77dfc8e
Update multinode_dist_test.sh
sljlp Jul 19, 2022
f9eca4b
update test base
sljlp Jul 19, 2022
a775de7
Merge branch 'dist-ci' of https://github.com/sljlp/Paddle into dist-ci
sljlp Jul 19, 2022
b635886
update
sljlp Jul 19, 2022
86fa86c
Merge branch 'develop' into test
sljlp Jul 19, 2022
d331de2
Merge commit 'refs/pull/44085/head' of https://github.com/PaddlePaddl…
sljlp Jul 19, 2022
aa2a89d
update for 2 nodes
sljlp Jul 20, 2022
e15fdea
fix mpirun
sljlp Jul 20, 2022
21a3c7e
Merge commit 'refs/pull/44085/head' of https://github.com/PaddlePaddl…
sljlp Jul 20, 2022
1914f80
update for 2 nodes
sljlp Jul 21, 2022
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -243,6 +243,7 @@ include(simd)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_MULTINODE_TESTING "Test multinode apis and ops" OFF)
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
27 changes: 27 additions & 0 deletions python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -7,6 +7,12 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1
FLAGS_memory_fraction_of_eager_deletion=1.0)
set(dist_ENVS http_proxy="" https_proxy="")

file(
GLOB MULTINODE_DIST_TEST_OPS
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
"test_multinode_*.py")
string(REPLACE ".py" "" MULTINODE_DIST_TEST_OPS "${MULTINODE_DIST_TEST_OPS}")

file(
GLOB DIST_TEST_OPS
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
@@ -78,6 +84,11 @@ list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv)
list(APPEND DIST_TEST_OPS test_collective_reduce_scatter)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.

foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS})
list(APPEND MIXED_DIST_TEST_OPS ${TEST_OP})
endforeach()

list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
@@ -135,6 +146,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp)
list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_cost_model)
list(APPEND MIXED_DIST_TEST_OPS test_tcp_store)
list(APPEND MIXED_DIST_TEST_OPS test_dygraph_hybrid_dp)
foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
list(REMOVE_ITEM TEST_OPS ${TEST_OP})
endforeach()
@@ -957,6 +969,21 @@ if(WITH_DISTRIBUTE)
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
endif()

# add new dist test
if(WITH_DISTRIBUTE AND WITH_MULTINODE_TESTING)
foreach(TEST_OP ${MULTINODE_DIST_TEST_OPS})
bash_test_modules(
${TEST_OP}
START_BASH
multinode_dist_test.sh
LABELS
"RUN_TYPE=EXCLUSIVE"
ENVS
"PADDLE_DIST_UT_PORT=${dist_ut_port}")
endforeach()

endif()

# port range (20000, 23000) is reserved for dist-ops
foreach(TEST_OP ${DIST_TEST_OPS})
bash_test_modules(
60 changes: 60 additions & 0 deletions python/paddle/fluid/tests/unittests/common.py
@@ -0,0 +1,60 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.distributed import fleet


def init_parallel_env(mode, global_batch_size, seed=1024):
    '''
    Initialize fleet with a hybrid-parallel strategy parsed from `mode`.

    Args:
        mode (str): parallel degrees encoded as "DP{n}-MP{n}-PP{n}-SH{n}-O{n}",
            e.g. "DP2-MP2-PP2-SH1-O1".
        global_batch_size (int): global batch size, used to derive the micro
            batch size when pipeline parallelism is enabled.
        seed (int): seed for tensor-parallel parameter initialization.
    '''

    def parse_mode(mode):
        assert "DP" == mode[:2]
        assert "-MP" in mode
        assert "-PP" in mode
        assert "-SH" in mode
        assert "-O" in mode
        modes = mode.split("-")
        DP = int(modes[0][2:])
        MP = int(modes[1][2:])
        PP = int(modes[2][2:])
        SH = int(modes[3][2:])
        Ostage = int(modes[4][1:])
        return DP, MP, PP, SH, Ostage

    DP, MP, PP, SH, Ostage = parse_mode(mode)

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": DP,
        "mp_degree": MP,
        "pp_degree": PP,
        "sharding_degree": SH
    }

    accumulate_steps = 1

    if PP > 1:
        strategy.pipeline_configs = {
            "accumulate_steps": accumulate_steps,
            "micro_batch_size": global_batch_size // DP // accumulate_steps
        }

    # fix the tensor-parallel initialization seed so all ranks agree
    strategy.tensor_parallel_configs = {"tensor_init_seed": seed}
    fleet.init(is_collective=True, strategy=strategy)

    return fleet.get_hybrid_communicate_group()
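
A minimal usage sketch for the helper above (illustration only, not part of the diff): the mode string and init_parallel_env come from common.py, while the 8-rank launch command, the chosen degrees, and the group-accessor calls are assumptions. The product of the DP/MP/PP/SH degrees must equal the number of launched ranks.

# Hypothetical sketch, assuming 8 ranks launched with
# `python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 this_script.py`,
# so that 2 (DP) * 2 (MP) * 2 (PP) * 1 (SH) = 8.
from common import init_parallel_env

hcg = init_parallel_env("DP2-MP2-PP2-SH1-O1", global_batch_size=32)

dp_group = hcg.get_data_parallel_group()   # ranks holding replicas of the model
mp_group = hcg.get_model_parallel_group()  # ranks sharding tensors within a layer
print("dp rank %d of %d" % (dp_group.rank, dp_group.nranks))
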
67 changes: 67 additions & 0 deletions python/paddle/fluid/tests/unittests/dygraph_hybrid_dp.py
@@ -0,0 +1,67 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import numpy as np
import argparse
import os
import sys
import signal
import time
import socket
from contextlib import closing
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import paddle.distributed.fleet as fleet
from paddle.fluid.incubate.fleet.base import role_maker
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_multi_nodes import TestCollectiveAPIRunnerBase, runtime_main


class TestDygrapgHybridDP(TestCollectiveAPIRunnerBase):

    def __init__(self):
        pass

    def check_pass(self, *args, **kwargs):
        from common import init_parallel_env
        import paddle
        from paddle.distributed import fleet
        hcg = init_parallel_env("DP16-MP1-PP1-SH1-O1", 2)
        import numpy as np
        dp_group = hcg.get_data_parallel_group()
        np.random.seed(1024)
        data = np.random.random((10 * dp_group.nranks, 100)).reshape(
            (dp_group.nranks, -1, 100))
        data_part = paddle.to_tensor(data[dp_group.rank])
        paddle.distributed.collective.all_reduce(data_part)
        data_reduced = data_part
        data_sumed = np.sum(data, axis=0)
        assert np.allclose(data_sumed,
                           data_reduced.numpy(),
                           rtol=1e-8,
                           atol=1e-8)


if __name__ == "__main__":
    runtime_main(TestDygrapgHybridDP, "dp")
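
The check above relies on every rank seeding numpy identically: each rank rebuilds the same full array, keeps only its own slice, and the sum all_reduce over the data-parallel group must therefore equal np.sum(data, axis=0). A single-process numpy sketch of that equivalence (nranks here is a stand-in for dp_group.nranks; no distributed launch is involved):

import numpy as np

nranks = 4  # stand-in for dp_group.nranks; the test above runs with 16
np.random.seed(1024)
data = np.random.random((10 * nranks, 100)).reshape((nranks, -1, 100))

# what a sum all_reduce over the dp group leaves on every rank
reduced = sum(data[rank] for rank in range(nranks))

assert np.allclose(reduced, np.sum(data, axis=0), rtol=1e-8, atol=1e-8)
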