
Commit dc1bd6a

H-Huang authored and facebook-github-bot committed
Remove PROCESS GROUP rpc backend (pytorch#62411)
Summary: Pull Request resolved: pytorch#62411

Test Plan: Imported from OSS

Reviewed By: mrshenli

Differential Revision: D29990408

Pulled By: H-Huang

fbshipit-source-id: 183d3b316767b12993cebbe32b73c2850fd1cc42
1 parent 2ec4f69 commit dc1bd6a

12 files changed, with 23 additions and 325 deletions.


docs/source/rpc.rst

Lines changed: 0 additions & 46 deletions

@@ -191,52 +191,6 @@ Example::
     :inherited-members:


-Process Group Backend
-"""""""""""""""""""""
-
-.. warning ::
-    The Process Group Backend will be deprecated soon, we recommend using the
-    TensorPipe Backend instead.
-
-The Process Group agent instantiates a process group from
-the :mod:`~torch.distributed` module and utilizes its point-to-point
-communication capabilities to send RPC messages. Internally, the process
-group uses `the Gloo library <https://github.com/facebookincubator/gloo/>`_.
-
-Gloo has been hardened by years of extensive use in PyTorch and is thus very
-reliable. However, as it was designed to perform collective communication, it
-may not always be the best fit for RPC. For example, each networking operation
-is synchronous and blocking, which means that it cannot be run in parallel with
-others. Moreover, it opens a connection between all pairs of nodes, and brings
-down all of them when one fails, thus reducing the resiliency and the elasticity
-of the system.
-
-Example::
-
-    >>> import os
-    >>> from torch.distributed import rpc
-    >>> os.environ['MASTER_ADDR'] = 'localhost'
-    >>> os.environ['MASTER_PORT'] = '29500'
-    >>>
-    >>> rpc.init_rpc(
-    >>>     "worker1",
-    >>>     rank=0,
-    >>>     world_size=2,
-    >>>     backend=rpc.BackendType.PROCESS_GROUP,
-    >>>     rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
-    >>>         num_send_recv_threads=16,
-    >>>         rpc_timeout=20 # 20 second timeout
-    >>>     )
-    >>> )
-    >>>
-    >>> # omitting init_rpc invocation on worker2
-
-
-.. autoclass:: ProcessGroupRpcBackendOptions
-    :members:
-    :inherited-members:
-
-
 .. _rref:

 RRef
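
For migration, a minimal TensorPipe equivalent of the deleted example might look as follows. This is a sketch, not part of the commit; it assumes the documented `rpc.TensorPipeRpcBackendOptions` API and the same two-worker setup as the removed snippet:

    >>> import os
    >>> from torch.distributed import rpc
    >>> os.environ['MASTER_ADDR'] = 'localhost'
    >>> os.environ['MASTER_PORT'] = '29500'
    >>>
    >>> rpc.init_rpc(
    >>>     "worker1",
    >>>     rank=0,
    >>>     world_size=2,
    >>>     backend=rpc.BackendType.TENSORPIPE,  # the default; shown for clarity
    >>>     rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
    >>>         num_worker_threads=16,
    >>>         rpc_timeout=20  # 20 second timeout
    >>>     )
    >>> )
    >>>
    >>> # omitting init_rpc invocation on worker2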

test/distributed/test_store.py

Lines changed: 1 addition & 1 deletion

@@ -197,7 +197,7 @@ def test_init_pg_and_rpc_with_same_socket(self):
             world_size=1,
         )

-        backend_opts = rpc.ProcessGroupRpcBackendOptions(
+        backend_opts = rpc.TensorPipeRpcBackendOptions(
             init_method=f"tcp://{addr}:{port}"
         )
         rpc.init_rpc(
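
Standalone, the updated pattern in this test amounts to the following sketch (the address and port values here are hypothetical placeholders, not taken from the test):

    import torch.distributed.rpc as rpc

    addr, port = "localhost", 29500  # hypothetical values for illustration
    backend_opts = rpc.TensorPipeRpcBackendOptions(
        init_method=f"tcp://{addr}:{port}"
    )
    rpc.init_rpc(
        "worker0",
        rank=0,
        world_size=1,
        rpc_backend_options=backend_opts,
    )
    rpc.shutdown()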

torch/_C/_distributed_rpc.pyi

Lines changed: 0 additions & 31 deletions

@@ -8,7 +8,6 @@ from ._distributed_c10d import ProcessGroup, Store

 # This module is defined in torch/csrc/distributed/rpc/init.cpp

-_DEFAULT_NUM_SEND_RECV_THREADS: int
 _DEFAULT_INIT_METHOD: str
 _DEFAULT_NUM_WORKER_THREADS: int
 _UNSET_RPC_TIMEOUT: float

@@ -66,36 +65,6 @@ class PyRRef:
     def __repr__(self) -> str: ...
     ...

-class ProcessGroupRpcBackendOptions(RpcBackendOptions):
-    num_send_recv_threads: int
-    def __init__(
-        self,
-        num_send_recv_threads: int,
-        rpc_timeout: float,
-        init_method: str
-    ): ...
-
-class ProcessGroupAgent(RpcAgent):
-    def __init__(
-        self,
-        store: Store,
-        worker_name: str,
-        pg: ProcessGroup,
-        numSendRecvThreads: int,
-        rpcTimeout: timedelta
-    ): ...
-    @overload
-    def get_worker_info(self) -> WorkerInfo: ...
-    @overload
-    def get_worker_info(self, workerName: str) -> WorkerInfo: ...
-    @overload
-    def get_worker_info(self, id: int) -> WorkerInfo: ...
-    def get_worker_infos(self) -> List[WorkerInfo]: ...
-    def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ...
-    def join(self): ...
-    def shutdown(self): ...
-    def sync(self): ...
-
 class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions):
     num_worker_threads: int
     device_maps: Dict[str, Dict[torch.device, torch.device]]
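
The retained `_TensorPipeRpcBackendOptionsBase` stub keeps `num_worker_threads` and `device_maps`; as a hedged sketch, those fields are typically populated from Python via the public `set_device_map` call rather than assigned directly:

    import torch.distributed.rpc as rpc

    opts = rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
    # Map local cuda:0 to cuda:1 on "worker1" so tensors can be sent GPU-to-GPU.
    opts.set_device_map("worker1", {0: 1})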

torch/_C/_distributed_rpc_testing.pyi

Lines changed: 0 additions & 39 deletions

@@ -1,8 +1,6 @@
 import torch
 from ._distributed_c10d import ProcessGroup, Store
 from ._distributed_rpc import (
-    ProcessGroupAgent,
-    ProcessGroupRpcBackendOptions,
     _TensorPipeRpcBackendOptionsBase,
     TensorPipeAgent,
     WorkerInfo,

@@ -12,21 +10,6 @@ from datetime import timedelta

 # This module is defined in torch/csrc/distributed/rpc/testing/init.cpp

-class FaultyProcessGroupRpcBackendOptions(ProcessGroupRpcBackendOptions):
-    def __init__(
-        self,
-        num_send_recv_threads: int,
-        rpc_timeout: float,
-        init_method: str,
-        messages_to_fail: List[str],
-        messages_to_delay: Dict[str, float],
-        num_fail_sends: int,
-    ): ...
-    num_send_recv_threads: int
-    messages_to_fail: List[str]
-    messages_to_delay: Dict[str, float]
-    num_fail_sends: int
-
 class FaultyTensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
     def __init__(
         self,

@@ -42,28 +25,6 @@ class FaultyTensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
     messages_to_delay: Dict[str, float]
     num_fail_sends: int

-class FaultyProcessGroupAgent(ProcessGroupAgent):
-    def __init__(
-        self,
-        store: Store,
-        name: str,
-        process_group: ProcessGroup,
-        num_send_recv_threads: int,
-        rpc_timeout: timedelta,
-        messages_to_fail: List[str],
-        messages_to_delay: Dict[str, float],
-        num_fail_sends: int,
-    ): ...
-    def join(self): ...
-    def shutdown(self): ...
-    @overload
-    def get_worker_info(self) -> WorkerInfo: ...
-    @overload
-    def get_worker_info(self, workerName: str) -> WorkerInfo: ...
-    @overload
-    def get_worker_info(self, id: int) -> WorkerInfo: ...
-    def get_worker_infos(self) -> List[WorkerInfo]: ...
-
 class FaultyTensorPipeAgent(TensorPipeAgent):
     def __init__(
         self,

torch/csrc/distributed/rpc/init.cpp

Lines changed: 0 additions & 92 deletions

@@ -1,6 +1,5 @@
 #include <torch/csrc/python_headers.h>

-#include <torch/csrc/distributed/rpc/process_group_agent.h>
 #include <torch/csrc/distributed/rpc/profiler/remote_profiler_manager.h>
 #include <torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h>
 #include <torch/csrc/distributed/rpc/py_rref.h>

@@ -514,97 +513,6 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) {
       // not releasing GIL to avoid context switch
       .def("__repr__", &PyRRef::str);

-  shared_ptr_class_<ProcessGroupRpcBackendOptions>(
-      module,
-      "ProcessGroupRpcBackendOptions",
-      rpcBackendOptions,
-      R"(
-          The backend options class for ``ProcessGroupAgent``, which is derived
-          from ``RpcBackendOptions``.
-
-          Args:
-              num_send_recv_threads (int, optional): The number of threads in
-                  the thread-pool used by ``ProcessGroupAgent`` (default: 4).
-              rpc_timeout (float, optional): The default timeout, in seconds,
-                  for RPC requests (default: 60 seconds). If the
-                  RPC has not completed in this timeframe, an exception
-                  indicating so will be raised. Callers can override this
-                  timeout for individual RPCs in
-                  :meth:`~torch.distributed.rpc.rpc_sync` and
-                  :meth:`~torch.distributed.rpc.rpc_async` if necessary.
-              init_method (str, optional): The URL to initialize
-                  ``ProcessGroupGloo`` (default: ``env://``).
-      )")
-      .def(
-          py::init<int, float, std::string>(),
-          py::arg("num_send_recv_threads") = kDefaultNumSendRecvThreads,
-          py::arg("rpc_timeout") = kDefaultRpcTimeoutSeconds,
-          py::arg("init_method") = kDefaultInitMethod)
-      .def_readwrite(
-          "num_send_recv_threads",
-          &ProcessGroupRpcBackendOptions::numSendRecvThreads,
-          R"(
-              The number of threads in the thread-pool used by ProcessGroupAgent.
-          )");
-
-  module.attr("_DEFAULT_NUM_SEND_RECV_THREADS") =
-      py::cast(kDefaultNumSendRecvThreads);
-
-  shared_ptr_class_<ProcessGroupAgent>(module, "ProcessGroupAgent", rpcAgent)
-      .def(py::init([](const c10::intrusive_ptr<::c10d::Store>& store,
-                       std::string workerName,
-                       const c10::intrusive_ptr<::c10d::ProcessGroup>& pg,
-                       int numSendRecvThreads,
-                       std::chrono::milliseconds rpcTimeout) {
-        return std::shared_ptr<ProcessGroupAgent>(
-            new ProcessGroupAgent(
-                store,
-                std::move(workerName),
-                pg,
-                numSendRecvThreads,
-                rpcTimeout,
-                std::make_unique<RequestCallbackImpl>()),
-            impl::destroy_without_gil<ProcessGroupAgent>);
-      }))
-      .def(
-          "get_worker_info",
-          (const WorkerInfo& (ProcessGroupAgent::*)(void) const) &
-              RpcAgent::getWorkerInfo,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "get_worker_info",
-          (const WorkerInfo& (ProcessGroupAgent::*)(const std::string&) const) &
-              ProcessGroupAgent::getWorkerInfo,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "get_worker_info",
-          (const WorkerInfo& (ProcessGroupAgent::*)(worker_id_t id) const) &
-              ProcessGroupAgent::getWorkerInfo,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "get_worker_infos",
-          (std::vector<WorkerInfo>(ProcessGroupAgent::*)() const) &
-              ProcessGroupAgent::getWorkerInfos,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "_get_device_map",
-          (DeviceMap(ProcessGroupAgent::*)(const WorkerInfo& dst) const) &
-              ProcessGroupAgent::getDeviceMap,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "join",
-          &ProcessGroupAgent::join,
-          py::call_guard<py::gil_scoped_release>(),
-          py::arg("shutdown") = false)
-      .def(
-          "shutdown",
-          &ProcessGroupAgent::shutdown,
-          py::call_guard<py::gil_scoped_release>())
-      .def(
-          "sync",
-          &ProcessGroupAgent::sync,
-          py::call_guard<py::gil_scoped_release>());
-
 #ifdef USE_TENSORPIPE

 // Base class: torch.distributed.rpc.RpcBackendOptions.
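
For context, on releases that still shipped this binding (v1.9 and earlier), the removed options class was constructible from Python with the defaults the deleted docstring lists; a sketch that no longer runs on builds containing this commit:

    # PyTorch <= 1.9 only; this class no longer exists after this commit.
    import torch.distributed.rpc as rpc

    opts = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=4,   # default per the removed docstring
        rpc_timeout=60.0,          # default: 60 seconds
        init_method="env://",      # default init URL for ProcessGroupGloo
    )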

torch/csrc/distributed/rpc/message.h

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ enum RPCErrorType {
   UNKNOWN_ERROR = 0, /* Indicates that error type could not be parsed */
   TIMEOUT = 1, /* Indicates that the RPC has timed out */
   INTENTIONAL_FAILURE = 2 /* Deliberate failure, such as those injected by
-                             FaultyProcessGroupAgent for testing */
+                             FaultyAgent for testing */
 };

 // The enum values are bitwise ORed with MessageType

torch/csrc/distributed/rpc/testing/init.cpp

Lines changed: 1 addition & 2 deletions

@@ -31,8 +31,7 @@ PyObject* faulty_agent_init(PyObject* _unused, PyObject* noargs) {
       "_distributed_rpc_testing", "distributed rpc testing bindings");
   auto module = py::handle(m).cast<py::module>();

-  // Import the rpc_module so we can subclass ProcessGroupAgent and
-  // TensorPipeAgent
+  // Import the rpc_module so we can subclass TensorPipeAgent
   py::module rpc_module = py::module::import("torch.distributed.rpc");

   shared_ptr_class_<FaultyTensorPipeRpcBackendOptions>(

torch/distributed/rpc/__init__.py

Lines changed: 10 additions & 18 deletions

@@ -49,15 +49,12 @@ def is_available():
         enable_gil_profiling,
         RpcBackendOptions,
         _TensorPipeRpcBackendOptionsBase,
-        ProcessGroupRpcBackendOptions,
         RpcAgent,
         PyRRef,
-        ProcessGroupAgent,
         TensorPipeAgent,
         RemoteProfilerManager,
         WorkerInfo,
         _DEFAULT_INIT_METHOD,
-        _DEFAULT_NUM_SEND_RECV_THREADS,
         _DEFAULT_NUM_WORKER_THREADS,
         _UNSET_RPC_TIMEOUT,
         _DEFAULT_RPC_TIMEOUT_SEC,

@@ -95,10 +92,9 @@ def init_rpc(
             Name can only contain number, alphabet, underscore, colon,
             and/or dash, and must be shorter than 128 characters.
         backend (BackendType, optional): The type of RPC backend
-            implementation. Supported values include
-            ``BackendType.TENSORPIPE`` (the default) and
-            ``BackendType.PROCESS_GROUP``. See :ref:`rpc-backends` for more
-            information.
+            implementation. Supported values is
+            ``BackendType.TENSORPIPE`` (the default).
+            See :ref:`rpc-backends` for more information.
         rank (int): a globally unique id/rank of this node.
         world_size (int): The number of workers in the group.
         rpc_backend_options (RpcBackendOptions, optional): The options

@@ -126,10 +122,7 @@ def init_rpc(
            "Argument rpc_backend_options must be an instance of RpcBackendOptions"
        )

-    # To avoid breaking users that passed a ProcessGroupRpcBackendOptions
-    # without specifying the backend as PROCESS_GROUP when that was the
-    # default, we try to detect the backend from the options when only the
-    # latter is passed.
+    # Try to detect the backend from the options
     if backend is None and rpc_backend_options is not None:
         for candidate_backend in BackendType:
             if isinstance(

@@ -159,13 +152,12 @@ def init_rpc(
        backend = BackendType.TENSORPIPE  # type: ignore[attr-defined]

     if backend == BackendType.PROCESS_GROUP:  # type: ignore[attr-defined]
-        warnings.warn(
-            "RPC was initialized with the PROCESS_GROUP backend which is "
-            "deprecated and slated to be removed and superseded by the TENSORPIPE "
-            "backend. It is recommended to migrate to the TENSORPIPE backend. "
-            "PyTorch v1.9 will be the last release that carries PROCESS_GROUP "
-            "RPC backend. If you have concerns or suggestions please comment in "
-            "https://github.com/pytorch/pytorch/issues/55615"
+        raise RuntimeError(
+            "RPC was initialized with the PROCESS_GROUP backend which has "
+            "been removed and is superseded by the TENSORPIPE backend. "
+            "Please migrate to the TENSORPIPE backend. "
+            "PyTorch v1.9 was the last release that carries PROCESS_GROUP "
+            "RPC backend."
         )

     if rpc_backend_options is None:
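
The detection loop kept in this file matches a passed options instance against each backend's options class; a simplified, self-contained sketch of that idea (the `_OPTIONS_CLASS` mapping is a stand-in for the real backend registry, not the actual implementation):

    from enum import Enum

    class BackendType(Enum):  # simplified stand-in for rpc.BackendType
        TENSORPIPE = "tensorpipe"

    class TensorPipeRpcBackendOptions:  # stand-in for the real options class
        pass

    # Hypothetical mapping; the real code consults the backend registry instead.
    _OPTIONS_CLASS = {BackendType.TENSORPIPE: TensorPipeRpcBackendOptions}

    def infer_backend(rpc_backend_options):
        """Return the first backend whose options class matches, else None."""
        for candidate_backend in BackendType:
            if isinstance(rpc_backend_options, _OPTIONS_CLASS[candidate_backend]):
                return candidate_backend
        return None

    assert infer_backend(TensorPipeRpcBackendOptions()) is BackendType.TENSORPIPE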
