diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 022454afd..f145d8425 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -4,7 +4,7 @@ steps: command: bash .buildkite/scripts/benchmark_master.sh plugins: - docker#v3.8.0: - image: "baguasys/bagua:latest" + image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8" workdir: /upstream user: root propagate-environment: true @@ -20,7 +20,7 @@ steps: command: bash .buildkite/scripts/benchmark_worker.sh plugins: - docker#v3.8.0: - image: "baguasys/bagua:latest" + image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8" workdir: /upstream user: root propagate-environment: true @@ -34,7 +34,7 @@ steps: command: bash .buildkite/scripts/benchmark.sh plugins: - docker#v3.8.0: - image: "baguasys/bagua:latest" + image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8" workdir: /upstream user: root propagate-environment: true @@ -48,7 +48,7 @@ steps: command: bash .buildkite/scripts/run_pytest.sh plugins: - docker#v3.8.0: - image: "baguasys/bagua:latest" + image: "baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8" workdir: /upstream user: root propagate-environment: true diff --git a/.github/workflows/bagua-pypi-publish.yml b/.github/workflows/bagua-pypi-publish.yml index add242789..bc86f6a95 100644 --- a/.github/workflows/bagua-pypi-publish.yml +++ b/.github/workflows/bagua-pypi-publish.yml @@ -9,7 +9,7 @@ concurrency: jobs: publish: runs-on: ubuntu-latest - container: baguasys/bagua:latest + container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8 steps: - uses: actions/checkout@v2 with: @@ -38,7 +38,7 @@ jobs: check_source_install: runs-on: ubuntu-latest - container: baguasys/bagua:latest + container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8 needs: - publish steps: @@ -115,7 +115,6 @@ jobs: with: fetch-depth: 0 submodules: recursive - - name: setup python uses: actions/setup-python@v2 with: diff --git a/.github/workflows/bagua-python-package-check.yml 
b/.github/workflows/bagua-python-package-check.yml index b38746129..01d753a9d 100644 --- a/.github/workflows/bagua-python-package-check.yml +++ b/.github/workflows/bagua-python-package-check.yml @@ -12,7 +12,7 @@ on: jobs: build: runs-on: ubuntu-latest - container: baguasys/bagua:latest + container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8 steps: - uses: actions/checkout@v2 with: diff --git a/.github/workflows/dockerhub.yml b/.github/workflows/dockerhub.yml index a2d9e3340..f96077819 100644 --- a/.github/workflows/dockerhub.yml +++ b/.github/workflows/dockerhub.yml @@ -17,6 +17,9 @@ jobs: - cuda-version: "cuda11.1" cudnn-version: "cudnn8" pytorch-version: "pytorch-1.9.0" + - cuda-version: "cuda11.3" + cudnn-version: "cudnn8" + pytorch-version: "pytorch-1.10.0" name: 'Build' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/pytype.yml b/.github/workflows/pytype.yml index 816b61439..271314065 100644 --- a/.github/workflows/pytype.yml +++ b/.github/workflows/pytype.yml @@ -19,7 +19,7 @@ jobs: build: # The type of runner that the job will run on runs-on: ubuntu-latest - container: baguasys/bagua:latest + container: baguasys/bagua:master-pytorch-1.9.0-cuda11.1-cudnn8 # Steps represent a sequence of tasks that will be executed as part of the job steps: diff --git a/bagua/torch_api/algorithms/decentralized.py b/bagua/torch_api/algorithms/decentralized.py index fc2130392..317781786 100644 --- a/bagua/torch_api/algorithms/decentralized.py +++ b/bagua/torch_api/algorithms/decentralized.py @@ -87,7 +87,7 @@ def hook(): def _init_states(self, bucket: BaguaBucket): weight_tensor = bucket.flattened_tensor() - bucket._peer_weight = weight_tensor.to_bagua_tensor("peer_weight") + bucket._peer_weight = weight_tensor.ensure_bagua_tensor("peer_weight") def init_operations( self, @@ -182,11 +182,11 @@ def _init_states(self, bucket: BaguaBucket): left_peer_weight_tensor = bucket.flattened_tensor() right_peer_weight_tensor = bucket.flattened_tensor() - 
bucket._weight = weight_tensor.to_bagua_tensor("weight") - bucket._left_peer_weight = left_peer_weight_tensor.to_bagua_tensor( + bucket._weight = weight_tensor.ensure_bagua_tensor("weight") + bucket._left_peer_weight = left_peer_weight_tensor.ensure_bagua_tensor( "left_peer_weight" ) - bucket._right_peer_weight = right_peer_weight_tensor.to_bagua_tensor( + bucket._right_peer_weight = right_peer_weight_tensor.ensure_bagua_tensor( "right_peer_weight" ) diff --git a/bagua/torch_api/bucket.py b/bagua/torch_api/bucket.py index cec70b29f..c04ddb4ac 100644 --- a/bagua/torch_api/bucket.py +++ b/bagua/torch_api/bucket.py @@ -55,7 +55,7 @@ def __init__( # padding tensor must be of name bagua_padding_tensor, so that they are always marked as ready for communication in the backend self.padding_tensor = torch.zeros( padding, dtype=self.tensors[0].dtype, device=self.tensors[0].device - ).to_bagua_tensor( + ).ensure_bagua_tensor( "bagua_padding_tensor_bucket_" + name, module_name=self.bagua_module_name, ) @@ -243,7 +243,7 @@ def append_decentralized_synchronous_op( Args: peer_weight (BaguaTensor): A tensor used for averaging model with peers, should be of the same size - with the bucket tensors total size. Use ``self.flattened_tensor().to_bagua_tensor(...)`` to create such a tensor. + with the bucket tensors total size. Use ``self.flattened_tensor().ensure_bagua_tensor(...)`` to create such a tensor. hierarchical (bool): Enable hierarchical communication. Which means the GPUs on the same machine will communicate will each other first. After that, machines do inter-node communication. This can boost performance when the inter-node communication cost is high. @@ -292,12 +292,12 @@ def append_low_precision_decentralized_synchronous_op( Args: weight (BaguaTensor): Model replica of current worker's local model. It should be of the same size - with the bucket tensors total size. Use ``self.flattened_tensor().to_bagua_tensor(...)`` to create such a tensor. 
+ with the bucket tensors total size. Use ``self.flattened_tensor().ensure_bagua_tensor(...)`` to create such a tensor. left_peer_weight (BaguaTensor): Model replica of current worker's left peer. It should be of the same size - with the bucket tensors total size. Use ``self.flattened_tensor().to_bagua_tensor(...)`` to create such a tensor, + with the bucket tensors total size. Use ``self.flattened_tensor().ensure_bagua_tensor(...)`` to create such a tensor, then copy the initializing weights of current worker's left peer to the tensor. right_peer_weight (BaguaTensor): Model replica of current worker's right peer. It should be of the same size - with the bucket tensors total size. Use ``self.flattened_tensor().to_bagua_tensor(...)`` to create such a tensor. + with the bucket tensors total size. Use ``self.flattened_tensor().ensure_bagua_tensor(...)`` to create such a tensor, then copy the initializing weights of current worker's right peer to the tensor. hierarchical (bool): Enable hierarchical communication. Which means the GPUs on the same machine will communicate will each other first. After that, machines do inter-node communication. This can diff --git a/bagua/torch_api/tensor.py b/bagua/torch_api/tensor.py index be436fc81..b31d63c70 100644 --- a/bagua/torch_api/tensor.py +++ b/bagua/torch_api/tensor.py @@ -14,7 +14,7 @@ class BaguaTensor: with additional methods. A Bagua tensor is required to use Bagua's communication algorithms. Users can convert a PyTorch tensor to Bagua - tensor by :meth:`ensure_bagua_tensor` or :meth:`to_bagua_tensor`. + tensor by :meth:`ensure_bagua_tensor`. Bagua tensor features a proxy structure, where the actual tensor used by backend is accessed via a **"Proxy Tensor"**.
The proxy tensor is registered in Bagua, whenever the Bagua backend needs a tensor (for example use it for @@ -134,24 +134,6 @@ def ensure_bagua_tensor( self._bagua_bucket = None return self - def bagua_getter_closure(self) -> torch.Tensor: - """Returns the tensor that will be used in runtime.""" - return ( - self._bagua_getter_closure(self) - if self._bagua_getter_closure is not None - else self - ) - - def bagua_setter_closure(self, tensor: torch.Tensor): - """Sets the tensor that will be used in runtime to a new Pytorch tensor :attr:`tensor`. - - Args: - tensor: The new tensor to be set to. - """ - - assert self._bagua_setter_closure is not None - self._bagua_setter_closure(self, tensor) - def to_bagua_tensor( self, name: Optional[str] = None, @@ -161,9 +143,13 @@ def to_bagua_tensor( ): """ Create a new Bagua tensor from a PyTorch tensor or parameter and return it. - The original tensor is not changed. A Bagua tensor is required to use - Bagua's communication algorithms. See :meth:`ensure_bagua_tensor` for more - information. + The new Bagua tensor will share the same storage with the input PyTorch tensor. + A Bagua tensor is required to use Bagua's communication algorithms. + See :meth:`ensure_bagua_tensor` for more information. + + Caveat: Be aware that if the original tensor changes to use a different storage + using for example ``torch.Tensor.set_(...)``, the new Bagua tensor will still + use the old storage. Args: name: The unique name of the tensor. @@ -173,15 +159,32 @@ def to_bagua_tensor( getter_closure: A function that accepts a Pytorch tensor as its input and returns a Pytorch tensor as its output. See :meth:`ensure_bagua_tensor`. setter_closure: A function that accepts two Pytorch tensors as its inputs and returns nothing. See :meth:`ensure_bagua_tensor`. - Returns: The new Bagua tensor sharing the same storage with the original tensor. 
""" - new_tensor = torch.Tensor(cdata=self._cdata) + new_tensor = self.view(self.dtype) return new_tensor.ensure_bagua_tensor( name, module_name, getter_closure, setter_closure ) + def bagua_getter_closure(self) -> torch.Tensor: + """Returns the tensor that will be used in runtime.""" + return ( + self._bagua_getter_closure(self) + if self._bagua_getter_closure is not None + else self + ) + + def bagua_setter_closure(self, tensor: torch.Tensor): + """Sets the tensor that will be used in runtime to a new Pytorch tensor :attr:`tensor`. + + Args: + tensor: The new tensor to be set to. + """ + + assert self._bagua_setter_closure is not None + self._bagua_setter_closure(self, tensor) + def bagua_backend_tensor(self) -> B.BaguaTensorPy: """ Returns: diff --git a/docker/Dockerfile.pytorch-1.10.0-cuda11.3-cudnn8 b/docker/Dockerfile.pytorch-1.10.0-cuda11.3-cudnn8 new file mode 100644 index 000000000..b7eea817a --- /dev/null +++ b/docker/Dockerfile.pytorch-1.10.0-cuda11.3-cudnn8 @@ -0,0 +1,56 @@ +FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel + +RUN apt-get update && apt-get install -y curl software-properties-common wget sudo +RUN add-apt-repository ppa:git-core/ppa -y +RUN sed -i 's/mozilla\/DST_Root_CA_X3.crt/!mozilla\/DST_Root_CA_X3.crt/g' /etc/ca-certificates.conf && update-ca-certificates +RUN curl -sSf https://apt.kitware.com/kitware-archive.sh | sh +RUN apt-get update && apt-get install -y git cmake +RUN curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y +ENV PATH=/root/.cargo/bin:${PATH} +RUN cargo install mdbook mdbook-linkcheck mdbook-katex mdbook-open-on-gh + +RUN yes | python3 -m pip install -U setuptools wheel build pip + +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64/stubs/:/usr/local/lib64:/usr/local/lib" +ENV LIBRARY_PATH="/usr/local/cuda/lib64/stubs/:/usr/local/lib64:/usr/local/lib" +ENV PKG_CONFIG_PATH="/usr/local/cuda/pkgconfig/" +ENV CUDA_LIBRARY_PATH="/usr/local/cuda/lib64/" + + +# OpenMPI version 4.0.3 +RUN apt-get update -y && \ + 
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + bzip2 \ + file \ + hwloc \ + libnuma-dev \ + make \ + openssh-client \ + perl \ + tar \ + wget && \ + rm -rf /var/lib/apt/lists/* +RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.3.tar.bz2 && \ + mkdir -p /var/tmp && tar -x -f /var/tmp/openmpi-4.0.3.tar.bz2 -C /var/tmp -j && \ + cd /var/tmp/openmpi-4.0.3 && ./configure --disable-getpwuid --disable-oshmem --enable-fortran --enable-mca-no-build=btl-uct --enable-orterun-prefix-by-default --with-cuda --without-verbs && \ + make -j$(nproc) && \ + make -j$(nproc) install && \ + rm -rf /var/tmp/openmpi-4.0.3 /var/tmp/openmpi-4.0.3.tar.bz2 && cd - + +# hwloc +RUN mkdir -p /var/tmp && wget -q -nc --no-check-certificate -P /var/tmp https://download.open-mpi.org/release/hwloc/v2.5/hwloc-2.5.0.tar.bz2 && \ + mkdir -p /var/tmp && tar -x -f /var/tmp/hwloc-2.5.0.tar.bz2 -C /var/tmp -j && \ + cd /var/tmp/hwloc-2.5.0 && ./configure && \ + make -j$(nproc) && \ + make -j$(nproc) install && \ + rm -rf /var/tmp/hwloc* && cd - + +# Redis +RUN add-apt-repository ppa:redislabs/redis +RUN apt-get update && apt-get install -y redis +RUN yes | python3 -m pip install -U redis + +RUN mkdir /bagua +COPY examples/ /bagua/examples +COPY ./ /var/tmp/bagua +RUN cd /var/tmp/bagua && python3 -m pip install . 
&& cd - && rm -rf /var/tmp/bagua diff --git a/docker/Dockerfile.pytorch-1.9.0-cuda10.2-cudnn7 b/docker/Dockerfile.pytorch-1.9.0-cuda10.2-cudnn7 index 4524fd4d8..5a6442c83 100644 --- a/docker/Dockerfile.pytorch-1.9.0-cuda10.2-cudnn7 +++ b/docker/Dockerfile.pytorch-1.9.0-cuda10.2-cudnn7 @@ -1,6 +1,7 @@ FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel -RUN apt-get update && apt-get install -y curl software-properties-common wget +RUN apt-get update && apt-get install -y curl software-properties-common wget sudo +RUN add-apt-repository ppa:git-core/ppa -y RUN sed -i 's/mozilla\/DST_Root_CA_X3.crt/!mozilla\/DST_Root_CA_X3.crt/g' /etc/ca-certificates.conf && update-ca-certificates RUN curl -sSf https://apt.kitware.com/kitware-archive.sh | sh RUN apt-get update && apt-get install -y git cmake diff --git a/docker/Dockerfile.pytorch-1.9.0-cuda11.1-cudnn8 b/docker/Dockerfile.pytorch-1.9.0-cuda11.1-cudnn8 index 10018c0fc..78f12d072 100644 --- a/docker/Dockerfile.pytorch-1.9.0-cuda11.1-cudnn8 +++ b/docker/Dockerfile.pytorch-1.9.0-cuda11.1-cudnn8 @@ -1,6 +1,7 @@ FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel -RUN apt-get update && apt-get install -y curl software-properties-common wget +RUN apt-get update && apt-get install -y curl software-properties-common wget sudo +RUN add-apt-repository ppa:git-core/ppa -y RUN sed -i 's/mozilla\/DST_Root_CA_X3.crt/!mozilla\/DST_Root_CA_X3.crt/g' /etc/ca-certificates.conf && update-ca-certificates RUN curl -sSf https://apt.kitware.com/kitware-archive.sh | sh RUN apt-get update && apt-get install -y git cmake diff --git a/tests/comm/test_communicator.py b/tests/comm/test_communicator.py index 28ff1fc8d..368506924 100644 --- a/tests/comm/test_communicator.py +++ b/tests/comm/test_communicator.py @@ -56,7 +56,7 @@ def abort(): data = torch.rand(10).cuda() for _ in range(rank + 1): - comm.allreduce_inplace(data.to_bagua_tensor().bagua_backend_tensor(), 10) + comm.allreduce_inplace(data.ensure_bagua_tensor().bagua_backend_tensor(), 10) 
comm_stream.synchronize()