Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile-default-gpu
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ RUN if [ -n "${HOROVOD_PIP}" ]; then ldconfig /usr/local/cuda/targets/x86_64-lin
RUN python -m pip install -r /tmp/det_dockerfile_scripts/additional-requirements.txt

ARG DEEPSPEED_PIP
# ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
RUN if [ -n "$DEEPSPEED_PIP" ]; then /tmp/det_dockerfile_scripts/install_deepspeed.sh; fi

RUN rm -r /tmp/*
191 changes: 175 additions & 16 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,31 @@ SHORT_GIT_HASH := $(shell git rev-parse --short HEAD)

NGC_REGISTRY := nvcr.io/isv-ngc-partner/determined
NGC_PUBLISH := 1
# Registry settings: images are pushed to the Augment CoreWeave registry
# under the dai_environments repo. (The upstream determinedai/environments
# values were removed — keeping both pairs was a merge artifact; with
# duplicate exports GNU make silently uses the last assignment.)
export DOCKERHUB_REGISTRY := au-docker-reg.tenant-augment-eng.ord1.ingress.coreweave.cloud
export REGISTRY_REPO := dai_environments

CPU_PREFIX_38 := $(REGISTRY_REPO):py-3.8-
CPU_PREFIX_310 := $(REGISTRY_REPO):py-3.10-
CUDA_111_PREFIX := $(REGISTRY_REPO):cuda-11.1-
CUDA_112_PREFIX := $(REGISTRY_REPO):cuda-11.2-
CUDA_113_PREFIX := $(REGISTRY_REPO):cuda-11.3-
# Tag prefixes for the CUDA base/derived images; one per supported CUDA line.
# (The duplicate CUDA_118_PREFIX definition left over from the diff is removed.)
CUDA_117_PREFIX := $(REGISTRY_REPO):cuda-11.7-
CUDA_118_PREFIX := $(REGISTRY_REPO):cuda-11.8-
CUDA_121_PREFIX := $(REGISTRY_REPO):cuda-12.1-
ROCM_50_PREFIX := $(REGISTRY_REPO):rocm-5.0-

CPU_SUFFIX := -cpu
GPU_SUFFIX := -gpu
ARTIFACTS_DIR := /tmp/artifacts
PYTHON_VERSION := 3.8.12
PYTHON_VERSION_37 := 3.7.11
PYTHON_VERSION_38 := 3.8.12
PYTHON_VERSION_39 := 3.9.16
PYTHON_VERSION_310 := 3.10.12
PY_37_TAG := py-3.7-
PY_38_TAG := py-3.8-
PY_39_TAG := py-3.9-
UBUNTU_VERSION := ubuntu20.04
UBUNTU_IMAGE_TAG := ubuntu:20.04
UBUNTU_VERSION_1804 := ubuntu18.04
Expand Down Expand Up @@ -69,13 +78,27 @@ export CPU_PY_310_BASE_NAME := $(CPU_PREFIX_310)base$(CPU_SUFFIX)
export GPU_CUDA_111_BASE_NAME := $(CUDA_111_PREFIX)base$(GPU_SUFFIX)
export GPU_CUDA_112_BASE_NAME := $(CUDA_112_PREFIX)base$(GPU_SUFFIX)
export GPU_CUDA_113_BASE_NAME := $(CUDA_113_PREFIX)base$(GPU_SUFFIX)
export GPU_CUDA_117_BASE_NAME := $(CUDA_117_PREFIX)base$(GPU_SUFFIX)
export GPU_CUDA_118_BASE_NAME := $(CUDA_118_PREFIX)base$(GPU_SUFFIX)
export GPU_CUDA_121_BASE_NAME := $(CUDA_121_PREFIX)base$(GPU_SUFFIX)

# Timeout used by packer for AWS operations. Default is 120 (30 minutes) for
# waiting for AMI availability. Bump to 360 attempts = 90 minutes.
export AWS_MAX_ATTEMPTS=360

# Base images.
# Build the Python 3.7 CPU base image from Ubuntu 20.04. Tagged with both the
# short git hash and $(VERSION); consumed by build-tf1-cpu below.
# MPI_BUILD_ARG / OFI_BUILD_ARG come from elsewhere in this Makefile
# (not visible in this chunk) — presumably toggles for MPI/libfabric support.
.PHONY: build-cpu-py-37-base
build-cpu-py-37-base:
docker build -f Dockerfile-base-cpu \
--build-arg BASE_IMAGE="$(UBUNTU_IMAGE_TAG)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION_37)" \
--build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
--build-arg "$(MPI_BUILD_ARG)" \
--build-arg "$(OFI_BUILD_ARG)" \
-t $(DOCKERHUB_REGISTRY)/$(CPU_PY_37_BASE_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(CPU_PY_37_BASE_NAME)-$(VERSION) \
.

.PHONY: build-cpu-py-38-base
build-cpu-py-38-base:
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
Expand Down Expand Up @@ -144,18 +167,94 @@ build-gpu-cuda-113-base:
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_113_BASE_NAME)-$(VERSION) \
.

# From upstream, not used
# .PHONY: build-gpu-cuda-118-base
# build-gpu-cuda-118-base:
# docker build -f Dockerfile-base-gpu \
# --build-arg BASE_IMAGE="nvidia/cuda:11.8.0-cudnn8-devel-$(UBUNTU_VERSION)" \
# --build-arg PYTHON_VERSION="$(PYTHON_VERSION_310)" \
# --build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
# --build-arg WITH_AWS_TRACE="$(WITH_AWS_TRACE)" \
# --build-arg "$(MPI_BUILD_ARG)" \
# --build-arg "$(OFI_BUILD_ARG)" \
# --build-arg "$(NCCL_BUILD_ARG)" \
# -t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(SHORT_GIT_HASH) \
# -t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(VERSION) \

# Build the CUDA 11.7 / Python 3.9 GPU base image (nvidia/cuda:11.7.1-cudnn8).
# Used by build-gpt-neox-deepspeed-gpu (torch 1.13). Built to the local image
# store only (-o type=image,push=false); pushing is handled separately.
.PHONY: build-gpu-cuda-117-base
build-gpu-cuda-117-base:
docker build -f Dockerfile-base-gpu \
--build-arg BASE_IMAGE="nvidia/cuda:11.7.1-cudnn8-devel-$(UBUNTU_VERSION)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION_39)" \
--build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
--build-arg "$(MPI_BUILD_ARG)" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_117_BASE_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_117_BASE_NAME)-$(VERSION) \
-o type=image,push=false \
.

# Build the CUDA 11.8 / Python 3.9 GPU base image (nvidia/cuda:11.8.0-cudnn8).
# Used by build-gpt-neox-deepspeed-gpu-torch-201 (torch 2.0.1). Built to the
# local image store only (-o type=image,push=false).
# Fix: the recipe carried two conflicting PYTHON_VERSION build-args (the
# superseded 3.10 line next to the 3.9 replacement); only the 3.9 pin is kept,
# matching the 11.7 and 12.1 base targets.
.PHONY: build-gpu-cuda-118-base
build-gpu-cuda-118-base:
docker build -f Dockerfile-base-gpu \
--build-arg BASE_IMAGE="nvidia/cuda:11.8.0-cudnn8-devel-$(UBUNTU_VERSION)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION_39)" \
--build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
--build-arg WITH_AWS_TRACE="$(WITH_AWS_TRACE)" \
--build-arg "$(MPI_BUILD_ARG)" \
--build-arg "$(OFI_BUILD_ARG)" \
--build-arg "$(NCCL_BUILD_ARG)" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(VERSION) \
-o type=image,push=false \
.

# Build the CUDA 12.1 / Python 3.9 GPU base image (nvidia/cuda:12.1.0-cudnn8).
# Used by build-gpt-neox-deepspeed-gpu-torch-210 (torch 2.1.0). Built to the
# local image store only (-o type=image,push=false).
.PHONY: build-gpu-cuda-121-base
build-gpu-cuda-121-base:
docker build -f Dockerfile-base-gpu \
--build-arg BASE_IMAGE="nvidia/cuda:12.1.0-cudnn8-devel-$(UBUNTU_VERSION)" \
--build-arg PYTHON_VERSION="$(PYTHON_VERSION_39)" \
--build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \
--build-arg "$(MPI_BUILD_ARG)" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_121_BASE_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_CUDA_121_BASE_NAME)-$(VERSION) \
-o type=image,push=false \
.

export CPU_TF1_ENVIRONMENT_NAME := $(CPU_PREFIX_37)pytorch-1.7-tf-1.15$(CPU_SUFFIX)
export GPU_TF1_ENVIRONMENT_NAME := $(CUDA_102_PREFIX)pytorch-1.7-tf-1.15$(GPU_SUFFIX)

# Full images.
# Build the TF 1.15 / torch 1.7 CPU environment image on top of the
# Python 3.7 CPU base. Tagged into both the Dockerhub-style registry and the
# NGC registry. NOTE(review): $(CPU_TF1_ENVIRONMENT_NAME) is defined from
# $(CPU_PREFIX_37), which is not defined in this chunk — confirm it still
# exists after the prefix-variable rework above.
.PHONY: build-tf1-cpu
build-tf1-cpu: build-cpu-py-37-base
docker build -f Dockerfile-default-cpu \
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(CPU_PY_37_BASE_NAME)-$(SHORT_GIT_HASH)" \
--build-arg TENSORFLOW_PIP="tensorflow==1.15.5" \
--build-arg TORCH_PIP="torch==1.7.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html" \
--build-arg TORCHVISION_PIP="torchvision==0.8.2 -f https://download.pytorch.org/whl/cpu/torch_stable.html" \
--build-arg HOROVOD_PIP="horovod==0.24.2" \
--build-arg HOROVOD_WITH_MPI="$(HOROVOD_WITH_MPI)" \
--build-arg HOROVOD_WITHOUT_MPI="$(HOROVOD_WITHOUT_MPI)" \
--build-arg HOROVOD_CPU_OPERATIONS="$(HOROVOD_CPU_OPERATIONS)" \
-t $(DOCKERHUB_REGISTRY)/$(CPU_TF1_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(CPU_TF1_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(CPU_TF1_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(NGC_REGISTRY)/$(CPU_TF1_ENVIRONMENT_NAME)-$(VERSION) \
.

# Build the TF 1.15 / torch 1.7 GPU environment image on top of the CUDA 10.2
# base, using a custom-built TF wheel from determined-ai/tensorflow-wheels.
# NOTE(review): the prerequisite build-gpu-cuda-102-base and the
# $(CUDA_102_PREFIX) behind $(GPU_TF1_ENVIRONMENT_NAME) are not visible in
# this chunk — confirm they survived the registry/prefix rework.
.PHONY: build-tf1-gpu
build-tf1-gpu: build-gpu-cuda-102-base
docker build -f Dockerfile-default-gpu \
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_102_BASE_NAME)-$(SHORT_GIT_HASH)" \
--build-arg TENSORFLOW_PIP="https://github.com/determined-ai/tensorflow-wheels/releases/download/0.1.0/tensorflow_gpu-1.15.5-cp37-cp37m-linux_x86_64.whl" \
--build-arg TORCH_PIP="torch==1.7.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html" \
--build-arg TORCHVISION_PIP="torchvision==0.8.2 -f https://download.pytorch.org/whl/cu102/torch_stable.html" \
--build-arg TORCH_CUDA_ARCH_LIST="3.7;6.0;6.1;6.2;7.0;7.5" \
--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
--build-arg HOROVOD_PIP="horovod==0.24.2" \
--build-arg HOROVOD_WITH_MPI="$(HOROVOD_WITH_MPI)" \
--build-arg HOROVOD_WITHOUT_MPI="$(HOROVOD_WITHOUT_MPI)" \
--build-arg HOROVOD_CPU_OPERATIONS="$(HOROVOD_CPU_OPERATIONS)" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_TF1_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_TF1_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(GPU_TF1_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(NGC_REGISTRY)/$(GPU_TF1_ENVIRONMENT_NAME)-$(VERSION) \
.

export ROCM50_TORCH_TF_ENVIRONMENT_NAME := $(ROCM_50_PREFIX)pytorch-1.10-tf-2.7-rocm
Expand All @@ -171,9 +270,13 @@ build-pytorch10-tf27-rocm50:
.

DEEPSPEED_VERSION := 0.8.3
# Image names and torch wheel pins for the DeepSpeed / gpt-neox environments,
# one set per torch+CUDA combination (1.13/cu117, 2.0.1/cu118, 2.1.0/cu121).
# Fix: the stale CUDA-11.3 / torch-1.10 definitions that were immediately
# overridden by the lines below (a diff-merge artifact; make keeps only the
# last assignment) are removed.
export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_117_PREFIX)pytorch-1.13-tf-2.8-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_117_PREFIX)$(PY_39_TAG)pytorch-1.13-gpt-neox-deepspeed$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201 := $(CUDA_118_PREFIX)$(PY_39_TAG)pytorch-2.0.1-gpt-neox-deepspeed$(GPU_SUFFIX)
export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210 := $(CUDA_121_PREFIX)$(PY_39_TAG)pytorch-2.1.0-gpt-neox-deepspeed$(GPU_SUFFIX)
export TORCH_PIP_DEEPSPEED_GPU := torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117 -f https://download.pytorch.org/whl/cu117/torch_stable.html
export TORCH_PIP_DEEPSPEED_GPU_201 := torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/cu118/torch_stable.html
export TORCH_PIP_DEEPSPEED_GPU_210 := torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 -f https://download.pytorch.org/whl/cu121/torch_stable.html
export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1

# This builds deepspeed environment off of upstream microsoft/DeepSpeed.
Expand All @@ -193,22 +296,75 @@ build-deepspeed-gpu: build-gpu-cuda-113-base
-t $(NGC_REGISTRY)/$(GPU_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
.

# TODO: parameterize the torch/CUDA matrix instead of duplicating one target
# per version, and align the augment-torch-* alias names with the
# build-gpt-neox-deepspeed-gpu* targets they invoke.

# Convenience aliases used by Augment to build the gpt-neox DeepSpeed image
# for a given torch version:
#   augment-torch-113 -> torch 1.13.1 / cu117
#   augment-torch-201 -> torch 2.0.1  / cu118
#   augment-torch-210 -> torch 2.1.0  / cu121
.PHONY: augment-torch-113
augment-torch-113: build-gpt-neox-deepspeed-gpu

.PHONY: augment-torch-201
augment-torch-201: build-gpt-neox-deepspeed-gpu-torch-201

.PHONY: augment-torch-210
augment-torch-210: build-gpt-neox-deepspeed-gpu-torch-210

# This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed
# that we need for gpt-neox support.
# Build the torch 1.13 / cu117 gpt-neox DeepSpeed image on the CUDA 11.7 base,
# installing Augment's patched DeeperSpeed fork (pinned by commit).
# Fix: this recipe contained interleaved old+new diff lines — a second
# prerequisite on the CUDA 11.3 base, a second BASE_IMAGE arg, a second
# APEX_GIT arg, and the superseded NCCL_BUILD_ARG/DEEPSPEED_PIP lines. Only
# the current lines are kept, mirroring the torch-201/torch-210 targets below
# (plus the NGC tags this target still carries).
.PHONY: build-gpt-neox-deepspeed-gpu
build-gpt-neox-deepspeed-gpu: build-gpu-cuda-117-base
# We should consider building without tensorflow in the future. Going to keep tensorflow for
# now since we want to have tensorboard support. It should be possible to install tensorboard
# without tensorflow though.
docker build -f Dockerfile-default-gpu \
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_117_BASE_NAME)-$(SHORT_GIT_HASH)" \
--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU)" \
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
--build-arg APEX_GIT="https://github.com/NVIDIA/apex.git" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@ea3711b1d6b2134d8ad1be26854ff0d9f60c383f" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
-o type=image,push=false \
.

# This builds deepspeed environment off of a patched version of EleutherAI's fork of DeepSpeed
# that we need for gpt-neox support.
# torch 2.0.1 / cu118 variant, built on the CUDA 11.8 base; installs Augment's
# DeeperSpeed fork pinned to the same commit as the torch-1.13 image.
.PHONY: build-gpt-neox-deepspeed-gpu-torch-201
build-gpt-neox-deepspeed-gpu-torch-201: build-gpu-cuda-118-base
# We should consider building without tensorflow in the future. Going to keep tensorflow for
# now since we want to have tensorboard support. It should be possible to install tensorboard
# without tensorflow though.
docker build -f Dockerfile-default-gpu \
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_118_BASE_NAME)-$(SHORT_GIT_HASH)" \
--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_201)" \
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0" \
--build-arg APEX_GIT="https://github.com/NVIDIA/apex.git" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@ea3711b1d6b2134d8ad1be26854ff0d9f60c383f" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_201)-$(VERSION) \
-o type=image,push=false \
.

# torch 2.1.0 / cu121 variant of the gpt-neox DeepSpeed image, built on the
# CUDA 12.1 base; note it pins a different DeeperSpeed commit than the
# torch-1.13/2.0.1 images.
.PHONY: build-gpt-neox-deepspeed-gpu-torch-210
build-gpt-neox-deepspeed-gpu-torch-210: build-gpu-cuda-121-base
# We should consider building without tensorflow in the future. Going to keep tensorflow for
# now since we want to have tensorboard support. It should be possible to install tensorboard
# without tensorflow though.
docker build -f Dockerfile-default-gpu \
--build-arg BASE_IMAGE="$(DOCKERHUB_REGISTRY)/$(GPU_CUDA_121_BASE_NAME)-$(SHORT_GIT_HASH)" \
--build-arg TORCH_PIP="$(TORCH_PIP_DEEPSPEED_GPU_210)" \
--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6;9.0" \
--build-arg APEX_GIT="https://github.com/NVIDIA/apex.git" \
--build-arg DET_BUILD_NCCL="" \
--build-arg DEEPSPEED_PIP="git+https://github.com/augmentcode/DeeperSpeed.git@d08ec4e806ace0721026dd83067ca43ddc697e15" \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210)-$(SHORT_GIT_HASH) \
-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME_210)-$(VERSION) \
-o type=image,push=false \
.

ifeq ($(NGC_PUBLISH),)
Expand All @@ -227,6 +383,9 @@ endif

export CPU_TF28_ENVIRONMENT_NAME := $(CPU_PREFIX_38)tf-2.8$(CPU_SUFFIX)
export GPU_TF28_ENVIRONMENT_NAME := $(CUDA_112_PREFIX)tf-2.8$(GPU_SUFFIX)
TORCH_VERSION := 1.13
export CPU_TF27_ENVIRONMENT_NAME := $(CPU_PREFIX)pytorch-$(TORCH_VERSION)-tf-2.7$(CPU_SUFFIX)
export GPU_TF27_ENVIRONMENT_NAME := $(CUDA_112_PREFIX)pytorch-$(TORCH_VERSION)-tf-2.7$(GPU_SUFFIX)

.PHONY: build-tf28-cpu
build-tf28-cpu: build-cpu-py-38-base
Expand Down
8 changes: 8 additions & 0 deletions README_Augment.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## Building the determined environment for the augment environment

The following make targets exist for building the base determined container(s) used by augment:

- augment-torch-113
  torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1+cu117
- augment-torch-201
  torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118
- augment-torch-210
  torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121
33 changes: 25 additions & 8 deletions dockerfile_scripts/install_apex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,29 @@
set -e

if [ "$APEX_GIT" ]; then
    # Fix: the superseded unconditional `pip install --global-option` body
    # (the pre-change version of this script) was still present above the new
    # APEX_PATCH conditional; it is removed so apex is installed exactly once.
    if [ "$APEX_PATCH" == 1 ]; then
        # Patched install: APEX_GIT is "url@ref" — clone the URL, check out
        # the ref, apply our local patch, then install from the working tree.
        APEX_DIR=/tmp/apex/
        APEX_GIT_URL="${APEX_GIT%@*}"
        APEX_GIT_VER="${APEX_GIT#*@}"
        git clone "$APEX_GIT_URL" "$APEX_DIR"
        pushd "$APEX_DIR"
        git checkout "$APEX_GIT_VER"
        git apply /tmp/det_dockerfile_scripts/apex.patch
        popd
        # ninja speeds up the extension build; --no-build-isolation lets the
        # build see the already-installed torch.
        pip install ninja
        pip install \
            --no-cache-dir \
            --no-build-isolation \
            --config-settings "--build-option=--cpp_ext" \
            --config-settings "--build-option=--cuda_ext" \
            "$APEX_DIR"
    else
        # Unpatched install straight from the git URL, with the same
        # C++/CUDA extension build options.
        pip install ninja
        pip install \
            --no-cache-dir \
            --no-build-isolation \
            --config-settings "--build-option=--cpp_ext" \
            --config-settings "--build-option=--cuda_ext" \
            git+$APEX_GIT
    fi
fi
11 changes: 8 additions & 3 deletions dockerfile_scripts/install_deepspeed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@ set -e

DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
# Triton is needed to build deepspeed's sparse_attn operation.
# Fix: the pre-change triton/pydantic installs and the DS_BUILD_OPS=1 line
# (replaced by the commented-out pins and the selective-ops install below)
# were still present here from the diff rendering; they are removed so
# DeepSpeed is installed exactly once, without triton.
# python -m pip install triton==1.0.0
# python -m pip install pydantic==1.10.11
# We explicitly build only the async-I/O, fused Adam, and utils extensions in DeepSpeed.
# This avoids depending on `triton` here which interacts badly with PyTorch's dependency on same.
# You can see all the ops here: https://github.com/augmentcode/DeeperSpeed/tree/augment/op_builder
# along with some docs: https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops.
# Protobuf to make sure it doesn't get upgraded
DS_BUILD_AIO=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python -m pip install "protobuf==3.20.1" $DEEPSPEED_PIP --no-binary deepspeed
python -m deepspeed.env_report