Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/install_azcopy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ azcopy_version=$(jq -r '.version' <<< $azcopy_metadata)
azcopy_release=$(jq -r '.release' <<< $azcopy_metadata)
azcopy_sha256=$(jq -r '.sha256' <<< $azcopy_metadata)
TARBALL="azcopy_linux_amd64_$azcopy_version.tar.gz"
AZCOPY_DOWNLOAD_URL="https://azcopyvnext.azureedge.net/releases/release-${azcopy_release}/${TARBALL}"
AZCOPY_DOWNLOAD_URL="https://azcopyvnext-awgzd8g7aagqhzhe.b02.azurefd.net/releases/release-${azcopy_release}/${TARBALL}"

${COMMON_DIR}/download_and_verify.sh ${AZCOPY_DOWNLOAD_URL} ${azcopy_sha256}
tar -xvf ${TARBALL}
Expand Down
20 changes: 15 additions & 5 deletions common/install_health_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,29 @@ aznhc_metadata=$(get_component_config "aznhc")
AZHC_VERSION=$(jq -r '.version' <<< $aznhc_metadata)

DEST_TEST_DIR=/opt/azurehpc/test
GPU_PLAT=$1

mkdir -p $DEST_TEST_DIR

pushd $DEST_TEST_DIR

git clone https://github.com/Azure/azurehpc-health-checks.git --branch v$AZHC_VERSION
if [ "${GPU_PLAT}" = "NVIDIA" ]; then
git clone https://github.com/Azure/azurehpc-health-checks.git --branch v$AZHC_VERSION

pushd azurehpc-health-checks
pushd azurehpc-health-checks

# Pull down docker container from MCR
./dockerfile/pull-image-acr.sh cuda
# Pull down docker container from MCR
./dockerfile/pull-image-acr.sh cuda
popd
else
git clone https://github.com/Azure/azurehpc-health-checks.git
pushd azurehpc-health-checks
# Build docker image for AMD while waiting to be published on MCR
./dockerfile/build_image.sh rocm

popd
fi

popd
popd

$COMMON_DIR/write_component_version.sh "AZ_HEALTH_CHECKS" ${AZHC_VERSION}
21 changes: 17 additions & 4 deletions tests/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ function test_component {
case $component in
check_impi_2021) verify_impi_2021_installation;;
check_impi_2018) verify_impi_2018_installation;;
check_gdrcopy) verify_gdrcopy_installation;;
check_cuda) verify_cuda_installation;;
check_nccl) verify_nccl_installation;;
check_rocm) verify_rocm_installation;;
check_rccl) verify_rccl_installation;;
check_gcc) verify_gcc_modulefile;;
check_aocl) verify_aocl_installation;;
check_aocc) verify_aocc_installation;;
Expand All @@ -44,7 +47,6 @@ function verify_common_components {
verify_hpcdiag_installation;
verify_ipoib_status;
verify_lustre_installation;
verify_gdrcopy_installation;
verify_pssh_installation;
verify_aznfs_installation;
}
Expand All @@ -67,10 +69,21 @@ function initiate_test_suite {
}

function set_test_matrix {
gpu_platform="NVIDIA"
if [[ "$#" -gt 0 ]]; then
GPU_PLAT=$1
if [[ ${GPU_PLAT} == "AMD" ]]; then
gpu_platform="AMD"
elif [[ ${GPU_PLAT} != "NVIDIA" ]]; then
echo "${GPU_PLAT} is not a valid GPU platform"
exit 1

fi
fi
export distro=$(. /etc/os-release;echo $ID$VERSION_ID)
test_matrix_file=$(jq -r . $HPC_ENV/test/test-matrix.json)
test_matrix_file=$(jq -r . $HPC_ENV/test/test-matrix_${gpu_platform}.json)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please delete the now-unused test-matrix.json file to avoid confusion.

export TEST_MATRIX=$(jq -r '."'"$distro"'" // empty' <<< $test_matrix_file)

if [[ -z "$TEST_MATRIX" ]]; then
echo "*****No test matrix found for distribution $distro!*****"
exit 1
Expand Down Expand Up @@ -127,7 +140,7 @@ set_component_versions
# Set current SKU
set_sku_configuration
# Set test matrix
set_test_matrix
set_test_matrix $1
# Initiate test suite
initiate_test_suite

Expand Down
33 changes: 33 additions & 0 deletions tests/test-definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,39 @@ function verify_nccl_installation {
module unload mpi/hpcx
}

function verify_rocm_installation {
    # Verify the AMD ROCm software stack and the amdgpu kernel driver.
    #
    # Globals written: amd_rocm_version, amd_driver_version
    # Depends on check_exists and check_exit_code, which are defined outside this
    # block. check_exit_code is invoked right after the command it reports on
    # (presumably it inspects $? -- confirm against its definition), so nothing
    # may run between the command under test and the check.

    # Verify if ROCM is installed
    check_exists "/opt/rocm/"

    amd_rocm_version=$(cat /opt/rocm/.info/version)
    check_exit_code "AMD ROCM version ${amd_rocm_version} found" "AMD ROCM not found"

    # Verify if AMD GPU driver exists.
    # Ask modinfo for the field directly rather than piping it through
    # grep/cut/tr: a pipeline's status is that of its last stage, so a failing
    # modinfo (module missing) would otherwise be reported as success.
    amd_driver_version=$(modinfo -F version amdgpu)
    check_exit_code "AMD GPU driver ${amd_driver_version} found" "AMD GPU driver not found"
}

function verify_rccl_installation {
    # Smoke-test RCCL by running the all_reduce_perf benchmark through mpirun
    # with the mpi/hpcx module loaded. The benchmark is launched only when
    # VMSIZE is standard_nd96isr_mi300x_v5; for any other size nothing is run
    # before the status check.
    module load mpi/hpcx

    if [[ ${VMSIZE} == "standard_nd96isr_mi300x_v5" ]]; then
        # 8 ranks on a single node, one GPU per rank (-g1), sweeping message
        # sizes from 1K to 4G with a doubling factor (-f2).
        # LD_LIBRARY_PATH is intentionally left unquoted so it expands exactly
        # as it did when written inline on the command line.
        local -a rccl_args=(
            -np 8
            --allow-run-as-root
            --map-by ppr:8:node
            -x LD_LIBRARY_PATH=/opt/rccl/lib:$LD_LIBRARY_PATH
            -x CUDA_DEVICE_ORDER=PCI_BUS_ID
            -x NCCL_SOCKET_IFNAME=eth0
            -x NCCL_DEBUG=WARN
            /opt/rccl-tests/all_reduce_perf -b1K -f2 -g1 -e 4G
        )
        mpirun "${rccl_args[@]}"
    fi
    # Must directly follow the conditional so it sees mpirun's status
    # (or 0 when the benchmark was not launched).
    check_exit_code "RCCL ${VERSION_RCCL}" "Failed to run RCCL all reduce perf"

    module unload mpi/hpcx
}

function verify_package_updates {
case ${ID} in
ubuntu) sudo apt -q --assume-no update;;
Expand Down
22 changes: 0 additions & 22 deletions tests/test-matrix.json

This file was deleted.

22 changes: 22 additions & 0 deletions tests/test-matrix_AMD.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"ubuntu22.04": {
"components": ["check_impi_2021", "check_rocm", "check_rccl", "check_aocl", "check_aocc", "check_docker"],
"services": ["check_sku_customization", "check_sunrpc_tcp_settings"]
},
"ubuntu20.04": {
"components": ["check_impi_2021", "check_aocl", "check_aocc", "check_docker"],
"services": ["check_sku_customization", "check_sunrpc_tcp_settings"]
},
"almalinux8.7": {
"components": ["check_impi_2021", "check_gcc", "check_aocl", "check_aocc", "check_docker"],
"services": ["check_sku_customization", "check_sunrpc_tcp_settings"]
},
"almalinux8.10": {
"components": ["check_impi_2021", "check_gcc", "check_aocl", "check_aocc", "check_docker"],
"services": ["check_sku_customization", "check_sunrpc_tcp_settings"]
},
"azurelinux3.0": {
"components": ["check_impi_2021", "check_gcc", "check_aocl", "check_aocc", "check_docker"],
"services": ["check_sku_customization", "check_sunrpc_tcp_settings"]
}
}
22 changes: 22 additions & 0 deletions tests/test-matrix_NVIDIA.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"ubuntu22.04": {
"components": ["check_impi_2021", "check_gdrcopy", "check_cuda", "check_nccl", "check_aocl", "check_aocc", "check_docker", "check_dcgm"],
"services": ["check_sku_customization", "check_nvidia_fabricmanager", "check_sunrpc_tcp_settings"]
},
"ubuntu20.04": {
"components": ["check_impi_2021", "check_gdrcopy", "check_cuda", "check_nccl", "check_aocl", "check_aocc", "check_docker", "check_dcgm"],
"services": ["check_sku_customization", "check_nvidia_fabricmanager", "check_sunrpc_tcp_settings"]
},
"almalinux8.7": {
"components": ["check_impi_2021", "check_gdrcopy", "check_cuda", "check_nccl", "check_gcc", "check_aocl", "check_aocc", "check_docker", "check_dcgm"],
"services": ["check_sku_customization", "check_nvidia_fabricmanager", "check_sunrpc_tcp_settings"]
},
"almalinux8.10": {
"components": ["check_impi_2021", "check_gdrcopy", "check_cuda", "check_nccl", "check_gcc", "check_aocl", "check_aocc", "check_docker", "check_dcgm"],
"services": ["check_sku_customization", "check_nvidia_fabricmanager", "check_sunrpc_tcp_settings"]
},
"azurelinux3.0": {
"components": ["check_impi_2021", "check_gdrcopy", "check_cuda", "check_nccl", "check_gcc", "check_aocl", "check_aocc", "check_docker", "check_dcgm"],
"services": ["check_sku_customization", "check_nvidia_fabricmanager", "check_sunrpc_tcp_settings"]
}
}
24 changes: 24 additions & 0 deletions ubuntu/common/install_cmake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Install a pinned CMake release into /usr/local from the upstream binary tarball.
#
# Required environment:
#   COMMON_DIR - directory holding utilities.sh, download_and_verify.sh and
#                write_component_version.sh
# The version, download URL and sha256 come from the "cmake" component config
# returned by get_component_config (provided by utilities.sh).
set -ex

source "${COMMON_DIR}/utilities.sh"

# Look up the pinned CMake release
cmake_metadata=$(get_component_config "cmake")
cmake_version=$(jq -r '.version' <<< "$cmake_metadata")
cmake_url=$(jq -r '.url' <<< "$cmake_metadata")
cmake_sha256=$(jq -r '.sha256' <<< "$cmake_metadata")

# Name of the directory the tarball unpacks to; also the tarball's base name.
CMAKE_DIR="cmake-${cmake_version}-linux-x86_64"
TARBALL="${CMAKE_DIR}.tar.gz"

# NOTE(review): assumes download_and_verify.sh leaves ${TARBALL} in the current
# directory, i.e. the URL's file name matches TARBALL -- confirm against the config.
"${COMMON_DIR}/download_and_verify.sh" "${cmake_url}" "${cmake_sha256}"
tar -xzf "${TARBALL}"

# Copy the binaries and their support files over whatever is already in /usr/local
pushd "${CMAKE_DIR}"
cp -f bin/{ccmake,cmake,cpack,ctest} /usr/local/bin
cp -rf share/cmake-* /usr/local/share/
popd
# Drop the shell's cached command paths so the new cmake is the one resolved
hash -r

"${COMMON_DIR}/write_component_version.sh" "CMAKE" "${cmake_version}"

# Remove installation files (extracted directory and tarball)
rm -rf -- "${CMAKE_DIR}"*
8 changes: 5 additions & 3 deletions ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ if [[ "$#" -gt 0 ]]; then
INPUT=$1
if [ "$INPUT" == "AMD" ]; then
GPUi="AMD"
echo "ERROR, the AMD pathway is not fully implemented yet."
exit 1
echo "Configuring VM for AMD GPUs."
elif [ "$INPUT" != "NVIDIA" ]; then
echo "Error: Invalid GPU type. Please specify 'NVIDIA' or 'AMD'."
exit 1
Expand Down Expand Up @@ -96,7 +95,7 @@ $COMMON_DIR/install_monitoring_tools.sh
$COMMON_DIR/install_amd_libs.sh

# install Azure/NHC Health Checks
$COMMON_DIR/install_health_checks.sh
$COMMON_DIR/install_health_checks.sh "$GPU"

# disable cloud-init
$UBUNTU_COMMON_DIR/disable_cloudinit.sh
Expand All @@ -111,6 +110,9 @@ $UBUNTU_COMMON_DIR/disable_predictive_interface_renaming.sh
$COMMON_DIR/setup_sku_customizations.sh

if [ "$GPU" = "AMD" ]; then
#update cmake
$UBUNTU_COMMON_DIR/install_cmake.sh

#install rocm software stack
./install_rocm.sh

Expand Down
42 changes: 26 additions & 16 deletions ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_rccl.sh
Original file line number Diff line number Diff line change
@@ -1,32 +1,41 @@
#!/bin/bash
set -ex

source ${COMMON_DIR}/utilities.sh

#install the rccl library
apt install libstdc++-12-dev
apt remove -y rccl
pushd ~
git clone https://github.com/rocm/rccl
popd
mkdir ~/rccl/build
pushd ~/rccl/build
rccl_metadata=$(get_component_config "rccl")
rccl_version=$(jq -r '.version' <<< $rccl_metadata)
rccl_url=$(jq -r '.url' <<< $rccl_metadata)
rccl_sha256=$(jq -r '.sha256' <<< $rccl_metadata)
#the content of this tar ball is rccl but its name is misleading
TARBALL="rocm-6.2.4.tar.gz"

$COMMON_DIR/download_and_verify.sh ${rccl_url} ${rccl_sha256}
tar -xzf ${TARBALL}
mkdir ./rccl-rocm-6.2.4/build
pushd ./rccl-rocm-6.2.4/build
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=/opt/rccl ..
make -j 32
make -j$(nproc)
make install
popd

pushd ~
pushd ../..
rm -rf ${TARBALL} rccl-rocm-6.2.4
$COMMON_DIR/write_component_version.sh "RCCL" ${rccl_version}

sysctl kernel.numa_balancing=0
echo "kernel.numa_balancing=0" | tee -a /etc/sysctl.conf


git clone https://github.com/ROCmSoftwarePlatform/rccl-tests
pushd ~/rccl-tests
pushd ./rccl-tests

source /opt/hpcx*/hpcx-init.sh
hpcx_load

HPCX="/opt/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2"
HPCX+="-nccl2.18-x86_64/ompi/"
#HPCX="/opt/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2"
#HPCX+="-nccl2.18-x86_64/ompi/"
RCCLLIB="/opt/rccl/lib/librccl.so"
RCCLDIR="/opt/rccl"

Expand All @@ -42,15 +51,16 @@ popd
DEST_TEST_DIR=/opt/rccl-tests
mkdir -p $DEST_TEST_DIR

cp -r ~/rccl-tests/build/* $DEST_TEST_DIR
cp -r ./rccl-tests/build/* $DEST_TEST_DIR
rm -rf rccl-tests

git clone https://github.com/ROCm/rdma-perftest
mkdir /opt/rocm-perftest
pushd ~/rdma-perftest
mkdir -p /opt/rocm-perftest
pushd ./rdma-perftest
./autogen.sh
./configure --enable-rocm --with-rocm=/opt/rocm --prefix=/opt/rocm-perftest/
make -j 32
make -j$(nproc)
make install

popd
rm -rf rdma-perftest
30 changes: 23 additions & 7 deletions ubuntu/ubuntu-22.x/ubuntu-22.04-hpc/install_rocm.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
#!/bin/bash
set -ex

source ${COMMON_DIR}/utilities.sh

#move to rocm package
./amdgpu-install -y --usecase=graphics,rocm
rocm_metadata=$(get_component_config "rocm")
rocm_version=$(jq -r '.version' <<< $rocm_metadata)
rocm_url=$(jq -r '.url' <<< $rocm_metadata)
rocm_sha256=$(jq -r '.sha256' <<< $rocm_metadata)
DEBPACKAGE=$(echo "${rocm_url}" | awk -F '/' '{print $NF}')

${COMMON_DIR}/download_and_verify.sh ${rocm_url} ${rocm_sha256}
apt install -y ./${DEBPACKAGE}
amdgpu-install -y --usecase=graphics,rocm
apt install -y rocm-bandwidth-test
rm -f ./${DEBPACKAGE}
$COMMON_DIR/write_component_version.sh "ROCM" ${rocm_version}

#Add self to render and video groups so they can access gpus.
usermod -a -G render $(logname)
Expand All @@ -30,19 +42,26 @@ mv tmplimits.conf /etc/security/limits.conf
echo blacklist amdgpu | tee -a /etc/modprobe.d/blacklist.conf
update-initramfs -c -k $(uname -r)

#1002:740c is Mi200
#1002:74b5 is Mi300x
#1002:74bd is Mi300HF
echo "Writing gpu mode probe in init.d"
cat <<'EOF' > /tmp/tempinit.sh
#!/bin/sh
at_count=0
while [ $at_count -le 90 ]
do
if [ $(lspci -d 1002:74b5 | wc -l) -eq 8 -o $(lspci -d 1002:740c | wc -l) -eq 16 ]; then
if [ $(lspci -d 1002:74b5 | wc -l) -eq 8 -o $(lspci -d 1002:74bd | wc -l) -eq 8 -o $(lspci -d 1002:740c | wc -l) -eq 16 ]; then
echo Required number of GPUs found
at_count=91
sleep 120s
echo doing Modprobe for amdgpu
sudo modprobe -r hyperv_drm
sudo modprobe amdgpu ip_block_mask=0x7f
if [ $(lspci -d 1002:740c | wc -l) -eq 16 ]; then
sudo modprobe amdgpu
else
sudo modprobe -r hyperv_drm
sudo modprobe amdgpu ip_block_mask=0x7f
fi
else
sleep 10
at_count=$(($at_count + 1))
Expand All @@ -67,6 +86,3 @@ echo -e '[Install]\n\nWantedBy=multi-user.target' \
mv rocmstartup.service /etc/systemd/system/rocmstartup.service
systemctl start rocmstartup
systemctl enable rocmstartup

apt install -y rocm-bandwidth-test

Loading