diff --git a/.archive.mk b/.archive.mk index 6cc2975..49f4724 100755 --- a/.archive.mk +++ b/.archive.mk @@ -2,18 +2,20 @@ # For JSC Courses # Generates a tar from all top-level directory in this current folder, without hidden files # -Andreas Herten, 2021 April 27 +# +# Changelog: +# * Nov 2022: The archive is extracted again, then slides.pdf is removed if a patched slides-sc22.pdf is found (which includes an SC22 slide 0 title slide); and then repackaged .PHONY: all -all: tut147s1-multi-gpu.tar.gz +all: tut105-multi-gpu.tar.gz -SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut147s1-multi-gpu.tar.gz" -printf '%P\n' | sort -h) +SOURCES=$(shell gfind . -maxdepth 1 -mindepth 1 -not -path "./.*" -not -name "tut105-multi-gpu.tar.gz" -printf '%P\n' | sort -h) -tut147s1-multi-gpu.tar.gz: $(shell find . -not -name "tut147s1-multi-gpu.tar.gz") -# if ! grep -q "Please check Github"; then \ - sed -i '1 i***Please check GitHub repo for latest version of slides: https://github.com/FZJ-JSC/tutorial-multi-gpu/ ***\n' README.md; \ - fi; +tut105-multi-gpu.tar.gz: $(shell find . 
-not -name "tut105-multi-gpu.tar.gz") sed -i '1 i***Please check GitHub repo for latest version of slides: https://github.com/FZJ-JSC/tutorial-multi-gpu/ ***\n' README.md - tar czf $@ --transform 's,^,ISC22-tut147s1-Multi-GPU/,' --exclude=".*" $(SOURCES) -# if grep -q "Please check Github"; then \ - sed -i '2d' README.md; \ - fi + tar czf $@ --transform 's,^,ISC25-tut105-Multi-GPU/,' --exclude=".*" $(SOURCES) + tar xf $@ + rm $@ + find ISC25-tut105-Multi-GPU/ -not -path './.*' -iname 'slides-*.pdf' -execdir rm slides.pdf \; + tar czf $@ ISC25-tut105-Multi-GPU + rm -rf ISC25-tut105-Multi-GPU sed -i '1,2d' README.md \ No newline at end of file diff --git a/.etc/.gitignore b/.etc/.gitignore new file mode 100644 index 0000000..450689b --- /dev/null +++ b/.etc/.gitignore @@ -0,0 +1 @@ +raw/* diff --git a/.etc/.set-facl-permissions.sh b/.etc/.set-facl-permissions.sh index 1bc8de4..18dfd2e 100644 --- a/.etc/.set-facl-permissions.sh +++ b/.etc/.set-facl-permissions.sh @@ -2,9 +2,9 @@ set -x -for user in haghighimood1 kraus1 hrywniak1 oden1 garciadegonzalo1; do - setfacl -m u:$user:rwx -R $PROJECT_training2216/common/ - setfacl -m u:$user:rwx -R $PROJECT_training2216/env.sh +for user in haghighimood1 kraus1 hrywniak1 oden1 garciadegonzalo1 badwaik1 john2; do + setfacl -m u:$user:rwx -R $PROJECT_training2446/common/ + setfacl -m u:$user:rwx -R $PROJECT_training2446/env.sh done set +x \ No newline at end of file diff --git a/.etc/deploy-material.sh b/.etc/deploy-material.sh index 07e18dd..b546b40 100755 --- a/.etc/deploy-material.sh +++ b/.etc/deploy-material.sh @@ -1 +1 @@ -rsync --archive --exclude=".*" --verbose ../ judac:/p/project/training2216/common/material/ +rsync --archive --exclude="*minified.pdf" --exclude="tut*" --exclude=".*" --exclude="*-sc*.pdf" --verbose ../ judac:/p/project/training2446/common/material/ diff --git a/.etc/deploy.sh b/.etc/deploy.sh index 55eb7c6..8e615ed 100755 --- a/.etc/deploy.sh +++ b/.etc/deploy.sh @@ -1 +1 @@ -rsync --archive 
--exclude="deploy.sh" --verbose . judac:/p/project/training2216/common/environment/ +rsync --archive --exclude="deploy.sh" --exclude="raw/" --exclude="sc24-titleslides/" --verbose . judac:/p/project/training2446/common/environment/ diff --git a/.etc/instructions-header.md b/.etc/instructions-header.md index db1c36f..63d9078 100644 --- a/.etc/instructions-header.md +++ b/.etc/instructions-header.md @@ -1,7 +1,6 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 - + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 diff --git a/.etc/jsccourse-bashrc.sh b/.etc/jsccourse-bashrc.sh index 491c5ee..e05172c 100644 --- a/.etc/jsccourse-bashrc.sh +++ b/.etc/jsccourse-bashrc.sh @@ -11,10 +11,10 @@ # Andreas Herten, >2017 ################################################ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then - project="training2216" + project="training2446" export JSCCOURSE_DIR_GROUP=/p/project/$project - export JSCCOURSE_DIR_LOCAL=${JSCCOURSE_DIR_LOCAL_BASE:-$HOME}/ISC22-Multi-GPU-Tutorial + export JSCCOURSE_DIR_LOCAL=${JSCCOURSE_DIR_LOCAL_BASE:-$HOME}/SC24-Multi-GPU-Tutorial export _JSCCOURSE_ENV_SOURCED="$(date)" export C_V_D="0,1,2,3" @@ -23,12 +23,13 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then res="" currentday=$(date +%d) - if [[ "$currentday" == "29" ]]; then - res="--reservation multi-gpu-tutorial-2022-05-29" + if [[ "$currentday" == "17" ]]; then + res="--reservation sc24-multi-gpu" fi export SLURM_NTASKS=1 + export _JSCCOURSE_GPU_ARCH='80' JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="" # export SLURM_GRES=gpu:4 # SALLOC_GRES and SBATCH_GRES are 
not yet available @@ -37,18 +38,28 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then ngpus=1 export NP=2 export PSP_CUDA_ENFORCE_STAGING=1 - JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--ntasks-per-node 1" + JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--ntasks-per-node 1 --disable-dcgm" partition=${partition:-gpus} + export _JSCCOURSE_GPU_ARCH='70' ;; - juwels|juwelsbooster) + juwelsbooster) ngpus=4 export NP=4 partition=${partition:-booster} + JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--disable-dcgm" ;; jurecadc) ngpus=4 export NP=4 partition=${partition:-dc-gpu} + JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--disable-dcgm" + ;; + jedi) + ngpus=4 + export NP=4 + partition=${parittion:-all} + # JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS="--gpus-per-task=1" + export _JSCCOURSE_GPU_ARCH='90' ;; *) echo "This system is not yet tested, setting ngpus=4" @@ -56,9 +67,10 @@ if [ -z "$_JSCCOURSE_ENV_SOURCED" ]; then ;; esac - export JSC_BATCH_CONFIG="$res --partition ${partition} --cpu-bind=sockets --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" + export JSC_BATCH_CONFIG="$res --partition ${partition} --gres=gpu:$ngpus $JSC_SUBMIT_CMD_SYSTEM_SPECIFIC_OPTIONS --time 0:10:00" export JSC_ALLOC_CMD="salloc $JSC_BATCH_CONFIG" - export JSC_SUBMIT_CMD="srun $JSC_BATCH_CONFIG --pty" + # export JSC_SUBMIT_CMD="srun $JSC_BATCH_CONFIG --pty" + export JSC_SUBMIT_CMD="${JSC_ALLOC_CMD} srun --cpu-bind=sockets --pty" export _JSC_MATERIAL_SYNC="rsync --archive --update --exclude='.*' --exclude='.*/' $JSCCOURSE_DIR_GROUP/common/material/ $JSCCOURSE_DIR_LOCAL" export _JSC_MATERIAL_SYNC_FORCE="rsync --archive --exclude='.*' --exclude='.*/' $JSCCOURSE_DIR_GROUP/common/material/ $JSCCOURSE_DIR_LOCAL" @@ -104,11 +116,9 @@ if [[ $- =~ "i" ]]; then alias jsc-material-reset-10="rsync --archive --delete $JSCCOURSE_DIR_GROUP/common/material/10-* $JSCCOURSE_DIR_LOCAL" alias jsc-material-reset-11="rsync --archive --delete $JSCCOURSE_DIR_GROUP/common/material/11-* $JSCCOURSE_DIR_LOCAL" - export 
MPI_HOME=$EBROOTPSMPI - echo "" echo "*******************************************************************************" - echo " Welcome to the ISC22 Tutorial on Multi-GPU Computing for Exascale! " + echo " Welcome to the ISC25 Tutorial on Multi-GPU Computing for Exascale! " # echo " A default call to get a batch system allocation is stored in \$JSC_ALLOC_CMD!" # echo " Use it with \`eval \$JSC_ALLOC_CMD\`. The value of \$JSC_ALLOC_CMD is:" # echo -n " " diff --git a/.etc/modules.sh b/.etc/modules.sh index b945085..c831c94 100644 --- a/.etc/modules.sh +++ b/.etc/modules.sh @@ -1,10 +1,16 @@ -module use $OTHERSTAGES module purge -module load Stages/2022 -module load GCC/11.2.0 -module load CUDA/11.5 -module load ParaStationMPI/5.5.0-1 -module load NVSHMEM/2.5.0 -module load NCCL/2.12.7-1-CUDA-11.5 -module load Nsight-Systems/2022.2.1 -# module use $JSCCOURSE_DIR_GROUP/common/modulefiles \ No newline at end of file +module use $OTHERSTAGES +#module use /p/project/training2446/easybuild/juwelsbooster/modules/all/MPI/GCC/12.3.0/psmpi/5/ +module load Stages/2024 +module load GCC/12.3.0 +module load CUDA/12 +module load OpenMPI/4.1.6 +export MPI_HOME=$EBROOTOPENMPI +#export MPI_HOME=$EBROOTPSMPI +module load NCCL/default-CUDA-12 +module load NVSHMEM/2.10.1-CUDA-12 +module load Nsight-Systems/2024.4.1 +module load MPI-settings/CUDA +export USERINSTALLATIONS=${JSCCOURSE_DIR_GROUP} +module update # will also load different NCCL +module load NVSHMEM/3.1.7-CUDA-12 \ No newline at end of file diff --git a/.etc/printout.tex b/.etc/printout.tex new file mode 100644 index 0000000..16df284 --- /dev/null +++ b/.etc/printout.tex @@ -0,0 +1,63 @@ +\documentclass{article} +\usepackage[a4paper,margin=1cm,landscape]{geometry} + +\usepackage{fontspec} +\setmainfont{PT Sans} +\newfontfamily{\ptsansnarrow}{PT Sans Narrow} +\newfontfamily{\ptsanscaption}{PT Sans Caption} + +\usepackage{tikz} +\usetikzlibrary{positioning} +\usetikzlibrary{fit} +\usepackage{xcolor} +\usepackage{hyperref} 
+\usepackage{relsize} +\renewcommand\RSlargest{50pt} + +\usepackage{lipsum} +\usepackage{verbatim} +\pagenumbering{gobble} + +\definecolor{fzjblue}{RGB}{2,61,107} % first color +\definecolor{fzjlightblue}{RGB}{173,189,227} % second color +\definecolor{fzjgray}{RGB}{235,235,235} % third color +\definecolor{fzjred}{RGB}{235, 95, 115} % FZJ raspberry red +\definecolor{fzjgreen}{RGB}{185, 210, 95} % FZJ grass green +\definecolor{fzjyellow}{RGB}{250, 235, 90} % FZJ citrus yellow +\definecolor{fzjviolet}{RGB}{175, 130, 185} % FZJ Hyacinth violet +\definecolor{fzjorange}{RGB}{250, 180, 90} % FZJ apricot orange + +\newcommand{\highlight}[1]{\textcolor{fzjblue}{#1}} + +\begin{document} +\relsize{4} +\setlength{\parindent}{0pt} +% \centering + % {\larger[3]\tikz{\node at (0, 0) [fill=black, text=white, text depth=0.25ex, font=\ptsansnarrow] {\textbf{Onboarding}};} + + % \vspace*{0.3\baselineskip} + % \emph{\textbf{Efficient Distributed GPU Programming for Exascale}}} + + \begin{tikzpicture}[every node/.style={outer sep=0pt}] + \node (onboarding) [fill=fzjblue, text=white, text depth=0.5ex, font=\ptsansnarrow\relsize{4}, yscale=2] {\textbf{Onboarding}}; + \renewcommand{\baselinestretch}{0.9} + \node (title) [right=of onboarding, font=\relsize{4}\bfseries, text width=0.66\pagewidth, color=fzjblue] {\emph{Efficient Distributed GPU\\Programming for Exascale}}; + \node [draw=fzjblue, fit=(onboarding) (title), inner sep=0pt, outer sep=0pt, thick] {}; + \end{tikzpicture} + + + \vspace*{1\baselineskip} + The tutorial uses HPC resources at \textcolor{fzjred}{Jülich Supercomputing Centre}. The following steps need to be done to access the JEDI machine. Feel free to start them as soon as you sit down, ask the Tutors in the room for help.\\ + Tutorial repository (incl. 
slides): \href{https://go.fzj.de/mg-gh}{\texttt{go.fzj.de/mg-gh}} + + \vspace*{1\baselineskip} + \begin{enumerate} + \item Create \highlight{\emph{JuDoor}} account, \highlight{join} \texttt{training2446} project: \href{https://go.fzj.de/mg-jd}{\texttt{go.fzj.de/mg-jd}} + \item Fill \highlight{usage agreement}; wait at least 15 min for synchronization + \item Login to \highlight{Jupyter}: \href{https://jupyter.jsc.fz-juelich.de}{\texttt{jupyter.jsc.fz-juelich.de}} (JuDoor credentials like \texttt{usr1}) + \item Create new Jupyter \highlight{instance} on JEDI using \texttt{LoginNode} with training project + \item Start Jupyter Terminal, \highlight{source} course environment\\ + \verb|source $PROJECT_training2446/env.sh| + \item \highlight{Sync} course material: \verb|jsc-material-sync| + \end{enumerate} +\end{document} \ No newline at end of file diff --git a/.etc/sc23-titleslides/.gitignore b/.etc/sc23-titleslides/.gitignore new file mode 100644 index 0000000..cb67314 --- /dev/null +++ b/.etc/sc23-titleslides/.gitignore @@ -0,0 +1,49 @@ +*.sublime-workspace + +# automatically generated LaTeX stuff +*.log +*.aux +*.toc +*.out +*.fdb_* +*.gz +*.vrb +*.nav +*.snm +*.fls + +# automatically generated bibtex stuff +*.bbl +*.bcf +*.blg + +# automatically generated glossary stuff +*.acn +*.acr +*.alg +*.glg +*.glo +*.gls +*.ist +*.glsdefs +*.run.xml +*.xdy +*.xdv + +# feynmf auto files +*.600pk +*.mf +*.tfm +*.mp +*.t1 +*.t2 +*.t3 +*.1 +*.2 +*.3 + +# minted/tcolorbox/fancyvrb +_minted-*/* +*.listing + +title-slide.*.* \ No newline at end of file diff --git a/.etc/sc23-titleslides/README.md b/.etc/sc23-titleslides/README.md new file mode 100644 index 0000000..768e26f --- /dev/null +++ b/.etc/sc23-titleslides/README.md @@ -0,0 +1,15 @@ +# Title Slide Injector + +For the program package, the slides should all have a common title slide (_slide 0_). 
+ +It feels wrong to commit slidedecks with this slide 0 also to Github, so we add them in post-processing before sending slides to the conference. + +The `prelude_slides.mk` Makefile takes care of the following steps: + +* Make a template out of the LaTeX `title-slide.tex` which is called `title-slide.in.tex` by using `sed` +* Use the template to create TeX files (`title-slide.01.tex`, etc) for each of the title slides, by using `gen-titleslide.py` with parameters from `session.yml` (and `yq`) +* Typeset the created TeX files to PDFs in a temporary directory and copy them back to here +* Use `mutool` to create custom versions of the original slide decks which have a `-sc22.pdf` suffix + + +The `prelude_slides.mk` Makefile is pretty modular and can easily extended; remember also extend the `sessions.yml`. \ No newline at end of file diff --git a/.etc/sc23-titleslides/gen-titleslide.py b/.etc/sc23-titleslides/gen-titleslide.py new file mode 100644 index 0000000..91aa011 --- /dev/null +++ b/.etc/sc23-titleslides/gen-titleslide.py @@ -0,0 +1,35 @@ +# import jinja2 +import argparse +from jinja2 import Environment, FileSystemLoader + +def main(args): + environment_tex = Environment( + block_start_string='((*', + block_end_string='*))', + variable_start_string='(((', + variable_end_string=')))', + comment_start_string='((=', + comment_end_string='=))', + # line_statement_prefix='%%', + # line_comment_prefix='%#', + trim_blocks=True, + autoescape=False, + loader=FileSystemLoader(".") + ) + template = environment_tex.get_template(args.template) + rendered = template.render(title=args.title, author=args.author) + # print(rendered) + with open(args.out, mode='w', encoding='utf-8') as output_file: + output_file.write(rendered) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate LaTeX documents for title slides.') + parser.add_argument('--title', '-t', help='Title of presentation (first line)', required=True) + 
parser.add_argument('--author', '-a', help='Author of presentation (second line)', required=True) + parser.add_argument('--out', '-o', help='Output file', required=True) + parser.add_argument('--template', help='LaTeX template to use', default="title-slide.in.tex") + + args = parser.parse_args() + print(args) + main(args) \ No newline at end of file diff --git a/.etc/sc23-titleslides/prelude_slides.mk b/.etc/sc23-titleslides/prelude_slides.mk new file mode 100755 index 0000000..2d0d3a6 --- /dev/null +++ b/.etc/sc23-titleslides/prelude_slides.mk @@ -0,0 +1,44 @@ +#!/usr/bin/make -f +# LAUNCH FROM THIS FOLDER + +# OUTPUT=../../01-L_Introduction_Overview/slides-sc23.pdf ../../01b-H_Onboarding/slides-sc23.pdf ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf ../../11-L_Summary_Advanced/slides-sc23.pdf +OUTPUT=../../01-L_Introduction_Overview/slides-sc23.pdf ../../01b-H_Onboarding/slides-sc23.pdf ../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf ../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf ../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf ../../11-L_Summary_Advanced/slides-sc23.pdf + +.PHONY: all +all: $(OUTPUT) +MYTMPDIR:=$(shell mktemp -d) + +title-slide.in.tex: title-slide.tex + cat $< | \ + sed 's#INSERT TITLE HERE#((( title )))#' | \ + sed 's#Insert Author Here#((( author )))#' > \ + $@ + +title-slide.01.tex ../../01-L_Introduction_Overview/slides-sc23.pdf: SESSIONKEY=01 +title-slide.01b.tex ../../01b-H_Onboarding/slides-sc23.pdf: SESSIONKEY=01b +title-slide.02.tex ../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf: SESSIONKEY=02 +title-slide.04.tex ../../04-L_Performance_and_debugging_tools/slides-sc23.pdf: SESSIONKEY=04 +title-slide.05.tex 
../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf: SESSIONKEY=05 +title-slide.07.tex ../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf: SESSIONKEY=07 +title-slide.09.tex ../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf: SESSIONKEY=09 +title-slide.11.tex ../../11-L_Summary_Advanced/slides-sc23.pdf: SESSIONKEY=11 +title-slide.01.tex title-slide.01b.tex title-slide.02.tex title-slide.04.tex title-slide.05.tex title-slide.07.tex title-slide.09.tex title-slide.11.tex: title-slide.tex + python3 gen-titleslide.py --author "$(shell cat sessions.yml | yq .$(SESSIONKEY).author)" --title "$(shell cat sessions.yml | yq .$(SESSIONKEY).title)" --out "$@" + +../../01-L_Introduction_Overview/slides-sc23.pdf: BASEDECK=../../01-L_Introduction_Overview/slides.pdf +../../01b-H_Onboarding/slides-sc23.pdf: BASEDECK=../../01b-H_Onboarding/slides.pdf +../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides-sc23.pdf: BASEDECK=../../02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf +../../04-L_Performance_and_debugging_tools/slides-sc23.pdf: BASEDECK=../../04-L_Performance_and_debugging_tools/slides.pdf +../../05-L_Optimization_techniques_for_multi-GPU_applications/slides-sc23.pdf: BASEDECK=../../05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf +../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides-sc23.pdf: BASEDECK=../../07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf +../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides-sc23.pdf: BASEDECK=../../09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf +../../11-L_Summary_Advanced/slides-sc23.pdf: BASEDECK=../../11-L_Summary_Advanced/slides.pdf + +.SECONDEXPANSION: +%-sc23.pdf: %.pdf title-slide.$$(SESSIONKEY).tex $(BASEDECK) + latexmk -output-directory=$(MYTMPDIR) -jobname=${SESSIONKEY} -pdfxe title-slide.$(SESSIONKEY).tex + cp 
$(MYTMPDIR)/${SESSIONKEY}.pdf title-slide.$(SESSIONKEY).pdf + papersize=$(shell pdfinfo title-slide.$(SESSIONKEY).pdf | awk '/^Page size:/ {printf "{%fbp,%fbp}", $$3, $$5}') && \ + pdfjam --outfile tmp.pdf --papersize "$$papersize" $(BASEDECK) + mutool merge -o $@ title-slide.$(SESSIONKEY).pdf 0 tmp.pdf + rm tmp.pdf \ No newline at end of file diff --git a/.etc/sc23-titleslides/sc-background.png b/.etc/sc23-titleslides/sc-background.png new file mode 100644 index 0000000..d5c8d61 Binary files /dev/null and b/.etc/sc23-titleslides/sc-background.png differ diff --git a/.etc/sc23-titleslides/sessions.yml b/.etc/sc23-titleslides/sessions.yml new file mode 100644 index 0000000..e36192e --- /dev/null +++ b/.etc/sc23-titleslides/sessions.yml @@ -0,0 +1,24 @@ +01: + title: 'Distributed GPU Programming for Exascale' + author: 'Andreas Herten, JSC' +01b: + title: 'Onboarding JUWELS Booster' + author: 'Andreas Herten, JSC' +02: + title: 'Introduction to MPI-Distributed Computing with GPUs' + author: 'Simon Garcia, SNL' +04: + title: 'Peformance and Debugging Tools' + author: 'Markus Hrywniak, NVIDIA' +05: + title: 'Optimization Techniques for Multi-GPU Applications' + author: 'Simon Garcia, SNL' +07: + title: 'NCCL and Host-Initiated NVSHMEM' + author: 'Jiri Kraus, NVIDIA' +09: + title: 'CUDA Graphs and Device-initiated Communication with NVSHMEM' + author: 'Jiri Kraus, NVIDIA' +11: + title: 'Summary and Advanced Topics' + author: 'Andreas Herten, JSC' \ No newline at end of file diff --git a/.etc/sc23-titleslides/title-slide.pdf b/.etc/sc23-titleslides/title-slide.pdf new file mode 100644 index 0000000..abc39f9 Binary files /dev/null and b/.etc/sc23-titleslides/title-slide.pdf differ diff --git a/.etc/sc23-titleslides/title-slide.tex b/.etc/sc23-titleslides/title-slide.tex new file mode 100644 index 0000000..23bb847 --- /dev/null +++ b/.etc/sc23-titleslides/title-slide.tex @@ -0,0 +1,23 @@ +\documentclass[aspectratio=169]{beamer} +\usepackage{amsmath} +\usepackage{mathspec} 
+\usepackage{tikz} +\setmainfont{Source Sans Pro}[BoldFont={* Semibold}, BoldItalicFont={* Semibold Italic}] +\setsansfont{Source Sans Pro}[BoldFont={* Semibold}, BoldItalicFont={* Semibold Italic}] + +\setbeamertemplate{navigation symbols}{} +\setbeamertemplate{background}{\includegraphics[width=\paperwidth,height=\paperheight]{sc-background.png}} + +\setbeamercolor{normal text}{fg=black} +\begin{document} + +\newcommand{\mytitle}{INSERT TITLE HERE} +\newcommand{\myauthor}{Insert Author Here} +\begin{frame}[plain, t] + \begin{tikzpicture}[overlay, remember picture, align=left, text width=0.85\paperwidth] + \coordinate (main anchor) at ([yshift=0.4\paperheight,xshift=0.06\paperwidth]current page.south west); + \node (title) at (main anchor) [anchor=south west, font=\LARGE\bfseries] {\mytitle}; + \node (subtitle) at ([yshift=-0.05\paperheight]main anchor) [anchor=north west, font=\large, text=white] {\myauthor}; + \end{tikzpicture} +\end{frame} +\end{document} diff --git a/.gitignore b/.gitignore index f3057c8..fc34298 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -tut147s1-multi-gpu.tar.gz +tut105-multi-gpu.tar.gz +*-sc24.pdf +tut* diff --git a/.zenodo.json b/.zenodo.json index e012d96..6da1b10 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -2,11 +2,11 @@ "creators": [ { "orcid": "0000-0002-5699-1793", - "affiliation": "Barcelona Supercomputing Center", + "affiliation": "Sandia National Laboratories", "name": "Garcia de Gonzalo, Simon" }, { - "orcid": "0000-0002-1825-0097", + "orcid": "0000-0002-7150-2505", "affiliation": "Jülich Supercomputing Centre", "name": "Herten, Andreas" }, @@ -29,21 +29,21 @@ "title": "Efficient Distributed GPU Programming for Exascale", - "publication_date": "2022-05-29", + "publication_date": "2025-06-13", - "description": "

Over the past years, GPUs became ubiquitous in HPC installations around the world. Today, they provide the majority of performance of some of the largest supercomputers (e.g. Summit, Sierra, JUWELS Booster). This trend continues in the pre-exascale and exascale systems (LUMI, Leonardo; Perlmutter, Frontier): GPUs are chosen as the core computing devices to enter this next era of HPC.

To take advantage of future GPU-accelerated systems with tens of thousands of devices, application developers need to have the propers skills and tools to understand, manage, and optimize distributed GPU applications. In this tutorial, participants will learn techniques to efficiently program large-scale multi-GPU systems. While programming multiple GPUs with MPI is explained in detail, advanced tuning techniques and complementary programming models like NCCL and NVSHMEM are presented as well. Tools for analysis are shown and used to motivate and implement performance optimizations. The tutorial is a combination of lectures and hands-on exercises, using Europe’s fastest supercomputer, JUWELS Booster with NVIDIA GPUs, for interactive learning and discovery.

", + "description": "

Over the past decade, GPUs became ubiquitous in HPC installations around the world, delivering the majority of performance of some of the largest supercomputers (e.g. Summit, Sierra, JUWELS Booster). This trend continues in the recently deployed and upcoming Pre-Exascale and Exascale systems (JUPITER, LUMI, Leonardo; El Capitan, Frontier, Aurora): GPUs are chosen as the core computing devices to enter this next era of HPC. To take advantage of future GPU-accelerated systems with tens of thousands of devices, application developers need to have the proper skills and tools to understand, manage, and optimize distributed GPU applications. In this tutorial, participants will learn techniques to efficiently program large-scale multi-GPU systems. While programming multiple GPUs with MPI is explained in detail, also advanced tuning techniques and complementing programming models like NCCL and NVSHMEM are presented. Tools for analysis are shown and used to motivate and implement performance optimizations. The tutorial teaches fundamental concepts that apply to GPU-accelerated systems in general, taking the NVIDIA platform as an example. It is a combination of lectures and hands-on exercises, using a development system for JUPITER (JEDI), for interactive learning and discovery.

", - "notes": "Slides and exercises of tutorial presented virtually at ISC22 (ISC High Performance 2022); https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2", + "notes": "Slides and exercises of tutorial presented at ISC High Performance 2025; https://isc.app.swapcard.com/widget/event/isc-high-performance-2025/planning/UGxhbm5pbmdfMjU4MTc5Ng==", "access_right": "open", - "conference_title": "ISC HPC 2022", - "conference_acronym": "ISC22", - "conference_dates": "29 May-02 June 2022", + "conference_title": "ISC 2025", + "conference_acronym": "ISC25", + "conference_dates": "10 June-13 June 2025", "conference_place": "Hamburg, Germany", "conference_url": "https://www.isc-hpc.com/", "conference_session": "Tutorials", - "conference_session_part": "Day 1", + "conference_session_part": "Afternoon", "upload_type": "lesson" } diff --git a/01-L_Introduction_Overview/slides.pdf b/01-L_Introduction_Overview/slides.pdf index 5a5302a..789f1ca 100644 Binary files a/01-L_Introduction_Overview/slides.pdf and b/01-L_Introduction_Overview/slides.pdf differ diff --git a/01b-H_Onboarding/slides.pdf b/01b-H_Onboarding/slides.pdf index 299d103..7970007 100644 Binary files a/01b-H_Onboarding/slides.pdf and b/01b-H_Onboarding/slides.pdf differ diff --git a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf index f28aedb..6b026e7 100644 Binary files a/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf and b/02-L_Introduction_to_MPI-Distributed_Computing_with_GPUs/slides.pdf differ diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb index 0bfdfab..4a2dc44 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 
Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -71,7 +71,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" + "id": "e42b5ab3-f626-4da5-b0c9-52a444cefde8" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/.master/Instructions.md b/03-H_Multi_GPU_Parallelization/.master/Instructions.md index f9794b2..4427a60 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/.master/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/.master/Makefile b/03-H_Multi_GPU_Parallelization/.master/Makefile.in similarity index 72% rename from 03-H_Multi_GPU_Parallelization/.master/Makefile rename to 03-H_Multi_GPU_Parallelization/.master/Makefile.in 
index 9f9c84b..e15d85c 100644 --- a/03-H_Multi_GPU_Parallelization/.master/Makefile +++ b/03-H_Multi_GPU_Parallelization/.master/Makefile.in @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 03H-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 @@ -6,6 +8,7 @@ CUDA_HOME ?= /usr/local/cuda ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -14,7 +17,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -31,10 +35,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/03-H_Multi_GPU_Parallelization/.master/copy.mk 
b/03-H_Multi_GPU_Parallelization/.master/copy.mk index bf1a86e..895d460 100755 --- a/03-H_Multi_GPU_Parallelization/.master/copy.mk +++ b/03-H_Multi_GPU_Parallelization/.master/copy.mk @@ -1,13 +1,13 @@ #!/usr/bin/make -f -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. TASKDIR = ../tasks/ SOLUTIONDIR = ../solutions/ -OPT_SOLUTIONDIR = ../solutions/advanced +OPT_SOLUTIONDIR = ../solutions/advanced/ IYPNB_TEMPLATE = ../../.template.json PROCESSFILES = jacobi.cu -COPYFILES = Makefile Instructions.ipynb Instructions.md +COPYFILES = Instructions.ipynb Instructions.md TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) @@ -16,12 +16,19 @@ SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) OPT_SOLUTIONPROCCESFILES = $(addprefix $(OPT_SOLUTIONDIR)/,$(PROCESSFILES)) SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) OPT_SOLUTIONCOPYFILES = $(addprefix $(OPT_SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR) $(OPT_SOLUTIONDIR)) .PHONY: all task all: task -task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${OPT_SOLUTIONPROCCESFILES} ${OPT_SOLUTIONCOPYFILES} - +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${OPT_SOLUTIONPROCCESFILES} ${OPT_SOLUTIONCOPYFILES} ${MAKEFILES} + +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ +$(OPT_SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/solopt/' $< > $@ ${TASKPROCCESFILES}: $(PROCESSFILES) mkdir -p $(TASKDIR)/ diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb index 0bfdfab..4a2dc44 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb +++ 
b/03-H_Multi_GPU_Parallelization/solutions/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -71,7 +71,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" + "id": "e42b5ab3-f626-4da5-b0c9-52a444cefde8" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md index f9794b2..4427a60 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/Makefile 
b/03-H_Multi_GPU_Parallelization/solutions/Makefile index 9f9c84b..92f033c 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/Makefile +++ b/03-H_Multi_GPU_Parallelization/solutions/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 03H-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 @@ -6,6 +8,7 @@ CUDA_HOME ?= /usr/local/cuda ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -14,7 +17,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -31,10 +35,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git 
a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb index 0bfdfab..4a2dc44 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -71,7 +71,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" + "id": "e42b5ab3-f626-4da5-b0c9-52a444cefde8" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md index f9794b2..4427a60 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - 
https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/solutions/advanced/Makefile b/03-H_Multi_GPU_Parallelization/solutions/advanced/Makefile index 9f9c84b..a6399eb 100644 --- a/03-H_Multi_GPU_Parallelization/solutions/advanced/Makefile +++ b/03-H_Multi_GPU_Parallelization/solutions/advanced/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 03H-solopt +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 @@ -6,6 +8,7 @@ CUDA_HOME ?= /usr/local/cuda ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -14,7 +17,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -31,10 +35,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log 
./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb index 0bfdfab..4a2dc44 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI\n", "\n", @@ -71,7 +71,7 @@ " size + 1 rows\n", "- Adapt the computation of (`iy_start_global`)" ], - "id": "6307a649-048f-4dd3-82ee-863dfcfba33b" + "id": "e42b5ab3-f626-4da5-b0c9-52a444cefde8" } ], "nbformat": 4, diff --git a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md index f9794b2..4427a60 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Instructions.md +++ b/03-H_Multi_GPU_Parallelization/tasks/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM 
CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 3: Multi-GPU Parallelization with CUDA-aware MPI diff --git a/03-H_Multi_GPU_Parallelization/tasks/Makefile b/03-H_Multi_GPU_Parallelization/tasks/Makefile index 9f9c84b..d293686 100644 --- a/03-H_Multi_GPU_Parallelization/tasks/Makefile +++ b/03-H_Multi_GPU_Parallelization/tasks/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 03H-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 @@ -6,6 +8,7 @@ CUDA_HOME ?= /usr/local/cuda ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -14,7 +17,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -31,10 +35,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log 
./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/04-L_Performance_and_debugging_tools/slides.pdf b/04-L_Performance_and_debugging_tools/slides.pdf index 42a8a49..c79e82d 100644 Binary files a/04-L_Performance_and_debugging_tools/slides.pdf and b/04-L_Performance_and_debugging_tools/slides.pdf differ diff --git a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf index b76953f..fe2d1ed 100644 Binary files a/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf and b/05-L_Optimization_techniques_for_multi-GPU_applications/slides.pdf differ diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb index 3c7fa05..a5132d6 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " 
https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -45,7 +45,9 @@ " target (`make profile`)\n", "3. Open the recorded profile in the GUI\n", " - Either: Install Nsight Systems locally, and transfer the\n", - " .qdrep/.nsys-rep file\n", + " .nsys-rep file.\n", + " - *Note*: Right-click in file-browser, choose “Download” from\n", + " context menu\n", " - Or: By running Xpra in your browser: In Jupyter, select “File \\>\n", " New Launcher” and “Xpra Desktop”, which will open in a new tab.\n", " Don’t forget to source the environment in your `xterm`.\n", @@ -85,7 +87,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" + "id": "9d8c72f4-e257-40f3-a0e3-8dcc087f0e7f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md index f971296..caf1792 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 6: Overlap Communication and Computation with MPI @@ -32,7 +32,8 @@ Use the Nsight System profiler to profile the starting point version non-Overlap 1. 
Start by compiling and running the application with `make run` 1. Record an Nsight Systems profile, using the appropriate Makefile target (`make profile`) 1. Open the recorded profile in the GUI - - Either: Install Nsight Systems locally, and transfer the .qdrep/.nsys-rep file + - Either: Install Nsight Systems locally, and transfer the .nsys-rep file. + - *Note*: Right-click in file-browser, choose "Download" from context menu - Or: By running Xpra in your browser: In Jupyter, select "File > New Launcher" and "Xpra Desktop", which will open in a new tab. Don't forget to source the environment in your `xterm`. 1. Familiarize yourself with the different rows and the traces they represent. - See if you can correlate a CUDA API kernel launch call and the resulting kernel execution on the device diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile b/06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile.in similarity index 72% rename from 06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile rename to 06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile.in index 2ca46c7..b7ce7a5 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/Makefile.in @@ -1,9 +1,12 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 06H-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc MPICXX=mpicxx JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 CUDA_HOME ?= /usr/local/cuda +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -12,7 +15,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -32,10 +36,10 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk b/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk index 8a96f59..4d8511a 100755 --- a/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk +++ b/06-H_Overlap_Communication_and_Computation_MPI/.master/copy.mk @@ -1,23 +1,28 @@ #!/usr/bin/make -f -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. TASKDIR = ../tasks/ SOLUTIONDIR = ../solutions/ IYPNB_TEMPLATE = ../../.template.json PROCESSFILES = jacobi.cpp -COPYFILES = Makefile Instructions.ipynb jacobi_kernels.cu +COPYFILES = Instructions.ipynb Instructions.md jacobi_kernels.cu TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR)) .PHONY: all task clean all: task -task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${MAKEFILES} +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ ${TASKPROCCESFILES}: $(PROCESSFILES) mkdir -p $(TASKDIR)/ diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb index 3c7fa05..a5132d6 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " 
https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -45,7 +45,9 @@ " target (`make profile`)\n", "3. Open the recorded profile in the GUI\n", " - Either: Install Nsight Systems locally, and transfer the\n", - " .qdrep/.nsys-rep file\n", + " .nsys-rep file.\n", + " - *Note*: Right-click in file-browser, choose “Download” from\n", + " context menu\n", " - Or: By running Xpra in your browser: In Jupyter, select “File \\>\n", " New Launcher” and “Xpra Desktop”, which will open in a new tab.\n", " Don’t forget to source the environment in your `xterm`.\n", @@ -85,7 +87,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" + "id": "9d8c72f4-e257-40f3-a0e3-8dcc087f0e7f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md new file mode 100644 index 0000000..caf1792 --- /dev/null +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Instructions.md @@ -0,0 +1,63 @@ +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA +- Program Link: + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 + +## Hands-On 6: Overlap Communication and Computation with MPI + +You are now going to apply the concepts you learned in the lectures 4 and 5: Using profiling tools, +and applying them to implement overlapping MPI with GPU kernels. + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. 
For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. + +### Task 0: Profile the non-overlap MPI-CUDA version of the code + +Use the Nsight System profiler to profile the starting point version non-Overlap MPI jacobi solver. The objective is to become familiar with navigating the GUI and identify possible areas to overlap computation and communication. + +1. Start by compiling and running the application with `make run` +1. Record an Nsight Systems profile, using the appropriate Makefile target (`make profile`) +1. Open the recorded profile in the GUI + - Either: Install Nsight Systems locally, and transfer the .nsys-rep file. + - *Note*: Right-click in file-browser, choose "Download" from context menu + - Or: By running Xpra in your browser: In Jupyter, select "File > New Launcher" and "Xpra Desktop", which will open in a new tab. Don't forget to source the environment in your `xterm`. +1. Familiarize yourself with the different rows and the traces they represent. + - See if you can correlate a CUDA API kernel launch call and the resulting kernel execution on the device +1. Follow the lecture steps and identify the relevant section with overlap potential in your code + - Hint: Try navigating with the NVTX ranges. + + +### Task 1: Implement Communication/Computation overlap + +Realize the optimization potential you discovered in the previous task and reduce the whitespace between kernel calls on the GPU profile by implementing communication/computation overlap. + +You will need to separately calculate the boundary regions, and you should use high-priority streams. A less efficient (problem size-dependent) alternative to high-priority streams would be to launch the boundary processing kernels before the bulk kernel. +These boundary regions are then used for the halo exchange. + +The starting point of this task is the non-overlapping MPI variant of the Jacobi solver. 
+Follow the `TODO`s in `jacobi.cpp`: + +- Query the priority range to be used by the CUDA streams +- Create new top and bottom CUDA streams and corresponding CUDA events +- Initialize all streams using priorities +- Modify the original call to `launch_jacobi_kernel` to not compute the top and bottom regions +- Add additional calls to `launch_jacobi_kernel` for the top and bottom regions using the high-priority streams +- Wait on both top and bottom streams when calculating the norm +- Synchronize top and bottom streams before applying the periodic boundary conditions using MPI +- Destroy the additional cuda streams and events before ending the application + + diff --git a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Makefile b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Makefile index 2ca46c7..a8ee71a 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/solutions/Makefile +++ b/06-H_Overlap_Communication_and_Computation_MPI/solutions/Makefile @@ -1,9 +1,12 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 06H-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc MPICXX=mpicxx JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 CUDA_HOME ?= /usr/local/cuda +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -12,7 +15,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -32,10 +36,10 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb index 3c7fa05..a5132d6 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - 
"# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 6: Overlap Communication and Computation with MPI\n", "\n", @@ -45,7 +45,9 @@ " target (`make profile`)\n", "3. Open the recorded profile in the GUI\n", " - Either: Install Nsight Systems locally, and transfer the\n", - " .qdrep/.nsys-rep file\n", + " .nsys-rep file.\n", + " - *Note*: Right-click in file-browser, choose “Download” from\n", + " context menu\n", " - Or: By running Xpra in your browser: In Jupyter, select “File \\>\n", " New Launcher” and “Xpra Desktop”, which will open in a new tab.\n", " Don’t forget to source the environment in your `xterm`.\n", @@ -85,7 +87,7 @@ "- Destroy the additional cuda streams and events before ending the\n", " application" ], - "id": "846bc0ce-c189-4bb0-b5ea-7980298d88eb" + "id": "9d8c72f4-e257-40f3-a0e3-8dcc087f0e7f" } ], "nbformat": 4, diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md new file mode 100644 index 0000000..caf1792 --- /dev/null +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Instructions.md @@ -0,0 +1,63 @@ +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA +- Program Link: + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 + +## Hands-On 6: Overlap 
Communication and Computation with MPI + +You are now going to apply the concepts you learned in the lectures 4 and 5: Using profiling tools, +and applying them to implement overlapping MPI with GPU kernels. + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. + +### Task 0: Profile the non-overlap MPI-CUDA version of the code + +Use the Nsight System profiler to profile the starting point version non-Overlap MPI jacobi solver. The objective is to become familiar with navigating the GUI and identify possible areas to overlap computation and communication. + +1. Start by compiling and running the application with `make run` +1. Record an Nsight Systems profile, using the appropriate Makefile target (`make profile`) +1. Open the recorded profile in the GUI + - Either: Install Nsight Systems locally, and transfer the .nsys-rep file. + - *Note*: Right-click in file-browser, choose "Download" from context menu + - Or: By running Xpra in your browser: In Jupyter, select "File > New Launcher" and "Xpra Desktop", which will open in a new tab. Don't forget to source the environment in your `xterm`. +1. Familiarize yourself with the different rows and the traces they represent. + - See if you can correlate a CUDA API kernel launch call and the resulting kernel execution on the device +1. Follow the lecture steps and identify the relevant section with overlap potential in your code + - Hint: Try navigating with the NVTX ranges. + + +### Task 1: Implement Communication/Computation overlap + +Realize the optimization potential you discovered in the previous task and reduce the whitespace between kernel calls on the GPU profile by implementing communication/computation overlap. 
+ +You will need to separately calculate the boundary regions for the halo exchange, and you should use high-priority streams. A less efficient (problem size-dependent) alternative to high-priority streams would be to launch the boundary processing kernels before the bulk kernel. + +The starting point of this task is the non-overlapping MPI variant of the Jacobi solver. +Follow the `TODO`s in `jacobi.cpp`: + +- Query the priority range to be used by the CUDA streams +- Create new top and bottom CUDA streams and corresponding CUDA events +- Initialize all streams using priorities +- Modify the original call to `launch_jacobi_kernel` to not compute the top and bottom regions +- Add additional calls to `launch_jacobi_kernel` for the top and bottom regions using the high-priority streams +- Wait on both top and bottom streams when calculating the norm +- Synchronize top and bottom streams before applying the periodic boundary conditions using MPI +- Destroy the additional cuda streams and events before ending the application + + diff --git a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Makefile b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Makefile index 2ca46c7..36da1bf 100644 --- a/06-H_Overlap_Communication_and_Computation_MPI/tasks/Makefile +++ b/06-H_Overlap_Communication_and_Computation_MPI/tasks/Makefile @@ -1,9 +1,12 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 06H-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc MPICXX=mpicxx JSC_SUBMIT_CMD ?= srun --gres=gpu:4 --ntasks-per-node 4 CUDA_HOME ?= /usr/local/cuda +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -12,7 +15,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -32,10 +36,10 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf index 2e5ca68..aa4daad 100644 Binary files a/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf and b/07-L_Overview_of_NCCL_and_NVSHMEM_in_MPI_Programs/slides.pdf differ diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb 
b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb index cd4d85a..cebc40c 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -22,15 +22,20 @@ "variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", + "\n", + "If you have time left:\n", + "\n", + "- Use ncclMemAlloc to allocate the buffers and register them for\n", + 
" communication\n", + "- Don\\`t forget to deregister and free the buffers correctly\n", "\n", "Compile with\n", "\n", @@ -48,7 +53,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." ], - "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" + "id": "42337a25-287a-4ae5-bcc6-e87fcf42ee4e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md index 465181f..8c52274 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication @@ -13,14 +13,18 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NVSHMEM: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. 
+- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator + +If you have time left: + +- Use ncclMemAlloc to allocate the buffers and register them for communication +- Don't forget to deregister and free the buffers correctly Compile with diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Makefile.in b/08-H_NCCL_NVSHMEM/.master/NCCL/Makefile.in new file mode 100644 index 0000000..ccbec37 --- /dev/null +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/Makefile.in @@ -0,0 +1,49 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 08H-NCCL-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') +NP ?= 1 +NVCC=nvcc +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 +MPICXX=mpicxx +CUDA_HOME ?= /usr/local/cuda +NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 +GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 +GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 +GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 +GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 +GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) +ifdef DISABLE_CUB + NVCC_FLAGS = -Xptxas --optimize-float-atomics +else + NVCC_FLAGS = -DHAVE_CUB +endif +NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 +MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 +LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart 
-lnvToolsExt -lnccl +jacobi: Makefile jacobi.cpp jacobi_kernels.o + $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi + +jacobi_kernels.o: Makefile jacobi_kernels.cu + $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c + +.PHONY.: clean +clean: + rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log + +sanitize: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + +run: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi + +run_user_buffer: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg + +profile: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/copy.mk b/08-H_NCCL_NVSHMEM/.master/NCCL/copy.mk index b13d4df..3b55268 100755 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/copy.mk +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/copy.mk @@ -1,23 +1,28 @@ #!/usr/bin/make -f -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. 
TASKDIR = ../../tasks/NCCL SOLUTIONDIR = ../../solutions/NCCL IYPNB_TEMPLATE = ../../../.template.json PROCESSFILES = jacobi.cpp -COPYFILES = Makefile jacobi_kernels.cu Instructions.ipynb Instructions.md +COPYFILES = jacobi_kernels.cu Instructions.ipynb Instructions.md TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR)) .PHONY: all task all: task -task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${MAKEFILES} +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ ${TASKPROCCESFILES}: $(PROCESSFILES) mkdir -p $(TASKDIR)/ diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp index 33760e2..5945b12 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/.master/NCCL/jacobi.cpp @@ -95,6 +95,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t); #ifdef SOLUTION #include #endif +#ifdef NCCL_VERSION +#define NCCL_VERSION_UB NCCL_VERSION(2,19,1) +#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB +#else +#define NCCL_UB_SUPPORT 0 +#endif + #define NCCL_CALL(call) \ { \ @@ -172,6 +179,13 @@ int main(int argc, char* argv[]) { const int nx = get_argval(argv, argv + argc, "-nx", 16384); const int ny = get_argval(argv, argv + argc, "-ny", 16384); const bool csv = get_arg(argv, argv + argc, "-csv"); + bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg"); +#if NCCL_UB_SUPPORT == 0 + if (user_buffer_reg) { + fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or 
later.\n"); + user_buffer_reg = false; + } +#endif //NCCL_UB_SUPPORT == 0 int local_rank = -1; { @@ -226,10 +240,30 @@ int main(int argc, char* argv[]) { chunk_size = chunk_size_high; real* a; - CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); real* a_new; - CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); +#if NCCL_UB_SUPPORT + void* a_reg_handle; + void* a_new_reg_handle; + if (user_buffer_reg) { + //TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior +#ifdef SOLUTION + + NCCL_CALL(ncclMemAlloc( (void**) &a , nx * (chunk_size + 2) * sizeof(real))); + NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real))); + NCCL_CALL(ncclCommRegister(nccl_comm, a , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle)); + NCCL_CALL(ncclCommRegister(nccl_comm, a_new, nx * (chunk_size + 2) * sizeof(real), &a_new_reg_handle)); +#endif + if ( nccl_version < 22304 ) { + fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n"); + } + } + else +#endif //NCCL_UB_SUPPORT + { + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + } CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); @@ -271,7 +305,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range #ifdef SOLUTION PUSH_RANGE("NCCL_Warmup", 5) #else @@ -326,7 +360,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 
1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -346,7 +380,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() #ifdef SOLUTION @@ -358,14 +392,14 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); #else - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -410,13 +444,13 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap #ifdef SOLUTION printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( @@ -434,10 +468,22 @@ int main(int 
argc, char* argv[]) { CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); CUDA_RT_CALL(cudaFree(l2_norm_d)); - +#if NCCL_UB_SUPPORT + if (user_buffer_reg) { +//TODO: Deregister and Free the Buffer +#ifdef SOLUTION + NCCL_CALL(ncclCommDeregister(nccl_comm, a_new_reg_handle)); + NCCL_CALL(ncclCommDeregister(nccl_comm, a_reg_handle)); + NCCL_CALL(ncclMemFree(a_new)); + NCCL_CALL(ncclMemFree(a)); +#endif + } + else +#endif //NCCL_UB_SUPPORT + { CUDA_RT_CALL(cudaFree(a_new)); CUDA_RT_CALL(cudaFree(a)); - + } CUDA_RT_CALL(cudaFreeHost(a_h)); CUDA_RT_CALL(cudaFreeHost(a_ref_h)); diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb index eb6077a..8a725ad 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -22,18 +22,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. 
Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -53,18 +52,18 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. 
This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." ], - "id": "4153946b-60de-494a-ad07-7ecb34a91c84" + "id": "aea0b932-fe75-453b-a546-0bafe398d77d" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md index a263c17..750f873 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM @@ -13,15 +13,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. 
You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. - - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! +- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. +- Finalize NVSHMEM before exiting the application Compile with @@ -40,5 +39,5 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. 
+The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile.in similarity index 77% rename from 08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile rename to 08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile.in index e711f37..8aad9e0 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/Makefile.in @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, All rights reserved. 
+THIS_TASK := 08H-NVSHMEM-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release @@ -11,6 +13,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -19,7 +22,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -36,10 +40,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/copy.mk b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/copy.mk index a4c37c2..4d62bd5 100755 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/copy.mk 
+++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/copy.mk @@ -1,23 +1,28 @@ #!/usr/bin/make -f -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. TASKDIR = ../../tasks/NVSHMEM SOLUTIONDIR = ../../solutions/NVSHMEM IYPNB_TEMPLATE = ../../../.template.json PROCESSFILES = jacobi.cu -COPYFILES = Makefile Instructions.ipynb Instructions.md +COPYFILES = Instructions.ipynb Instructions.md TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR)) .PHONY: all task all: task -task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${MAKEFILES} +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ ${TASKPROCCESFILES}: $(PROCESSFILES) mkdir -p $(TASKDIR)/ diff --git a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu index e755c7c..e4f6bcd 100644 --- a/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/.master/NVSHMEM/jacobi.cu @@ -355,11 +355,11 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions #ifdef SOLUTION - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); 
nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); #else PUSH_RANGE("MPI", 5) @@ -369,12 +369,12 @@ int main(int argc, char* argv[]) { MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); #endif - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) #ifdef SOLUTION nvshmemx_barrier_all_on_stream(compute_stream); #endif @@ -419,9 +419,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output + //TODO: Replace MPI with NVSHMEM for your output #ifdef SOLUTION - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #else printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, #endif diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb index cd4d85a..cebc40c 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention 
Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -22,15 +22,20 @@ "variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", + "\n", + "If you have time left:\n", + "\n", + "- Use ncclMemAlloc to allocate the buffers and register them for\n", + " communication\n", + "- Don\\`t forget to deregister and free the buffers correctly\n", "\n", "Compile with\n", "\n", @@ -48,7 +53,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." 
], - "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" + "id": "42337a25-287a-4ae5-bcc6-e87fcf42ee4e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md index 465181f..8c52274 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication @@ -13,14 +13,18 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NVSHMEM: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. 
+- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator + +If you have time left: + +- Use ncclMemAlloc to allocate the buffers and register them for communication +- Don`t forget to deregister and free the buffers correctly Compile with diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile b/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile index 2ba58cf..b7ff2f5 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/Makefile @@ -1,10 +1,13 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 08H-NCCL-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 MPICXX=mpicxx CUDA_HOME ?= /usr/local/cuda NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -13,7 +16,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -33,10 +37,13 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) 
compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi +run_user_buffer: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg + profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp index 0b0b018..0c71eef 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/solutions/NCCL/jacobi.cpp @@ -93,6 +93,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t); //TODO: include NCCL headers #include +#ifdef NCCL_VERSION +#define NCCL_VERSION_UB NCCL_VERSION(2,19,1) +#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB +#else +#define NCCL_UB_SUPPORT 0 +#endif + #define NCCL_CALL(call) \ { \ @@ -168,6 +175,13 @@ int main(int argc, char* argv[]) { const int nx = get_argval(argv, argv + argc, "-nx", 16384); const int ny = get_argval(argv, argv + argc, "-ny", 16384); const bool csv = get_arg(argv, argv + argc, "-csv"); + bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg"); +#if NCCL_UB_SUPPORT == 0 + if (user_buffer_reg) { + fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n"); + user_buffer_reg = false; + } +#endif //NCCL_UB_SUPPORT == 0 int local_rank = -1; { @@ -220,10 +234,28 @@ int main(int argc, char* argv[]) { chunk_size = chunk_size_high; real* a; - CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); real* a_new; - CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); +#if NCCL_UB_SUPPORT + void* a_reg_handle; 
+ void* a_new_reg_handle; + if (user_buffer_reg) { + //TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior + + NCCL_CALL(ncclMemAlloc( (void**) &a , nx * (chunk_size + 2) * sizeof(real))); + NCCL_CALL(ncclMemAlloc( (void**) &a_new, nx * (chunk_size + 2) * sizeof(real))); + NCCL_CALL(ncclCommRegister(nccl_comm, a , nx * (chunk_size + 2) * sizeof(real), &a_reg_handle)); + NCCL_CALL(ncclCommRegister(nccl_comm, a_new, nx * (chunk_size + 2) * sizeof(real), &a_new_reg_handle)); + if ( nccl_version < 22304 ) { + fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n"); + } + } + else +#endif //NCCL_UB_SUPPORT + { + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + } CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); @@ -265,7 +297,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range PUSH_RANGE("NCCL_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? 
rank - 1 : (size - 1); @@ -308,7 +340,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -328,7 +360,7 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. // Remember to use ncclGroupStart() and ncclGroupEnd() PUSH_RANGE("NCCL_LAUNCH", 5) @@ -338,7 +370,7 @@ int main(int argc, char* argv[]) { NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); NCCL_CALL(ncclGroupEnd()); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -383,9 +415,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( @@ -403,10 +435,20 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); CUDA_RT_CALL(cudaFree(l2_norm_d)); 
- +#if NCCL_UB_SUPPORT + if (user_buffer_reg) { +//TODO: Deregister and Free the Buffer + NCCL_CALL(ncclCommDeregister(nccl_comm, a_new_reg_handle)); + NCCL_CALL(ncclCommDeregister(nccl_comm, a_reg_handle)); + NCCL_CALL(ncclMemFree(a_new)); + NCCL_CALL(ncclMemFree(a)); + } + else +#endif //NCCL_UB_SUPPORT + { CUDA_RT_CALL(cudaFree(a_new)); CUDA_RT_CALL(cudaFree(a)); - + } CUDA_RT_CALL(cudaFreeHost(a_h)); CUDA_RT_CALL(cudaFreeHost(a_ref_h)); diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb index eb6077a..8a725ad 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -22,18 +22,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. 
Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -53,18 +52,18 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. 
This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." ], - "id": "4153946b-60de-494a-ad07-7ecb34a91c84" + "id": "aea0b932-fe75-453b-a546-0bafe398d77d" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md index a263c17..750f873 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM @@ -13,15 +13,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. 
You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. - - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! +- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. +- Finalize NVSHMEM before existing the application Compile with @@ -40,5 +39,5 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. 
+The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile index e711f37..823b736 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, All rights reserved. +THIS_TASK := 08H-NVSHMEM-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release @@ -11,6 +13,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -19,7 +22,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ 
-36,10 +40,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu index d293c62..dd55b30 100644 --- a/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/solutions/NVSHMEM/jacobi.cu @@ -342,17 +342,17 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions - PUSH_RANGE("NVSHMEM", 5) - nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); + PUSH_RANGE("NVSHMEM", 5) + nvshmemx_float_put_on_stream(a_new + iy_top_lower_boundary_idx * nx, a_new + iy_start * nx, nx, top, push_stream); nvshmemx_float_put_on_stream(a_new + iy_bottom_upper_boundary_idx * nx, a_new + (iy_end - 1) * nx, nx, bottom, push_stream); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) 
+ //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) nvshmemx_barrier_all_on_stream(compute_stream); if (calculate_norm) { @@ -395,8 +395,8 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output - printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + //TODO: Replace MPI with NVSHMEM for your output + printf("nvshmem, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb index cd4d85a..cebc40c 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication\n", "\n", @@ -22,15 +22,20 @@ "variant of the jacobi solver. 
You need to work on `TODOs` in\n", "`jacobi.cpp`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NCCL headers.\n", - " - Create a NCCL unique ID, and initialize it\n", - " - Create a NCCL communicator and initilize it\n", - " - Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls\n", - " for the warmup stage\n", - " - Replace MPI for the periodic boundary conditions with NCCL\n", - " - Fix output message to indicate nccl rather than mpi\n", - " - Destroy NCCL comunicator\n", + "- Include NCCL headers.\n", + "- Create a NCCL unique ID, and initialize it\n", + "- Create a NCCL communicator and initialize it\n", + "- Replace the MPI_Sendrecv calls with ncclRecv and ncclSend calls for\n", + " the warmup stage\n", + "- Replace MPI for the periodic boundary conditions with NCCL\n", + "- Fix output message to indicate nccl rather than mpi\n", + "- Destroy NCCL communicator\n", + "\n", + "If you have time left:\n", + "\n", + "- Use ncclMemAlloc to allocate the buffers and register them for\n", + " communication\n", + "- Don\\`t forget to deregister and free the buffers correctly\n", "\n", "Compile with\n", "\n", @@ -48,7 +53,7 @@ "`make profile`. For `make run` and `make profile` the environment\n", "variable `NP` can be set to change the number of processes." 
], - "id": "8c9e9e42-bda5-4b52-a322-0e72171476c5" + "id": "42337a25-287a-4ae5-bcc6-e87fcf42ee4e" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md index 465181f..8c52274 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NCCL: Using NCCL for Inter-GPU Communication @@ -13,14 +13,18 @@ The purpose of this task is to use NCCL instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cpp`: -- Initialize NVSHMEM: - - Include NCCL headers. - - Create a NCCL unique ID, and initialize it - - Create a NCCL communicator and initilize it - - Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage - - Replace MPI for the periodic boundary conditions with NCCL - - Fix output message to indicate nccl rather than mpi - - Destroy NCCL comunicator +- Include NCCL headers. 
+- Create a NCCL unique ID, and initialize it +- Create a NCCL communicator and initialize it +- Replace the MPI\_Sendrecv calls with ncclRecv and ncclSend calls for the warmup stage +- Replace MPI for the periodic boundary conditions with NCCL +- Fix output message to indicate nccl rather than mpi +- Destroy NCCL communicator + +If you have time left: + +- Use ncclMemAlloc to allocate the buffers and register them for communication +- Don`t forget to deregister and free the buffers correctly Compile with diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile b/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile index 2ba58cf..a6bb0dd 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/Makefile @@ -1,10 +1,13 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 08H-NCCL-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 MPICXX=mpicxx CUDA_HOME ?= /usr/local/cuda NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -13,7 +16,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -33,10 +37,13 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer 
--log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi +run_user_buffer: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi -user_buffer_reg + profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp index 3588d21..5bcf77e 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp +++ b/08-H_NCCL_NVSHMEM/tasks/NCCL/jacobi.cpp @@ -92,6 +92,13 @@ const int num_colors = sizeof(colors) / sizeof(uint32_t); } //TODO: include NCCL headers +#ifdef NCCL_VERSION +#define NCCL_VERSION_UB NCCL_VERSION(2,19,1) +#define NCCL_UB_SUPPORT NCCL_VERSION_CODE >= NCCL_VERSION_UB +#else +#define NCCL_UB_SUPPORT 0 +#endif + #define NCCL_CALL(call) \ { \ @@ -164,6 +171,13 @@ int main(int argc, char* argv[]) { const int nx = get_argval(argv, argv + argc, "-nx", 16384); const int ny = get_argval(argv, argv + argc, "-ny", 16384); const bool csv = get_arg(argv, argv + argc, "-csv"); + bool user_buffer_reg = get_arg(argv, argv + argc, "-user_buffer_reg"); +#if NCCL_UB_SUPPORT == 0 + if (user_buffer_reg) { + fprintf(stderr,"WARNING: Ignoring -user_buffer_reg, required NCCL APIs are provided by NCCL 2.19.1 or later.\n"); + user_buffer_reg = false; + } +#endif //NCCL_UB_SUPPORT == 0 int local_rank = -1; { @@ -206,10 +220,23 @@ int main(int argc, char* argv[]) { chunk_size = chunk_size_high; real* a; - CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); real* a_new; - CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); +#if NCCL_UB_SUPPORT + void* a_reg_handle; + void* a_new_reg_handle; + if 
(user_buffer_reg) { + //TODO: Allocate the memory with ncclMemAlloc and register it for the commmunicatior + if ( nccl_version < 22304 ) { + fprintf(stderr,"WARNING: -user_buffer_reg available, but Jacobi communication pattern needs NCCL 2.23.4 or later.\n"); + } + } + else +#endif //NCCL_UB_SUPPORT + { + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + } CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); @@ -251,7 +278,7 @@ int main(int argc, char* argv[]) { real* l2_norm_h; CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); -//TODO: Rename range + //TODO: Rename range PUSH_RANGE("MPI_Warmup", 5) for (int i = 0; i < 10; ++i) { const int top = rank > 0 ? rank - 1 : (size - 1); @@ -292,7 +319,7 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); - launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, compute_stream); launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, @@ -312,16 +339,16 @@ int main(int argc, char* argv[]) { const int bottom = (rank + 1) % size; // Apply periodic boundary conditions - //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls + //TODO: Modify the lable for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls // using the nccl communicator and push_stream. 
// Remember to use ncclGroupStart() and ncclGroupEnd() - PUSH_RANGE("MPI", 5) + PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE if (calculate_norm) { @@ -366,9 +393,9 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { - //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap - printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, - (stop - start), runtime_serial); + //TODO: Dont forget to change your output lable from mpi_overlap to nccl_overlap + printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + (stop - start), runtime_serial); } else { printf("Num GPUs: %d.\n", size); printf( @@ -386,10 +413,16 @@ int main(int argc, char* argv[]) { CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); CUDA_RT_CALL(cudaFree(l2_norm_d)); - +#if NCCL_UB_SUPPORT + if (user_buffer_reg) { +//TODO: Deregister and Free the Buffer + } + else +#endif //NCCL_UB_SUPPORT + { CUDA_RT_CALL(cudaFree(a_new)); CUDA_RT_CALL(cudaFree(a)); - + } CUDA_RT_CALL(cudaFreeHost(a_h)); CUDA_RT_CALL(cudaFreeHost(a_ref_h)); diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb index eb6077a..8a725ad 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for 
Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", "## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM\n", "\n", @@ -22,18 +22,17 @@ "is the MPI variant of the jacobi solver. You need to work on `TODOs` in\n", "`jacobi.cu`:\n", "\n", - "- Initialize NVSHMEM:\n", - " - Include NVSHMEM headers.\n", - " - Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", - " - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", - " heap. Take care of passing in a consistent size!\n", - " - Calculate halo/boundary row index of top and bottom neighbors.\n", - " - Add necessary inter PE synchronization.\n", - " - Replace MPI periodic boundary conditions with\n", - " `nvshmemx_float_put_on_stream` to directly push values needed by\n", - " top and bottom neighbors.\n", - " - Deallocate memory from the NVSHMEM symetric heap.\n", - " - Finalize NVSHMEM before existing the application\n", + "- Include NVSHMEM headers.\n", + "- Initialize NVSHMEM using `MPI_COMM_WORLD`.\n", + "- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric\n", + " heap. 
Take care of passing in a consistent size!\n", + "- Calculate halo/boundary row index of top and bottom neighbors.\n", + "- Add necessary inter PE synchronization.\n", + "- Replace MPI periodic boundary conditions with\n", + " `nvshmemx_float_put_on_stream` to directly push values needed by top\n", + " and bottom neighbors.\n", + "- Deallocate memory from the NVSHMEM symmetric heap.\n", + "- Finalize NVSHMEM before existing the application\n", "\n", "Compile with\n", "\n", @@ -53,18 +52,18 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets." 
], - "id": "4153946b-60de-494a-ad07-7ecb34a91c84" + "id": "aea0b932-fe75-453b-a546-0bafe398d77d" } ], "nbformat": 4, diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md index a263c17..750f873 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Instructions.md @@ -1,9 +1,9 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 ## Hands-On 8-NVSHMEM: Host-initiated Communication with NVSHMEM @@ -13,15 +13,14 @@ The purpose of this task is to use the NVSHMEM host API instead of MPI to implement a multi-GPU jacobi solver. The starting point of this task is the MPI variant of the jacobi solver. You need to work on `TODOs` in `jacobi.cu`: -- Initialize NVSHMEM: - - Include NVSHMEM headers. - - Initialize NVSHMEM using `MPI_COMM_WORLD`. - - Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! - - Calculate halo/boundary row index of top and bottom neighbors. - - Add necessary inter PE synchronization. - - Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. - - Deallocate memory from the NVSHMEM symetric heap. - - Finalize NVSHMEM before existing the application +- Include NVSHMEM headers. +- Initialize NVSHMEM using `MPI_COMM_WORLD`. +- Allocate work arrays `a` and `a_new` from the NVSHMEM symmetric heap. Take care of passing in a consistent size! 
+- Calculate halo/boundary row index of top and bottom neighbors. +- Add necessary inter PE synchronization. +- Replace MPI periodic boundary conditions with `nvshmemx_float_put_on_stream` to directly push values needed by top and bottom neighbors. +- Deallocate memory from the NVSHMEM symmetric heap. +- Finalize NVSHMEM before existing the application Compile with @@ -40,5 +39,5 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. +The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile index e711f37..7c57e3e 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. 
All rights reserved. +# Copyright (c) 2017-2024, All rights reserved. +THIS_TASK := 08H-NVSHMEM-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc N_D_C_VMM=1 #Enabled to hide warning and errors only found in NVSHMEM/2.5.0 to be fixed in next release @@ -11,6 +13,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -19,7 +22,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -36,10 +40,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu index 
0359d7e..b754207 100644 --- a/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu +++ b/08-H_NCCL_NVSHMEM/tasks/NVSHMEM/jacobi.cu @@ -333,7 +333,7 @@ int main(int argc, char* argv[]) { compute_stream)); } - //TODO: Replace MPI communication with Host initiated NVSHMEM calls + //TODO: Replace MPI communication with Host initiated NVSHMEM calls // Apply periodic boundary conditions PUSH_RANGE("MPI", 5) MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0, @@ -341,12 +341,12 @@ int main(int argc, char* argv[]) { MPI_STATUS_IGNORE)); MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx, MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE)); - CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); POP_RANGE CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); - //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) + //TODO: add necessary inter PE synchronization using the nvshmemx_barrier_all_on_stream(...) 
if (calculate_norm) { CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); @@ -388,7 +388,7 @@ int main(int argc, char* argv[]) { if (rank == 0 && result_correct) { if (csv) { -//TODO: Replace MPI with NVSHMEM for your output + //TODO: Replace MPI with NVSHMEM for your output printf("mpi, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, (stop - start), runtime_serial); } else { diff --git a/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf b/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf new file mode 100644 index 0000000..c2821dd Binary files /dev/null and b/09-L_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/slides.pdf differ diff --git a/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf b/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf deleted file mode 100644 index c12ae33..0000000 Binary files a/09-L_Device-initiated_Communication_with_NVSHMEM/slides.pdf and /dev/null differ diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb similarity index 76% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index 00b04f9..56f5b8a 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM 
- 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", - "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", + "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -53,15 +53,15 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use (see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. 
This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -86,7 +86,7 @@ " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." ], - "id": "f7525123-132c-4d36-890e-9efe369db7be" + "id": "ab1bfa7b-226f-4465-9f2d-5376e65931e2" } ], "nbformat": 4, diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md similarity index 69% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md index bd8df5f..7432522 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,11 +1,11 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 -## Hands-On 10: Device-initiated Communication with NVSHMEM +## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -41,7 +41,7 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so 
that each spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. +The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. ### Advanced Task: Use `nvshmemx_float_put_nbi_block` diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Makefile.in similarity index 75% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Makefile.in index 1d83127..1917f62 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Makefile +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/Makefile.in @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 10H-NVSHMEM-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 @@ -10,6 +12,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -18,7 +21,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -35,10 +39,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk similarity index 61% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk index 0557b43..02996b9 100755 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/copy.mk @@ -1,23 +1,28 @@ #!/usr/bin/make -f -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -TASKDIR = ../tasks/ -SOLUTIONDIR = ../solutions/ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. +TASKDIR = ../../tasks/Device-initiated_Communication_with_NVSHMEM +SOLUTIONDIR = ../../solutions/Device-initiated_Communication_with_NVSHMEM -IYPNB_TEMPLATE = ../../.template.json +IYPNB_TEMPLATE = ../../../.template.json PROCESSFILES = jacobi.cu -COPYFILES = Makefile Instructions.ipynb Instructions.md +COPYFILES = Instructions.ipynb Instructions.md TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR)) .PHONY: all task all: task -task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${MAKEFILES} +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ ${TASKPROCCESFILES}: $(PROCESSFILES) mkdir -p $(TASKDIR)/ diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/jacobi.cu similarity index 100% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/.master/jacobi.cu rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Device-initiated_Communication_with_NVSHMEM/jacobi.cu diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb new file mode 100644 index 0000000..a09a97d --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", + "- Program Link:\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", + "\n", + "## Hands-On 10B: Using CUDA Graphs\n", + "\n", + "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", + "\n", + "#### Description\n", + "\n", + "The purpose of this task is to introduce [CUDA\n", + "Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs).\n", + "For that, the NCCL version of the Jacobi solver developed in hands-on 8\n", + "is modified to use the [CUDA Graph Management\n", + "API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH)\n", + "so that the CUDA API calls required in the main solver loop are\n", + "minimized. 
You need to work on `TODOs` in `jacobi.cpp`:\n", + "\n", + "- Use\n", + " [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3)\n", + " and\n", + " [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e)\n", + " to create the necessary graphs.\n", + " - Read the comment at the top of the\n", + " `PUSH_RANGE(\"Build graphs\", 0)` structured block.\n", + "- Instantiate captured graphs with\n", + " [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233).\n", + " - Extra: Experiment (compare Nsight Systems timelines) with\n", + " [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff)\n", + " not using `cudaGraphInstantiateFlagUseNodePriority`.\n", + "- Optional: Manually upload instantiated graphs with\n", + " [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd).\n", + "- Use\n", + " [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597)\n", + " to launch a single graph per iteration instead of launching multiple\n", + " kernels in different streams and managing their dependencies with\n", + " `cudaEventRecord` and `cudaStreamWaitEvent`.\n", + "- Free resources with\n", + " [`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069)\n", + " and\n", + " 
[`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f).\n", + "\n", + "Compile with\n", + "\n", + "``` bash\n", + "make\n", + "```\n", + "\n", + "Submit your compiled application to the batch system with\n", + "\n", + "``` bash\n", + "make run\n", + "```\n", + "\n", + "Study the performance by glimpsing at the profile generated with\n", + "`make profile`. For `make run` and `make profile` the environment\n", + "variable `NP` can be set to change the number of processes." + ], + "id": "b5b4c6d4-656b-44dc-97c2-b51e226d522f" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + } +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md new file mode 100644 index 0000000..f0fa5be --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Instructions.md @@ -0,0 +1,40 @@ +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA +- Program Link: + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 + +## Hands-On 10B: Using CUDA Graphs + +### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication + +#### Description + +The purpose of this task is to introduce [CUDA 
Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs). +For that, the NCCL version of the Jacobi solver developed in hands-on 8 is modified to use the +[CUDA Graph Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH) +so that the CUDA API calls required in the main solver loop are minimized. You need to work on `TODOs` in `jacobi.cpp`: + +- Use [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3) and [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e) to create the necessary graphs. + - Read the comment at the top of the `PUSH_RANGE("Build graphs", 0)` structured block. +- Instantiate captured graphs with [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233). + - Extra: Experiment (compare Nsight Systems timelines) with [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff) not using `cudaGraphInstantiateFlagUseNodePriority`. +- Optional: Manually upload instantiated graphs with [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd). +- Use [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597) to launch a single graph per iteration instead of launching multiple kernels in different streams and managing their dependencies with `cudaEventRecord` and `cudaStreamWaitEvent`. 
+- Free resources with [`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069) and [`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f). + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Makefile.in b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Makefile.in new file mode 100644 index 0000000..7aa2d9c --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/Makefile.in @@ -0,0 +1,46 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 10H-Graphs-@@TASKSOL@@ +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') +NP ?= 1 +NVCC=nvcc +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 +MPICXX=mpicxx +CUDA_HOME ?= /usr/local/cuda +NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 +GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 +GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 +GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 +GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 +GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) +ifdef DISABLE_CUB + NVCC_FLAGS = -Xptxas --optimize-float-atomics +else + NVCC_FLAGS = -DHAVE_CUB +endif +NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 +MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 +LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt -lnccl +jacobi: Makefile jacobi.cpp jacobi_kernels.o + $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi + +jacobi_kernels.o: Makefile jacobi_kernels.cu + $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c + +.PHONY: clean +clean: + rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log + +sanitize: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + +run: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi + +profile: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx --cuda-graph-trace=node -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/copy.mk
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/copy.mk new file mode 100755 index 0000000..a8ac791 --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/copy.mk @@ -0,0 +1,47 @@ +#!/usr/bin/make -f +# Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. +TASKDIR = ../../tasks/Using_CUDA_Graphs +SOLUTIONDIR = ../../solutions/Using_CUDA_Graphs + +IYPNB_TEMPLATE = ../../../.template.json + +PROCESSFILES = jacobi.cpp +COPYFILES = jacobi_kernels.cu Instructions.ipynb Instructions.md + + +TASKPROCCESFILES = $(addprefix $(TASKDIR)/,$(PROCESSFILES)) +TASKCOPYFILES = $(addprefix $(TASKDIR)/,$(COPYFILES)) +SOLUTIONPROCCESFILES = $(addprefix $(SOLUTIONDIR)/,$(PROCESSFILES)) +SOLUTIONCOPYFILES = $(addprefix $(SOLUTIONDIR)/,$(COPYFILES)) +MAKEFILES = $(addsuffix /Makefile,$(TASKDIR) $(SOLUTIONDIR)) + +.PHONY: all task +all: task +task: ${TASKPROCCESFILES} ${TASKCOPYFILES} ${SOLUTIONPROCCESFILES} ${SOLUTIONCOPYFILES} ${MAKEFILES} + +$(TASKDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/task/' $< > $@ +$(SOLUTIONDIR)/Makefile: Makefile.in + sed -e 's/@@TASKSOL@@/sol/' $< > $@ + +${TASKPROCCESFILES}: $(PROCESSFILES) + mkdir -p $(TASKDIR)/ + cppp -USOLUTION $(notdir $@) $@ + +${SOLUTIONPROCCESFILES}: $(PROCESSFILES) + mkdir -p $(SOLUTIONDIR)/ + cppp -DSOLUTION $(notdir $@) $@ + + +${TASKCOPYFILES}: $(COPYFILES) + mkdir -p $(TASKDIR)/ + cp $(notdir $@) $@ + +${SOLUTIONCOPYFILES}: $(COPYFILES) + mkdir -p $(SOLUTIONDIR)/ + cp $(notdir $@) $@ + +%.ipynb: %.md + pandoc $< -o $@ + # add metadata so this is seen as python + jq -s '.[0] * .[1]' $@ $(IYPNB_TEMPLATE) | sponge $@ diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi.cpp b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi.cpp new file mode 100644 index 0000000..360e66a --- /dev/null +++ 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi.cpp @@ -0,0 +1,597 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#define MPI_CALL(call) \ + { \ + int mpi_status = call; \ + if (0 != mpi_status) { \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + int mpi_error_string_length = 0; \ + MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ + if (NULL != mpi_error_string) \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %s " \ + "(%d).\n", \ + #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ + else \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %d.\n", \ + #call, __LINE__, __FILE__, mpi_status); \ + } \ + } + +#include + +#ifdef USE_NVTX +#include + +const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, + 0x0000ffff, 0x00ff0000, 0x00ffffff}; +const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#include + +#define NCCL_CALL(call) \ + { \ + ncclResult_t ncclStatus = call; \ + if (ncclSuccess != ncclStatus) \ + fprintf(stderr, \ + "ERROR: NCCL call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, 
__LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#define NCCL_REAL_TYPE ncclDouble +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#define NCCL_REAL_TYPE ncclFloat +#endif + +constexpr real tol = 1.0e-8; + +const real PI = 2.0 * std::asin(1.0); + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny); + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream); + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print); + +template +T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { + T argval = default_val; + char** itr = std::find(begin, end, arg); + if (itr != end && ++itr != end) { + std::istringstream inbuf(*itr); + inbuf >> argval; + } + return argval; +} + +bool get_arg(char** begin, char** end, const std::string& arg) { + char** itr = std::find(begin, end, arg); + if (itr != end) { + return true; + } + return false; +} + +int main(int argc, char* argv[]) { + MPI_CALL(MPI_Init(&argc, &argv)); + int rank; + MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); + int size; + MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); + + ncclUniqueId nccl_uid; + if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid)); + MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD)); + + const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); + const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); + const int nx = get_argval(argv, argv + argc, "-nx", 16384); + const int ny = get_argval(argv, argv + argc, "-ny", 16384); + const bool csv 
= get_arg(argv, argv + argc, "-csv"); + + int local_rank = -1; + { + MPI_Comm local_comm; + MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, + &local_comm)); + + MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); + + MPI_CALL(MPI_Comm_free(&local_comm)); + } + + int num_devices = 0; + CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); + CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); + CUDA_RT_CALL(cudaFree(0)); + + ncclComm_t nccl_comm; + NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank)); + int nccl_version = 0; + NCCL_CALL(ncclGetVersion(&nccl_version)); + if ( nccl_version < 2800 ) { + fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n"); + NCCL_CALL(ncclCommDestroy(nccl_comm)); + MPI_CALL(MPI_Finalize()); + return 1; + } + + real* a_ref_h; + CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); + real* a_h; + CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); + double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); + + // ny - 2 rows are distributed amongst `size` ranks in such a way + // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
+ // This optimizes load balancing when (ny - 2) % size != 0 + int chunk_size; + int chunk_size_low = (ny - 2) / size; + int chunk_size_high = chunk_size_low + 1; + // To calculate the number of ranks that need to compute an extra row, + // the following formula is derived from this equation: + // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 + int num_ranks_low = size * chunk_size_low + size - + (ny - 2); // Number of ranks with chunk_size = chunk_size_low + if (rank < num_ranks_low) + chunk_size = chunk_size_low; + else + chunk_size = chunk_size_high; + + real* a; + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + real* a_new; + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); + + // Calculate local domain boundaries + int iy_start_global; // My start index in the global array + if (rank < num_ranks_low) { + iy_start_global = rank * chunk_size_low + 1; + } else { + iy_start_global = + num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; + } + int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array + + int iy_start = 1; + int iy_end = iy_start + chunk_size; + + // Set Dirichlet boundary conditions on left and right border + launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + int leastPriority = 0; + int greatestPriority = leastPriority; + CUDA_RT_CALL(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); + cudaStream_t compute_stream; + CUDA_RT_CALL(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, leastPriority)); + cudaStream_t push_stream; + CUDA_RT_CALL( + cudaStreamCreateWithPriority(&push_stream, cudaStreamDefault, greatestPriority)); + + cudaEvent_t
push_prep_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_prep_done, cudaEventDisableTiming)); + cudaEvent_t push_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_done, cudaEventDisableTiming)); + cudaEvent_t reset_l2norm_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2norm_done, cudaEventDisableTiming)); + + real* l2_norm_d; + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + real* l2_norm_h; + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + PUSH_RANGE("NCCL_Warmup", 5) + for (int i = 0; i < 10; ++i) { + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + std::swap(a_new, a); + } + POP_RANGE + + cudaGraphExec_t graph_exec[2]; + cudaGraphExec_t graph_calc_norm_exec[2]; + + PUSH_RANGE("Build graphs", 0) + { + // Need to capture 4 distinct graphs for the 4 possible execution flows, which are the permutations of + // "calculate norm, yes or no" and "update buffer `a` or `a_new`" + // Note that we use `std::swap` to swap the pointers at the end of the iteration, and graph capture records the raw + // address of the pointer at the time of the kernel launch. 
+ cudaGraph_t graphs[2][2]; + for (int iter = 0; iter < 4; ++iter) + { + const bool calculate_norm = (iter < 2); + //TODO: Begin capturing a graph in compute_stream +#ifdef SOLUTION + CUDA_RT_CALL(cudaStreamBeginCapture(compute_stream, cudaStreamCaptureModeGlobal)); +#endif + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + CUDA_RT_CALL(cudaEventRecord(reset_l2norm_done, compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, + push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, + push_stream); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + + // Apply periodic boundary conditions + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + compute_stream); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); + + const int is_even = iter % 2; + //TODO: End capturing `graphs[calculate_norm]+is_even` in compute_stream +#ifdef SOLUTION + CUDA_RT_CALL(cudaStreamEndCapture(compute_stream, 
graphs[calculate_norm]+is_even)); +#endif + + std::swap(a_new, a); + } + { + const bool calculate_norm = false; + //TODO: Instantiate graphs without norm calculation: What happens if cudaGraphInstantiateFlagUseNodePriority is **not** used? + // - Instantiate `graphs[calculate_norm][0]` to `graph_exec+0`. +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_exec+0, graphs[calculate_norm][0], cudaGraphInstantiateFlagUseNodePriority)); +#endif + // - Instantiate `graphs[calculate_norm][1]` to `graph_exec+1`. +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_exec+1, graphs[calculate_norm][1], cudaGraphInstantiateFlagUseNodePriority)); +#endif + } + { + const bool calculate_norm = true; + //TODO: Instantiate graphs with norm calculation: + // - Instantiate `graphs[calculate_norm][0]` to `graph_calc_norm_exec+0`. +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_calc_norm_exec+0, graphs[calculate_norm][0], cudaGraphInstantiateFlagUseNodePriority)); +#endif + // - Instantiate `graphs[calculate_norm][1]` to `graph_calc_norm_exec+1`. +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_calc_norm_exec+1, graphs[calculate_norm][1], cudaGraphInstantiateFlagUseNodePriority)); +#endif + } + // TODO: Destroy cudaGraph_t objects; they are no longer required after instantiation +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphDestroy(graphs[0][0])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[0][1])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[1][0])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[1][1])); +#endif + } + POP_RANGE + PUSH_RANGE("Graph upload", 0) + // TODO (Optional): Initiate upload of instantiated graphs to avoid the overhead of lazy upload on first launch.
+#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphUpload(graph_exec[0],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_exec[1],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_calc_norm_exec[0],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_calc_norm_exec[1],compute_stream)); +#endif + POP_RANGE + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (!csv && 0 == rank) { + printf( + "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " + "every %d iterations\n", + iter_max, ny, nx, nccheck); + } + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + const bool calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); + if (calculate_norm) { + // TODO: Launch `graph_calc_norm_exec[iter%2]` in `compute_stream` +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphLaunch(graph_calc_norm_exec[iter%2], compute_stream)); +#endif + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); + l2_norm = std::sqrt(l2_norm); + + if (!csv && 0 == rank && (iter % 100) == 0) { + printf("%5d, %0.6f\n", iter, l2_norm); + } + } else { + // TODO: Launch `graph_exec[iter%2]` in `compute_stream` +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphLaunch(graph_exec[iter%2], compute_stream)); +#endif + } + iter++; + } + CUDA_RT_CALL(cudaDeviceSynchronize()); + double stop = MPI_Wtime(); + POP_RANGE + + CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, + std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), + cudaMemcpyDeviceToHost)); + + int result_correct = 1; + for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { + for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { + if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { + fprintf(stderr, + "ERROR on rank 
%d: a[%d * %d + %d] = %f does not match %f " + "(reference)\n", + rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); + result_correct = 0; + } + } + } + + int global_result_correct = 1; + MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, + MPI_COMM_WORLD)); + result_correct = global_result_correct; + + if (rank == 0 && result_correct) { + if (csv) { + //TODO: Don't forget to change your output label from nccl_overlap to nccl_graph +#ifdef SOLUTION + printf("nccl_graph, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, +#else + printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, +#endif // SOLUTION + (stop - start), runtime_serial); + } else { + printf("Num GPUs: %d.\n", size); + printf( + "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " + "efficiency: %8.2f \n", + ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), + runtime_serial / (size * (stop - start)) * 100); + } + } + + // TODO: Destroy instantiated graphs +#ifdef SOLUTION + CUDA_RT_CALL(cudaGraphExecDestroy(graph_exec[1])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_exec[0])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_calc_norm_exec[1])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_calc_norm_exec[0])); +#endif + + CUDA_RT_CALL(cudaEventDestroy(reset_l2norm_done)); + CUDA_RT_CALL(cudaEventDestroy(push_done)); + CUDA_RT_CALL(cudaEventDestroy(push_prep_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + + CUDA_RT_CALL(cudaFreeHost(a_h)); + CUDA_RT_CALL(cudaFreeHost(a_ref_h)); + + NCCL_CALL(ncclCommDestroy(nccl_comm)); + + MPI_CALL(MPI_Finalize()); + return (result_correct == 1) ?
0 : 1; +} + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print) { + real* a; + real* a_new; + + cudaStream_t compute_stream; + cudaStream_t push_top_stream; + cudaStream_t push_bottom_stream; + cudaEvent_t compute_done; + cudaEvent_t push_top_done; + cudaEvent_t push_bottom_done; + + real* l2_norm_d; + real* l2_norm_h; + + int iy_start = 1; + int iy_end = (ny - 1); + + CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); + + // Set Dirichlet boundary conditions on left and right border + launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); + + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (print) + printf( + "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " + "norm " + "check every %d iterations\n", + iter_max, ny, nx, nccheck); + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); + +
calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, + compute_stream); + CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + // Apply periodic boundary conditions + + CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, push_top_stream)); + CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, compute_stream)); + CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + l2_norm = *l2_norm_h; + l2_norm = std::sqrt(l2_norm); + if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); + } + + std::swap(a_new, a); + iter++; + } + POP_RANGE + double stop = MPI_Wtime(); + + CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); + + CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); + CUDA_RT_CALL(cudaEventDestroy(push_top_done)); + CUDA_RT_CALL(cudaEventDestroy(compute_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); + CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + return (stop - start); +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi_kernels.cu 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi_kernels.cu new file mode 100644 index 0000000..dbc7dc9 --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/Using_CUDA_Graphs/jacobi_kernels.cu @@ -0,0 +1,109 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include + +#ifdef HAVE_CUB +#include +#endif // HAVE_CUB + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#endif + +__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, + const int my_ny, const int ny) { + for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { + const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); + a[iy * nx + 0] = y0; + a[iy * nx + (nx - 1)] = y0; + a_new[iy * nx + 0] = y0; + a_new[iy * nx + (nx - 1)] = y0; + } +} + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny) { + initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); + CUDA_RT_CALL(cudaGetLastError()); +} + +template +__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, + const int iy_end, const int nx, const bool calculate_norm) { +#ifdef HAVE_CUB + typedef cub::BlockReduce + BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; +#endif // HAVE_CUB + int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; + int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; + real local_l2_norm = 0.0; + + if (iy < iy_end && ix < (nx - 1)) { + const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + + a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); + a_new[iy * nx + ix] = new_val; + if (calculate_norm) { + 
real residue = new_val - a[iy * nx + ix]; + local_l2_norm += residue * residue; + } + } + if (calculate_norm) { +#ifdef HAVE_CUB + real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); + if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); +#else + atomicAdd(l2_norm, local_l2_norm); +#endif // HAVE_CUB + } +} + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream) { + constexpr int dim_block_x = 32; + constexpr int dim_block_y = 32; + dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, + ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); + jacobi_kernel<<>>( + a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); + CUDA_RT_CALL(cudaGetLastError()); +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk new file mode 100755 index 0000000..e693b2c --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/.master/copy.mk @@ -0,0 +1,6 @@ +#!/usr/bin/make -f +.PHONY: tasks Device-initiated_Communication_with_NVSHMEM Using_CUDA_Graphs +tasks: Device-initiated_Communication_with_NVSHMEM Using_CUDA_Graphs +Device-initiated_Communication_with_NVSHMEM Using_CUDA_Graphs: + @cd $@ && \ + ./copy.mk \ No newline at end of file diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb similarity index 76% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index 
00b04f9..56f5b8a 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", - "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", + "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -53,15 +53,15 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. 
This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use (see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -86,7 +86,7 @@ " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." ], - "id": "f7525123-132c-4d36-890e-9efe369db7be" + "id": "ab1bfa7b-226f-4465-9f2d-5376e65931e2" } ], "nbformat": 4, diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md similarity index 69% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md index bd8df5f..7432522 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,11 +1,11 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall 
Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 -## Hands-On 10: Device-initiated Communication with NVSHMEM +## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -41,7 +41,7 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. +The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. 
### Advanced Task: Use `nvshmemx_float_put_nbi_block` diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Makefile similarity index 75% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Makefile index 1d83127..374e98f 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Makefile +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. +THIS_TASK := 10H-NVSHMEM-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 @@ -10,6 +12,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -18,7 +21,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -35,10 +39,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - 
CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/jacobi.cu similarity index 100% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/solutions/jacobi.cu rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Device-initiated_Communication_with_NVSHMEM/jacobi.cu diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/.gitignore b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/.gitignore new file mode 100644 index 0000000..db6cccf --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/.gitignore @@ -0,0 +1,3 @@ +jacobi +*.o +*.nsys-rep diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb new file mode 100644 index 0000000..a09a97d --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.ipynb @@ 
-0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", + "- Program Link:\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", + "\n", + "## Hands-On 10B: Using CUDA Graphs\n", + "\n", + "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", + "\n", + "#### Description\n", + "\n", + "The purpose of this task is to introduce [CUDA\n", + "Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs).\n", + "For that, the NCCL version of the Jacobi solver developed in hands-on 8\n", + "is modified to use the [CUDA Graph Management\n", + "API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH)\n", + "so that the CUDA API calls required in the main solver loop are\n", + "minimized. 
You need to work on `TODOs` in `jacobi.cpp`:\n", + "\n", + "- Use\n", + " [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3)\n", + " and\n", + " [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e)\n", + " to create the necessary graphs.\n", + " - Read the comment at the top of the\n", + " `PUSH_RANGE(\"Build graphs\", 0)` structured block.\n", + "- Instantiate captured graphs with\n", + " [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233).\n", + " - Extra: Experiment (compare Nsight Systems timelines) with\n", + " [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff)\n", + " not using `cudaGraphInstantiateFlagUseNodePriority`.\n", + "- Optional: Manually upload instantiated graphs with\n", + " [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd).\n", + "- Use\n", + " [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597)\n", + " to launch a single graph per iteration instead of launching multiple\n", + " kernels in different streams and managing their dependencies with\n", + " `cudaEventRecord` and `cudaStreamWaitEvent`.\n", + "- Free resources with\n", + " [`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069)\n", + " and\n", + " 
[`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f).\n", + "\n", + "Compile with\n", + "\n", + "``` bash\n", + "make\n", + "```\n", + "\n", + "Submit your compiled application to the batch system with\n", + "\n", + "``` bash\n", + "make run\n", + "```\n", + "\n", + "Study the performance by glimpsing at the profile generated with\n", + "`make profile`. For `make run` and `make profile` the environment\n", + "variable `NP` can be set to change the number of processes." + ], + "id": "b5b4c6d4-656b-44dc-97c2-b51e226d522f" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + } +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md new file mode 100644 index 0000000..f0fa5be --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Instructions.md @@ -0,0 +1,40 @@ +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA +- Program Link: + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 + +## Hands-On 10B: Using CUDA Graphs + +### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication + +#### Description + +The purpose of this task is to introduce [CUDA 
Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs). +For that, the NCCL version of the Jacobi solver developed in hands-on 8 is modified to use the +[CUDA Graph Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH) +so that the CUDA API calls required in the main solver loop are minimized. You need to work on `TODOs` in `jacobi.cpp`: + +- Use [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3) and [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e) to create the necessary graphs. + - Read the comment at the top of the `PUSH_RANGE("Build graphs", 0)` structured block. +- Instantiate captured graphs with [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233). + - Extra: Experiment (compare Nsight Systems timelines) with [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff) not using `cudaGraphInstantiateFlagUseNodePriority`. +- Optional: Manually upload instantiated graphs with [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd). +- Use [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597) to launch a single graph per iteration instead of launching multiple kernels in different streams and managing their dependencies with `cudaEventRecord` and `cudaStreamWaitEvent`. 
+- Free resources with [`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069) and [`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f). + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. diff --git a/08-H_NCCL_NVSHMEM/.master/NCCL/Makefile b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Makefile similarity index 71% rename from 08-H_NCCL_NVSHMEM/.master/NCCL/Makefile rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Makefile index 2ba58cf..06003e2 100644 --- a/08-H_NCCL_NVSHMEM/.master/NCCL/Makefile +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/Makefile @@ -1,10 +1,13 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 10H-Graphs-sol +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 1 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 MPICXX=mpicxx CUDA_HOME ?= /usr/local/cuda NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -13,7 +16,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -33,10 +37,10 @@ clean: rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx --cuda-graph-trace=node -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi.cpp b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi.cpp new file mode 100644 index 0000000..f692d4d --- /dev/null +++ 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi.cpp @@ -0,0 +1,571 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#define MPI_CALL(call) \ + { \ + int mpi_status = call; \ + if (0 != mpi_status) { \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + int mpi_error_string_length = 0; \ + MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ + if (NULL != mpi_error_string) \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %s " \ + "(%d).\n", \ + #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ + else \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %d.\n", \ + #call, __LINE__, __FILE__, mpi_status); \ + } \ + } + +#include + +#ifdef USE_NVTX +#include + +const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, + 0x0000ffff, 0x00ff0000, 0x00ffffff}; +const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#include + +#define NCCL_CALL(call) \ + { \ + ncclResult_t ncclStatus = call; \ + if (ncclSuccess != ncclStatus) \ + fprintf(stderr, \ + "ERROR: NCCL call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, 
__LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#define NCCL_REAL_TYPE ncclDouble +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#define NCCL_REAL_TYPE ncclFloat +#endif + +constexpr real tol = 1.0e-8; + +const real PI = 2.0 * std::asin(1.0); + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny); + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream); + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print); + +template +T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { + T argval = default_val; + char** itr = std::find(begin, end, arg); + if (itr != end && ++itr != end) { + std::istringstream inbuf(*itr); + inbuf >> argval; + } + return argval; +} + +bool get_arg(char** begin, char** end, const std::string& arg) { + char** itr = std::find(begin, end, arg); + if (itr != end) { + return true; + } + return false; +} + +int main(int argc, char* argv[]) { + MPI_CALL(MPI_Init(&argc, &argv)); + int rank; + MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); + int size; + MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); + + ncclUniqueId nccl_uid; + if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid)); + MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD)); + + const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); + const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); + const int nx = get_argval(argv, argv + argc, "-nx", 16384); + const int ny = get_argval(argv, argv + argc, "-ny", 16384); + const bool csv 
= get_arg(argv, argv + argc, "-csv"); + + int local_rank = -1; + { + MPI_Comm local_comm; + MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, + &local_comm)); + + MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); + + MPI_CALL(MPI_Comm_free(&local_comm)); + } + + int num_devices = 0; + CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); + CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); + CUDA_RT_CALL(cudaFree(0)); + + ncclComm_t nccl_comm; + NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank)); + int nccl_version = 0; + NCCL_CALL(ncclGetVersion(&nccl_version)); + if ( nccl_version < 2800 ) { + fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n"); + NCCL_CALL(ncclCommDestroy(nccl_comm)); + MPI_CALL(MPI_Finalize()); + return 1; + } + + real* a_ref_h; + CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); + real* a_h; + CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); + double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); + + // ny - 2 rows are distributed amongst `size` ranks in such a way + // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
+ // This optimizes load balancing when (ny - 2) % size != 0 + int chunk_size; + int chunk_size_low = (ny - 2) / size; + int chunk_size_high = chunk_size_low + 1; + // To calculate the number of ranks that need to compute an extra row, + // the following formula is derived from this equation: + // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 + int num_ranks_low = size * chunk_size_low + size - + (ny - 2); // Number of ranks with chunk_size = chunk_size_low + if (rank < num_ranks_low) + chunk_size = chunk_size_low; + else + chunk_size = chunk_size_high; + + real* a; + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + real* a_new; + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); + + // Calculate local domain boundaries + int iy_start_global; // My start index in the global array + if (rank < num_ranks_low) { + iy_start_global = rank * chunk_size_low + 1; + } else { + iy_start_global = + num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; + } + int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array + + int iy_start = 1; + int iy_end = iy_start + chunk_size; + + // Set diriclet boundary conditions on left and right boarder + launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + int leastPriority = 0; + int greatestPriority = leastPriority; + CUDA_RT_CALL(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); + cudaStream_t compute_stream; + CUDA_RT_CALL(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, leastPriority)); + cudaStream_t push_stream; + CUDA_RT_CALL( + cudaStreamCreateWithPriority(&push_stream, cudaStreamDefault, greatestPriority)); + + cudaEvent_t 
push_prep_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_prep_done, cudaEventDisableTiming)); + cudaEvent_t push_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_done, cudaEventDisableTiming)); + cudaEvent_t reset_l2norm_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2norm_done, cudaEventDisableTiming)); + + real* l2_norm_d; + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + real* l2_norm_h; + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + PUSH_RANGE("NCCL_Warmup", 5) + for (int i = 0; i < 10; ++i) { + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + std::swap(a_new, a); + } + POP_RANGE + + cudaGraphExec_t graph_exec[2]; + cudaGraphExec_t graph_calc_norm_exec[2]; + + PUSH_RANGE("Build graphs", 0) + { + // Need to capture 4 distinct graphs for the 4 possible execution flows, which are the permutations of + // "calculate norm, yes or no" and "update buffer `a` or `a_new`" + // Note that we use `std::swap` to swap the pointers at the end of the iteration, and graph capture records the raw + // address of the pointer at the time of the kernel launch. 
+ cudaGraph_t graphs[2][2]; + for (int iter = 0; iter < 4; ++iter) + { + const bool calculate_norm = (iter < 2); + //TODO: Begin capturing a graph in compute_stream + CUDA_RT_CALL(cudaStreamBeginCapture(compute_stream, cudaStreamCaptureModeGlobal)); + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + CUDA_RT_CALL(cudaEventRecord(reset_l2norm_done, compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, + push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, + push_stream); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + + // Apply periodic boundary conditions + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + compute_stream); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); + + const int is_even = iter % 2; + //TODO: End capturing `graphs[calculate_norm]+is_even` in compute_stream + CUDA_RT_CALL(cudaStreamEndCapture(compute_stream, graphs[calculate_norm]+is_even)); + + std::swap(a_new, a); + } + { + 
const bool calculate_norm = false; + //TODO: Instantiate graphs without norm calculation: What happens if cudaGraphInstantiateFlagUseNodePriority is **not** used? + // - Instantiate `graphs[calculate_norm][0]` to `graph_exec+0`. + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_exec+0, graphs[calculate_norm][0], cudaGraphInstantiateFlagUseNodePriority)); + // - Instantiate `graphs[calculate_norm][1]` to `graph_exec+1`. + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_exec+1, graphs[calculate_norm][1], cudaGraphInstantiateFlagUseNodePriority)); + } + { + const bool calculate_norm = true; + //TODO: Instantiate graphs with norm calculation: + // - Instantiate `graphs[calculate_norm][0]` to `graph_calc_norm_exec+0`. + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_calc_norm_exec+0, graphs[calculate_norm][0], cudaGraphInstantiateFlagUseNodePriority)); + // - Instantiate `graphs[calculate_norm][1]` to `graph_calc_norm_exec+1`. + CUDA_RT_CALL(cudaGraphInstantiateWithFlags(graph_calc_norm_exec+1, graphs[calculate_norm][1], cudaGraphInstantiateFlagUseNodePriority)); + } + // TODO: Destroy cudaGraph_t objects; they are no longer required after instantiation + CUDA_RT_CALL(cudaGraphDestroy(graphs[0][0])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[0][1])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[1][0])); + CUDA_RT_CALL(cudaGraphDestroy(graphs[1][1])); + } + POP_RANGE + PUSH_RANGE("Graph upload", 0) + // TODO (Optional): Initiate upload of instantiated graphs to avoid the overhead of lazy upload on first launch. 
+ CUDA_RT_CALL(cudaGraphUpload(graph_exec[0],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_exec[1],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_calc_norm_exec[0],compute_stream)); + CUDA_RT_CALL(cudaGraphUpload(graph_calc_norm_exec[1],compute_stream)); + POP_RANGE + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (!csv && 0 == rank) { + printf( + "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " + "every %d iterations\n", + iter_max, ny, nx, nccheck); + } + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + const bool calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); + if (calculate_norm) { + // TODO: Launch `graph_calc_norm_exec[iter%2]` in `compute_stream` + CUDA_RT_CALL(cudaGraphLaunch(graph_calc_norm_exec[iter%2], compute_stream)); + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); + l2_norm = std::sqrt(l2_norm); + + if (!csv && 0 == rank && (iter % 100) == 0) { + printf("%5d, %0.6f\n", iter, l2_norm); + } + } else { + // TODO: Launch `graph_exec[iter%2]` in `compute_stream` + CUDA_RT_CALL(cudaGraphLaunch(graph_exec[iter%2], compute_stream)); + } + iter++; + } + CUDA_RT_CALL(cudaDeviceSynchronize()); + double stop = MPI_Wtime(); + POP_RANGE + + CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, + std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), + cudaMemcpyDeviceToHost)); + + int result_correct = 1; + for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { + for (int ix = 1; result_correct && (ix < (nx - 1)); ++ix) { + if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { + fprintf(stderr, + "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " + "(reference)\n", + rank, iy, 
nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); + result_correct = 0; + } + } + } + + int global_result_correct = 1; + MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, + MPI_COMM_WORLD)); + result_correct = global_result_correct; + + if (rank == 0 && result_correct) { + if (csv) { + //TODO: Don't forget to change your output label from nccl_overlap to nccl_graph + printf("nccl_graph, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + (stop - start), runtime_serial); + } else { + printf("Num GPUs: %d.\n", size); + printf( + "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " + "efficiency: %8.2f \n", + ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), + runtime_serial / (size * (stop - start)) * 100); + } + } + + // TODO: Destroy instantiated graphs + CUDA_RT_CALL(cudaGraphExecDestroy(graph_exec[1])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_exec[0])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_calc_norm_exec[1])); + CUDA_RT_CALL(cudaGraphExecDestroy(graph_calc_norm_exec[0])); + + CUDA_RT_CALL(cudaEventDestroy(reset_l2norm_done)); + CUDA_RT_CALL(cudaEventDestroy(push_done)); + CUDA_RT_CALL(cudaEventDestroy(push_prep_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + + CUDA_RT_CALL(cudaFreeHost(a_h)); + CUDA_RT_CALL(cudaFreeHost(a_ref_h)); + + NCCL_CALL(ncclCommDestroy(nccl_comm)); + + MPI_CALL(MPI_Finalize()); + return (result_correct == 1) ? 
0 : 1; +} + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print) { + real* a; + real* a_new; + + cudaStream_t compute_stream; + cudaStream_t push_top_stream; + cudaStream_t push_bottom_stream; + cudaEvent_t compute_done; + cudaEvent_t push_top_done; + cudaEvent_t push_bottom_done; + + real* l2_norm_d; + real* l2_norm_h; + + int iy_start = 1; + int iy_end = (ny - 1); + + CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); + + // Set Dirichlet boundary conditions on left and right border + launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); + + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (print) + printf( + "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " + "norm " + "check every %d iterations\n", + iter_max, ny, nx, nccheck); + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); + + 
calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, + compute_stream); + CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + // Apply periodic boundary conditions + + CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, push_top_stream)); + CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, compute_stream)); + CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + l2_norm = *l2_norm_h; + l2_norm = std::sqrt(l2_norm); + if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); + } + + std::swap(a_new, a); + iter++; + } + POP_RANGE + double stop = MPI_Wtime(); + + CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); + + CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); + CUDA_RT_CALL(cudaEventDestroy(push_top_done)); + CUDA_RT_CALL(cudaEventDestroy(compute_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); + CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + return (stop - start); +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi_kernels.cu 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi_kernels.cu new file mode 100644 index 0000000..dbc7dc9 --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/solutions/Using_CUDA_Graphs/jacobi_kernels.cu @@ -0,0 +1,109 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include + +#ifdef HAVE_CUB +#include +#endif // HAVE_CUB + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#endif + +__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, + const int my_ny, const int ny) { + for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { + const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); + a[iy * nx + 0] = y0; + a[iy * nx + (nx - 1)] = y0; + a_new[iy * nx + 0] = y0; + a_new[iy * nx + (nx - 1)] = y0; + } +} + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny) { + initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); + CUDA_RT_CALL(cudaGetLastError()); +} + +template +__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, + const int iy_end, const int nx, const bool calculate_norm) { +#ifdef HAVE_CUB + typedef cub::BlockReduce + BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; +#endif // HAVE_CUB + int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; + int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; + real local_l2_norm = 0.0; + + if (iy < iy_end && ix < (nx - 1)) { + const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + + a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); + a_new[iy * nx + ix] = new_val; + if (calculate_norm) { + 
real residue = new_val - a[iy * nx + ix]; + local_l2_norm += residue * residue; + } + } + if (calculate_norm) { +#ifdef HAVE_CUB + real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); + if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); +#else + atomicAdd(l2_norm, local_l2_norm); +#endif // HAVE_CUB + } +} + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream) { + constexpr int dim_block_x = 32; + constexpr int dim_block_y = 32; + dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, + ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); + jacobi_kernel<<>>( + a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); + CUDA_RT_CALL(cudaGetLastError()); +} diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb similarity index 76% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb index 00b04f9..56f5b8a 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/solutions/Instructions.ipynb +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", "\n", - "- Time: Sunday, 29 May 2022 9AM - 6PM CEST\n", - "- Location: Hall Y6, Congress Center Hamburg (CCH)\n", + "- Sunday, November 17, 2024 8:30 AM to 
5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", "- Program Link:\n", - " https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", "\n", - "## Hands-On 10: Device-initiated Communication with NVSHMEM\n", + "## Hands-On 10A: Device-initiated Communication with NVSHMEM\n", "\n", "### Task: Using NVSHMEM device API\n", "\n", @@ -53,15 +53,15 @@ "\n", "#### Note\n", "\n", - "The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES`\n", - "automatically so that each spawned process only sees the GPU it should\n", - "use (see [GPU\n", - "Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices)\n", - "in the JUWELS Booster Overview documentation). This is not supported for\n", - "NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled\n", - "by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes\n", - "srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all\n", - "GPUs listed. This is automatically done for the `sanitize`, `run` and\n", + "The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically\n", + "so that each spawned process only sees the GPU it should use (see\n", + "[Hardware\n", + "Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration)\n", + "in the JEDI user documentation). This is not supported for NVSHMEM. The\n", + "automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting\n", + "`CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With\n", + "`CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs\n", + "listed. 
This is automatically done for the `sanitize`, `run` and\n", "`profile` make targets.\n", "\n", "### Advanced Task: Use `nvshmemx_float_put_nbi_block`\n", @@ -86,7 +86,7 @@ " repository](https://github.com/NVIDIA/multi-gpu-programming-models)\n", " implements the same strategy." ], - "id": "f7525123-132c-4d36-890e-9efe369db7be" + "id": "ab1bfa7b-226f-4465-9f2d-5376e65931e2" } ], "nbformat": 4, diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md similarity index 69% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md index bd8df5f..7432522 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/Instructions.md +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Instructions.md @@ -1,11 +1,11 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale -- Time: Sunday, 29 May 2022 9AM - 6PM CEST -- Location: Hall Y6, Congress Center Hamburg (CCH) +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA - Program Link: - https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2 + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 -## Hands-On 10: Device-initiated Communication with NVSHMEM +## Hands-On 10A: Device-initiated Communication with NVSHMEM ### Task: Using NVSHMEM device API @@ -41,7 +41,7 @@ Study the performance by glimpsing at the profile generated with #### Note -The Slurm installation on JUWELS-Booster sets `CUDA_VISIBLE_DEVICES` automatically so that each 
spawned process only sees the GPU it should use (see [GPU Devices](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html#gpu-devices) in the JUWELS Booster Overview documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. +The Slurm installation on JEDI sets `CUDA_VISIBLE_DEVICES` automatically so that each spawned process only sees the GPU it should use (see [Hardware Configuration](https://apps.fz-juelich.de/jsc/hps/jedi/configuration.html#hardware-configuration) in the JEDI user documentation). This is not supported for NVSHMEM. The automatic setting of `CUDA_VISIBLE_DEVICES` can be disabled by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` in the shell that executes srun. With `CUDA_VISIBLE_DEVICES` set all spawned processes can see all GPUs listed. This is automatically done for the `sanitize`, `run` and `profile` make targets. ### Advanced Task: Use `nvshmemx_float_put_nbi_block` diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Makefile similarity index 75% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Makefile index 1d83127..687a121 100644 --- a/10-H_Device-initiated_Communication_with_NVSHMEM/.master/Makefile +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/Makefile @@ -1,4 +1,6 @@ -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2024, NVIDIA CORPORATION. 
All rights reserved. +THIS_TASK := 10H-NVSHMEM-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') NP ?= 4 NVCC=nvcc JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 @@ -10,6 +12,7 @@ endif ifndef MPI_HOME $(error MPI_HOME is not set) endif +_JSCCOURSE_GPU_ARCH?=80 GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 @@ -18,7 +21,8 @@ GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 -GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) ifdef DISABLE_CUB NVCC_FLAGS = -Xptxas --optimize-float-atomics else @@ -35,10 +39,10 @@ clean: rm -f jacobi jacobi.o *.nsys-rep jacobi.*.compute-sanitizer.log sanitize: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file jacobi.%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 run: jacobi CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi profile: jacobi - CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o jacobi.%q{SLURM_PROCID} ./jacobi -niter 10 + CUDA_VISIBLE_DEVICES=$(C_V_D) $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/jacobi.cu similarity index 100% rename from 10-H_Device-initiated_Communication_with_NVSHMEM/tasks/jacobi.cu rename to 10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Device-initiated_Communication_with_NVSHMEM/jacobi.cu diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/.gitignore b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/.gitignore new file mode 100644 index 0000000..db6cccf --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/.gitignore @@ -0,0 +1,3 @@ +jacobi +*.o +*.nsys-rep diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb new file mode 100644 index 0000000..a09a97d --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale\n", + "\n", + "- Sunday, November 17, 2024 8:30 AM to 5:30 PM\n", + "- Location: B211, Atlanta Convention Center, Georgia, USA\n", + "- Program Link:\n", + " https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412\n", + "\n", + "## Hands-On 10B: Using CUDA Graphs\n", + "\n", + "### Task: Combining CUDA Graphs with NCCL for Inter-GPU Communication\n", + "\n", + "#### Description\n", + "\n", + "The purpose of this task is to introduce [CUDA\n", + "Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs).\n", + "For that, the NCCL version of the Jacobi solver developed in hands-on 8\n", + "is 
modified to use the [CUDA Graph Management\n", + "API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH)\n", + "so that the CUDA API calls required in the main solver loop are\n", + "minimized. You need to work on `TODOs` in `jacobi.cpp`:\n", + "\n", + "- Use\n", + " [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3)\n", + " and\n", + " [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e)\n", + " to create the necessary graphs.\n", + " - Read the comment at the top of the\n", + " `PUSH_RANGE(\"Build graphs\", 0)` structured block.\n", + "- Instantiate captured graphs with\n", + " [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233).\n", + " - Extra: Experiment (compare Nsight Systems timelines) with\n", + " [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff)\n", + " not using `cudaGraphInstantiateFlagUseNodePriority`.\n", + "- Optional: Manually upload instantiated graphs with\n", + " [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd).\n", + "- Use\n", + " [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597)\n", + " to launch a single graph per iteration instead of launching multiple\n", + " kernels in different streams and managing their dependencies with\n", + " `cudaEventRecord` and `cudaStreamWaitEvent`.\n", + "- Free resources with\n", + " 
[`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069)\n", + " and\n", + " [`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f).\n", + "\n", + "Compile with\n", + "\n", + "``` bash\n", + "make\n", + "```\n", + "\n", + "Submit your compiled application to the batch system with\n", + "\n", + "``` bash\n", + "make run\n", + "```\n", + "\n", + "Study the performance by glimpsing at the profile generated with\n", + "`make profile`. For `make run` and `make profile` the environment\n", + "variable `NP` can be set to change the number of processes." + ], + "id": "b5b4c6d4-656b-44dc-97c2-b51e226d522f" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + } +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md new file mode 100644 index 0000000..f0fa5be --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Instructions.md @@ -0,0 +1,40 @@ +# SC24 Tutorial: Efficient Distributed GPU Programming for Exascale + +- Sunday, November 17, 2024 8:30 AM to 5:30 PM +- Location: B211, Atlanta Convention Center, Georgia, USA +- Program Link: + https://sc24.conference-program.com/presentation/?id=tut123&sess=sess412 + +## Hands-On 10B: Using CUDA Graphs + +### Task: Combining CUDA Graphs with NCCL for Inter-GPU 
Communication + +#### Description + +The purpose of this task is to introduce [CUDA Graphs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cuda-graphs). +For that, the NCCL version of the Jacobi solver developed in hands-on 8 is modified to use the +[CUDA Graph Management API](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH) +so that the CUDA API calls required in the main solver loop are minimized. You need to work on `TODOs` in `jacobi.cpp`: + +- Use [`cudaStreamBeginCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g793d7d4e474388ddfda531603dc34aa3) and [`cudaStreamEndCapture`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1gf5a0efebc818054ceecd1e3e5e76d93e) to create the necessary graphs. + - Read the comment at the top of the `PUSH_RANGE("Build graphs", 0)` structured block. +- Instantiate captured graphs with [`cudaGraphInstantiateWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233). + - Extra: Experiment (compare Nsight Systems timelines) with [`cudaGraphInstantiate`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1gb25beab33abe4b2d13edbb6e35cb72ff) not using `cudaGraphInstantiateFlagUseNodePriority`. +- Optional: Manually upload instantiated graphs with [`cudaGraphUpload`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ge546432e411b4495b93bdcbf2fc0b2bd). +- Use [`cudaGraphLaunch`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597) to launch a single graph per iteration instead of launching multiple kernels in different streams and managing their dependencies with `cudaEventRecord` and `cudaStreamWaitEvent`. 
+- Free resources with [`cudaGraphDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga351557d4d9ecab23d56395599b0e069) and [`cudaGraphExecDestroy`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g6d101c2cbc6dea2b4fba0fbe407eb91f). + +Compile with + +``` {.bash} +make +``` + +Submit your compiled application to the batch system with + +``` {.bash} +make run +``` + +Study the performance by glimpsing at the profile generated with +`make profile`. For `make run` and `make profile` the environment variable `NP` can be set to change the number of processes. diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Makefile b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Makefile new file mode 100644 index 0000000..ca0ae2f --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/Makefile @@ -0,0 +1,46 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. 
+THIS_TASK := 10H-Graphs-task +OUTPUT_NAME := jacobi.$(THIS_TASK)__$(shell date '+%Y%m%d-%H%M') +NP ?= 1 +NVCC=nvcc +JSC_SUBMIT_CMD ?= srun --cpu-bind=socket --gres=gpu:4 --ntasks-per-node 4 +MPICXX=mpicxx +CUDA_HOME ?= /usr/local/cuda +NCCL_HOME ?= /usr +_JSCCOURSE_GPU_ARCH?=80 +GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 +GENCODE_SM35 := -gencode arch=compute_35,code=sm_35 +GENCODE_SM37 := -gencode arch=compute_37,code=sm_37 +GENCODE_SM50 := -gencode arch=compute_50,code=sm_50 +GENCODE_SM52 := -gencode arch=compute_52,code=sm_52 +GENCODE_SM60 := -gencode arch=compute_60,code=sm_60 +GENCODE_SM70 := -gencode arch=compute_70,code=sm_70 +GENCODE_SM80 := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80 +GENCODE_SM90 := -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 +GENCODE_FLAGS := $(GENCODE_SM$(_JSCCOURSE_GPU_ARCH)) +ifdef DISABLE_CUB + NVCC_FLAGS = -Xptxas --optimize-float-atomics +else + NVCC_FLAGS = -DHAVE_CUB +endif +NVCC_FLAGS += -lineinfo $(GENCODE_FLAGS) -std=c++14 +MPICXX_FLAGS = -DUSE_NVTX -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include -std=c++14 +LD_FLAGS = -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt -lnccl +jacobi: Makefile jacobi.cpp jacobi_kernels.o + $(MPICXX) $(MPICXX_FLAGS) jacobi.cpp jacobi_kernels.o $(LD_FLAGS) -o jacobi + +jacobi_kernels.o: Makefile jacobi_kernels.cu + $(NVCC) $(NVCC_FLAGS) jacobi_kernels.cu -c + +.PHONY.: clean +clean: + rm -f jacobi jacobi_kernels.o *.nsys-rep jacobi.*.compute-sanitizer.log + +sanitize: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) compute-sanitizer --log-file $(OUTPUT_NAME).%q{SLURM_PROCID}.compute-sanitizer.log ./jacobi -niter 10 + +run: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) ./jacobi + +profile: jacobi + $(JSC_SUBMIT_CMD) -n $(NP) nsys profile --trace=mpi,cuda,nvtx --cuda-graph-trace=node -o $(OUTPUT_NAME).%q{SLURM_PROCID} ./jacobi -niter 10 diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi.cpp 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi.cpp new file mode 100644 index 0000000..b9197e7 --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi.cpp @@ -0,0 +1,551 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include + +#include +#define MPI_CALL(call) \ + { \ + int mpi_status = call; \ + if (0 != mpi_status) { \ + char mpi_error_string[MPI_MAX_ERROR_STRING]; \ + int mpi_error_string_length = 0; \ + MPI_Error_string(mpi_status, mpi_error_string, &mpi_error_string_length); \ + if (NULL != mpi_error_string) \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %s " \ + "(%d).\n", \ + #call, __LINE__, __FILE__, mpi_error_string, mpi_status); \ + else \ + fprintf(stderr, \ + "ERROR: MPI call \"%s\" in line %d of file %s failed " \ + "with %d.\n", \ + #call, __LINE__, __FILE__, mpi_status); \ + } \ + } + +#include + +#ifdef USE_NVTX +#include + +const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, + 0x0000ffff, 0x00ff0000, 0x00ffffff}; +const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#include + +#define NCCL_CALL(call) \ + { \ + ncclResult_t ncclStatus = call; \ + if (ncclSuccess != ncclStatus) \ + fprintf(stderr, \ + "ERROR: NCCL call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, 
__LINE__, __FILE__, ncclGetErrorString(ncclStatus), ncclStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#define NCCL_REAL_TYPE ncclDouble +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#define NCCL_REAL_TYPE ncclFloat +#endif + +constexpr real tol = 1.0e-8; + +const real PI = 2.0 * std::asin(1.0); + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny); + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream); + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print); + +template +T get_argval(char** begin, char** end, const std::string& arg, const T default_val) { + T argval = default_val; + char** itr = std::find(begin, end, arg); + if (itr != end && ++itr != end) { + std::istringstream inbuf(*itr); + inbuf >> argval; + } + return argval; +} + +bool get_arg(char** begin, char** end, const std::string& arg) { + char** itr = std::find(begin, end, arg); + if (itr != end) { + return true; + } + return false; +} + +int main(int argc, char* argv[]) { + MPI_CALL(MPI_Init(&argc, &argv)); + int rank; + MPI_CALL(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); + int size; + MPI_CALL(MPI_Comm_size(MPI_COMM_WORLD, &size)); + + ncclUniqueId nccl_uid; + if (rank == 0) NCCL_CALL(ncclGetUniqueId(&nccl_uid)); + MPI_CALL(MPI_Bcast(&nccl_uid, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD)); + + const int iter_max = get_argval(argv, argv + argc, "-niter", 1000); + const int nccheck = get_argval(argv, argv + argc, "-nccheck", 1); + const int nx = get_argval(argv, argv + argc, "-nx", 16384); + const int ny = get_argval(argv, argv + argc, "-ny", 16384); + const bool csv 
= get_arg(argv, argv + argc, "-csv"); + + int local_rank = -1; + { + MPI_Comm local_comm; + MPI_CALL(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, + &local_comm)); + + MPI_CALL(MPI_Comm_rank(local_comm, &local_rank)); + + MPI_CALL(MPI_Comm_free(&local_comm)); + } + + int num_devices = 0; + CUDA_RT_CALL(cudaGetDeviceCount(&num_devices)); + CUDA_RT_CALL(cudaSetDevice(local_rank%num_devices)); + CUDA_RT_CALL(cudaFree(0)); + + ncclComm_t nccl_comm; + NCCL_CALL(ncclCommInitRank(&nccl_comm, size, nccl_uid, rank)); + int nccl_version = 0; + NCCL_CALL(ncclGetVersion(&nccl_version)); + if ( nccl_version < 2800 ) { + fprintf(stderr,"ERROR NCCL 2.8 or newer is required.\n"); + NCCL_CALL(ncclCommDestroy(nccl_comm)); + MPI_CALL(MPI_Finalize()); + return 1; + } + + real* a_ref_h; + CUDA_RT_CALL(cudaMallocHost(&a_ref_h, nx * ny * sizeof(real))); + real* a_h; + CUDA_RT_CALL(cudaMallocHost(&a_h, nx * ny * sizeof(real))); + double runtime_serial = single_gpu(nx, ny, iter_max, a_ref_h, nccheck, !csv && (0 == rank)); + + // ny - 2 rows are distributed amongst `size` ranks in such a way + // that each rank gets either (ny - 2) / size or (ny - 2) / size + 1 rows. 
+ // This optimizes load balancing when (ny - 2) % size != 0 + int chunk_size; + int chunk_size_low = (ny - 2) / size; + int chunk_size_high = chunk_size_low + 1; + // To calculate the number of ranks that need to compute an extra row, + // the following formula is derived from this equation: + // num_ranks_low * chunk_size_low + (size - num_ranks_low) * (chunk_size_low + 1) = ny - 2 + int num_ranks_low = size * chunk_size_low + size - + (ny - 2); // Number of ranks with chunk_size = chunk_size_low + if (rank < num_ranks_low) + chunk_size = chunk_size_low; + else + chunk_size = chunk_size_high; + + real* a; + CUDA_RT_CALL(cudaMalloc(&a, nx * (chunk_size + 2) * sizeof(real))); + real* a_new; + CUDA_RT_CALL(cudaMalloc(&a_new, nx * (chunk_size + 2) * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * (chunk_size + 2) * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * (chunk_size + 2) * sizeof(real))); + + // Calculate local domain boundaries + int iy_start_global; // My start index in the global array + if (rank < num_ranks_low) { + iy_start_global = rank * chunk_size_low + 1; + } else { + iy_start_global = + num_ranks_low * chunk_size_low + (rank - num_ranks_low) * chunk_size_high + 1; + } + int iy_end_global = iy_start_global + chunk_size - 1; // My last index in the global array + + int iy_start = 1; + int iy_end = iy_start + chunk_size; + + // Set diriclet boundary conditions on left and right boarder + launch_initialize_boundaries(a, a_new, PI, iy_start_global - 1, nx, (chunk_size + 2), ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + int leastPriority = 0; + int greatestPriority = leastPriority; + CUDA_RT_CALL(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority)); + cudaStream_t compute_stream; + CUDA_RT_CALL(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, leastPriority)); + cudaStream_t push_stream; + CUDA_RT_CALL( + cudaStreamCreateWithPriority(&push_stream, cudaStreamDefault, greatestPriority)); + + cudaEvent_t 
push_prep_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_prep_done, cudaEventDisableTiming)); + cudaEvent_t push_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_done, cudaEventDisableTiming)); + cudaEvent_t reset_l2norm_done; + CUDA_RT_CALL(cudaEventCreateWithFlags(&reset_l2norm_done, cudaEventDisableTiming)); + + real* l2_norm_d; + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + real* l2_norm_h; + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + PUSH_RANGE("NCCL_Warmup", 5) + for (int i = 0; i < 10; ++i) { + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, compute_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, compute_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + std::swap(a_new, a); + } + POP_RANGE + + cudaGraphExec_t graph_exec[2]; + cudaGraphExec_t graph_calc_norm_exec[2]; + + PUSH_RANGE("Build graphs", 0) + { + // Need to capture 4 distinct graphs for the 4 possible execution flows, which are the permutations of + // "calculate norm, yes or no" and "update buffer `a` or `a_new`" + // Note that we use `std::swap` to swap the pointers at the end of the iteration, and graph capture records the raw + // address of the pointer at the time of the kernel launch. 
+ cudaGraph_t graphs[2][2]; + for (int iter = 0; iter < 4; ++iter) + { + const bool calculate_norm = (iter < 2); + //TODO: Begin capturing a graph in compute_stream + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + CUDA_RT_CALL(cudaEventRecord(reset_l2norm_done, compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0)); + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm, + push_stream); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_end - 1), iy_end, nx, calculate_norm, + push_stream); + CUDA_RT_CALL(cudaEventRecord(push_prep_done, push_stream)); + + const int top = rank > 0 ? rank - 1 : (size - 1); + const int bottom = (rank + 1) % size; + + // Apply periodic boundary conditions + NCCL_CALL(ncclGroupStart()); + NCCL_CALL(ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclRecv(a_new + (iy_end * nx), nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream)); + NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream)); + NCCL_CALL(ncclGroupEnd()); + CUDA_RT_CALL(cudaEventRecord(push_done, push_stream)); + + launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm, + compute_stream); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_prep_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_done, 0)); + + const int is_even = iter % 2; + //TODO: End capturing `graphs[calculate_norm]+is_even` in compute_stream + + std::swap(a_new, a); + } + { + const bool calculate_norm = false; + //TODO: Instantiate graphs without norm calculation: What happens if cudaGraphInstantiateFlagUseNodePriority is **not** used? 
+ // - Instantiate `graphs[calculate_norm][0]` to `graph_exec+0`. + // - Instantiate `graphs[calculate_norm][1]` to `graph_exec+1`. + } + { + const bool calculate_norm = true; + //TODO: Instantiate graphs with norm calculation: + // - Instantiate `graphs[calculate_norm][0]` to `graph_calc_norm_exec+0`. + // - Instantiate `graphs[calculate_norm][1]` to `graph_calc_norm_exec+1`. + } + // TODO: Destroy cudaGraph_t objects they are no longer required after instantiation + } + POP_RANGE + PUSH_RANGE("Graph upload", 0) + // TODO (Optional): Initiate upload instantiated graphs to overhead of lazy upload on first launch. + POP_RANGE + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (!csv && 0 == rank) { + printf( + "Jacobi relaxation: %d iterations on %d x %d mesh with norm check " + "every %d iterations\n", + iter_max, ny, nx, nccheck); + } + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + MPI_CALL(MPI_Barrier(MPI_COMM_WORLD)); + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + const bool calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0); + if (calculate_norm) { + // TODO: Launch `graph_calc_norm_exec[iter%2]` in `compute_stream` + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + MPI_CALL(MPI_Allreduce(l2_norm_h, &l2_norm, 1, MPI_REAL_TYPE, MPI_SUM, MPI_COMM_WORLD)); + l2_norm = std::sqrt(l2_norm); + + if (!csv && 0 == rank && (iter % 100) == 0) { + printf("%5d, %0.6f\n", iter, l2_norm); + } + } else { + // TODO: Launch `graph_exec[iter%2]` in `compute_stream` + } + iter++; + } + CUDA_RT_CALL(cudaDeviceSynchronize()); + double stop = MPI_Wtime(); + POP_RANGE + + CUDA_RT_CALL(cudaMemcpy(a_h + iy_start_global * nx, a + nx, + std::min((ny - iy_start_global) * nx, chunk_size * nx) * sizeof(real), + cudaMemcpyDeviceToHost)); + + int result_correct = 1; + for (int iy = iy_start_global; result_correct && (iy < iy_end_global); ++iy) { + for (int ix = 1; result_correct && 
(ix < (nx - 1)); ++ix) { + if (std::fabs(a_ref_h[iy * nx + ix] - a_h[iy * nx + ix]) > tol) { + fprintf(stderr, + "ERROR on rank %d: a[%d * %d + %d] = %f does not match %f " + "(reference)\n", + rank, iy, nx, ix, a_h[iy * nx + ix], a_ref_h[iy * nx + ix]); + result_correct = 0; + } + } + } + + int global_result_correct = 1; + MPI_CALL(MPI_Allreduce(&result_correct, &global_result_correct, 1, MPI_INT, MPI_MIN, + MPI_COMM_WORLD)); + result_correct = global_result_correct; + + if (rank == 0 && result_correct) { + if (csv) { + //TODO: Don't forget to change your output lable from nccl_overlap to nccl_graph + printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size, + (stop - start), runtime_serial); + } else { + printf("Num GPUs: %d.\n", size); + printf( + "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, " + "efficiency: %8.2f \n", + ny, nx, runtime_serial, size, (stop - start), runtime_serial / (stop - start), + runtime_serial / (size * (stop - start)) * 100); + } + } + + // TODO: Destroy instantiated graphs + + CUDA_RT_CALL(cudaEventDestroy(reset_l2norm_done)); + CUDA_RT_CALL(cudaEventDestroy(push_done)); + CUDA_RT_CALL(cudaEventDestroy(push_prep_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + + CUDA_RT_CALL(cudaFreeHost(a_h)); + CUDA_RT_CALL(cudaFreeHost(a_ref_h)); + + NCCL_CALL(ncclCommDestroy(nccl_comm)); + + MPI_CALL(MPI_Finalize()); + return (result_correct == 1) ? 
0 : 1; +} + +double single_gpu(const int nx, const int ny, const int iter_max, real* const a_ref_h, + const int nccheck, const bool print) { + real* a; + real* a_new; + + cudaStream_t compute_stream; + cudaStream_t push_top_stream; + cudaStream_t push_bottom_stream; + cudaEvent_t compute_done; + cudaEvent_t push_top_done; + cudaEvent_t push_bottom_done; + + real* l2_norm_d; + real* l2_norm_h; + + int iy_start = 1; + int iy_end = (ny - 1); + + CUDA_RT_CALL(cudaMalloc(&a, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMalloc(&a_new, nx * ny * sizeof(real))); + + CUDA_RT_CALL(cudaMemset(a, 0, nx * ny * sizeof(real))); + CUDA_RT_CALL(cudaMemset(a_new, 0, nx * ny * sizeof(real))); + + // Set diriclet boundary conditions on left and right boarder + launch_initialize_boundaries(a, a_new, PI, 0, nx, ny, ny); + CUDA_RT_CALL(cudaDeviceSynchronize()); + + CUDA_RT_CALL(cudaStreamCreate(&compute_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_top_stream)); + CUDA_RT_CALL(cudaStreamCreate(&push_bottom_stream)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&compute_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_top_done, cudaEventDisableTiming)); + CUDA_RT_CALL(cudaEventCreateWithFlags(&push_bottom_done, cudaEventDisableTiming)); + + CUDA_RT_CALL(cudaMalloc(&l2_norm_d, sizeof(real))); + CUDA_RT_CALL(cudaMallocHost(&l2_norm_h, sizeof(real))); + + CUDA_RT_CALL(cudaDeviceSynchronize()); + + if (print) + printf( + "Single GPU jacobi relaxation: %d iterations on %d x %d mesh with " + "norm " + "check every %d iterations\n", + iter_max, ny, nx, nccheck); + + int iter = 0; + bool calculate_norm = true; + real l2_norm = 1.0; + + double start = MPI_Wtime(); + PUSH_RANGE("Jacobi solve", 0) + while (l2_norm > tol && iter < iter_max) { + CUDA_RT_CALL(cudaMemsetAsync(l2_norm_d, 0, sizeof(real), compute_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_top_done, 0)); + CUDA_RT_CALL(cudaStreamWaitEvent(compute_stream, push_bottom_done, 0)); + + 
calculate_norm = (iter % nccheck) == 0 || (iter % 100) == 0; + launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, iy_end, nx, calculate_norm, + compute_stream); + CUDA_RT_CALL(cudaEventRecord(compute_done, compute_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaMemcpyAsync(l2_norm_h, l2_norm_d, sizeof(real), cudaMemcpyDeviceToHost, + compute_stream)); + } + + // Apply periodic boundary conditions + + CUDA_RT_CALL(cudaStreamWaitEvent(push_top_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new, a_new + (iy_end - 1) * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, push_top_stream)); + CUDA_RT_CALL(cudaEventRecord(push_top_done, push_top_stream)); + + CUDA_RT_CALL(cudaStreamWaitEvent(push_bottom_stream, compute_done, 0)); + CUDA_RT_CALL(cudaMemcpyAsync(a_new + iy_end * nx, a_new + iy_start * nx, nx * sizeof(real), + cudaMemcpyDeviceToDevice, compute_stream)); + CUDA_RT_CALL(cudaEventRecord(push_bottom_done, push_bottom_stream)); + + if (calculate_norm) { + CUDA_RT_CALL(cudaStreamSynchronize(compute_stream)); + l2_norm = *l2_norm_h; + l2_norm = std::sqrt(l2_norm); + if (print && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, l2_norm); + } + + std::swap(a_new, a); + iter++; + } + POP_RANGE + double stop = MPI_Wtime(); + + CUDA_RT_CALL(cudaMemcpy(a_ref_h, a, nx * ny * sizeof(real), cudaMemcpyDeviceToHost)); + + CUDA_RT_CALL(cudaEventDestroy(push_bottom_done)); + CUDA_RT_CALL(cudaEventDestroy(push_top_done)); + CUDA_RT_CALL(cudaEventDestroy(compute_done)); + CUDA_RT_CALL(cudaStreamDestroy(push_bottom_stream)); + CUDA_RT_CALL(cudaStreamDestroy(push_top_stream)); + CUDA_RT_CALL(cudaStreamDestroy(compute_stream)); + + CUDA_RT_CALL(cudaFreeHost(l2_norm_h)); + CUDA_RT_CALL(cudaFree(l2_norm_d)); + + CUDA_RT_CALL(cudaFree(a_new)); + CUDA_RT_CALL(cudaFree(a)); + return (stop - start); +} diff --git a/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi_kernels.cu 
b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi_kernels.cu new file mode 100644 index 0000000..dbc7dc9 --- /dev/null +++ b/10-H_CUDA_Graphs_and_Device-initiated_Communication_with_NVSHMEM/tasks/Using_CUDA_Graphs/jacobi_kernels.cu @@ -0,0 +1,109 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2017,2021,2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include + +#ifdef HAVE_CUB +#include +#endif // HAVE_CUB + +#define CUDA_RT_CALL(call) \ + { \ + cudaError_t cudaStatus = call; \ + if (cudaSuccess != cudaStatus) \ + fprintf(stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, __LINE__, __FILE__, cudaGetErrorString(cudaStatus), cudaStatus); \ + } + +#ifdef USE_DOUBLE +typedef double real; +#define MPI_REAL_TYPE MPI_DOUBLE +#else +typedef float real; +#define MPI_REAL_TYPE MPI_FLOAT +#endif + +__global__ void initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, + const int my_ny, const int ny) { + for (int iy = blockIdx.x * blockDim.x + threadIdx.x; iy < my_ny; iy += blockDim.x * gridDim.x) { + const real y0 = sin(2.0 * pi * (offset + iy) / (ny - 1)); + a[iy * nx + 0] = y0; + a[iy * nx + (nx - 1)] = y0; + a_new[iy * nx + 0] = y0; + a_new[iy * nx + (nx - 1)] = y0; + } +} + +void launch_initialize_boundaries(real* __restrict__ const a_new, real* __restrict__ const a, + const real pi, const int offset, const int nx, const int my_ny, + const int ny) { + initialize_boundaries<<>>(a_new, a, pi, offset, nx, my_ny, ny); + CUDA_RT_CALL(cudaGetLastError()); +} + +template +__global__ void jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, + const int iy_end, const int nx, const bool calculate_norm) { +#ifdef HAVE_CUB + typedef cub::BlockReduce + BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; +#endif // HAVE_CUB + int iy = blockIdx.y * blockDim.y + threadIdx.y + iy_start; + int ix = blockIdx.x * blockDim.x + threadIdx.x + 1; + real local_l2_norm = 0.0; + + if (iy < iy_end && ix < (nx - 1)) { + const real new_val = 0.25 * (a[iy * nx + ix + 1] + a[iy * nx + ix - 1] + + a[(iy + 1) * nx + ix] + a[(iy - 1) * nx + ix]); + a_new[iy * nx + ix] = new_val; + if (calculate_norm) { + 
real residue = new_val - a[iy * nx + ix]; + local_l2_norm += residue * residue; + } + } + if (calculate_norm) { +#ifdef HAVE_CUB + real block_l2_norm = BlockReduce(temp_storage).Sum(local_l2_norm); + if (0 == threadIdx.y && 0 == threadIdx.x) atomicAdd(l2_norm, block_l2_norm); +#else + atomicAdd(l2_norm, local_l2_norm); +#endif // HAVE_CUB + } +} + +void launch_jacobi_kernel(real* __restrict__ const a_new, const real* __restrict__ const a, + real* __restrict__ const l2_norm, const int iy_start, const int iy_end, + const int nx, const bool calculate_norm, cudaStream_t stream) { + constexpr int dim_block_x = 32; + constexpr int dim_block_y = 32; + dim3 dim_grid((nx + dim_block_x - 1) / dim_block_x, + ((iy_end - iy_start) + dim_block_y - 1) / dim_block_y, 1); + jacobi_kernel<<>>( + a_new, a, l2_norm, iy_start, iy_end, nx, calculate_norm); + CUDA_RT_CALL(cudaGetLastError()); +} diff --git a/11-L_Summary_Advanced/slides.pdf b/11-L_Summary_Advanced/slides.pdf index f3e27c1..2fa9a67 100644 Binary files a/11-L_Summary_Advanced/slides.pdf and b/11-L_Summary_Advanced/slides.pdf differ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..0d06618 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,52 @@ +cff-version: 1.2.0 +title: Efficient Distributed GPU Programming for Exascale +message: >- + If you use this software, please cite it using the + metadata from this file. 
+authors: + - given-names: Andreas + family-names: Herten + email: a.herten@fz-juelich.de + affiliation: Jülich Supercomputing Centre + orcid: 'https://orcid.org/0000-0002-7150-2505' + - given-names: Lena + family-names: Oden + email: lena.oden@fernuni-hagen.de + affiliation: FernUni Hagen + orcid: 'https://orcid.org/0000-0002-9670-5296' + - given-names: Simon + family-names: Garcia de Gonzalo + email: simgarc@sandia.gov + affiliation: Sandia National Laboratories + orcid: 'https://orcid.org/0000-0002-5699-1793' + - given-names: Jiri + family-names: Kraus + email: jkraus@nvidia.com + affiliation: NVIDIA + orcid: 'https://orcid.org/0000-0002-5240-3317' + - given-names: Markus + family-names: Hrywniak + email: mhrywniak@nvidia.com + affiliation: NVIDIA + orcid: 'https://orcid.org/0000-0002-6015-8788' +identifiers: + - type: doi + value: 10.5281/zenodo.5745504 + description: Year-agnostic Zenodo Identifier +repository-code: 'https://github.com/FZJ-JSC/tutorial-multi-gpu/' +abstract: >- + Over the past decade, GPUs became ubiquitous in HPC installations around the world, delivering the majority of performance of some of the largest supercomputers (e.g. Summit, Sierra, JUWELS Booster). This trend continues in the recently deployed and upcoming Pre-Exascale and Exascale systems (JUPITER, LUMI, Leonardo; El Capitan, Frontier, Aurora): GPUs are chosen as the core computing devices to enter this next era of HPC. + To take advantage of future GPU-accelerated systems with tens of thousands of devices, application developers need to have the proper skills and tools to understand, manage, and optimize distributed GPU applications. + In this tutorial, participants will learn techniques to efficiently program large-scale multi-GPU systems. While programming multiple GPUs with MPI is explained in detail, also advanced tuning techniques and complementing programming models like NCCL and NVSHMEM are presented. 
Tools for analysis are shown and used to motivate and implement performance optimizations. The tutorial teaches fundamental concepts that apply to GPU-accelerated systems in general, taking the NVIDIA platform as an example. It is a combination of lectures and hands-on exercises, using a development system for JUPITER (JEDI), for interactive learning and discovery. +keywords: + - NVIDIA + - GPU + - CUDA + - Exascale + - MPI + - NCCL + - NVSHMEM + - Distributed Programming +license: MIT +version: '8.0-isc25' +date-released: '2025-06-13' diff --git a/README.md b/README.md index f64dc6a..373e9e0 100644 --- a/README.md +++ b/README.md @@ -1,50 +1,40 @@ -# ISC22 Tutorial: Efficient Distributed GPU Programming for Exascale +# ISC25 Tutorial: Efficient Distributed GPU Programming for Exascale -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5745505.svg)](https://doi.org/10.5281/zenodo.5745505) (*old*) +[![DOI](https://zenodo.org/badge/409504932.svg)](https://zenodo.org/badge/latestdoi/409504932) -Repository with talks and exercises of our [Efficient GPU Programming for Exascale](https://app.swapcard.com/widget/event/isc-high-performance-2022/planning/UGxhbm5pbmdfODYxMTQ2) tutorial. + +Repository with talks and exercises of our Efficient GPU Programming for Exascale tutorial, to be held at [ISC25](https://isc.app.swapcard.com/widget/event/isc-high-performance-2025/planning/UGxhbm5pbmdfMjU4MTc5Ng==). ## Coordinates -* Date: 29 May 2022 -* Occasion: ISC22 Tutorial -* Tutors: Andreas Herten (JSC), Markus Hrywniak (NVIDIA), Jiri Kraus (NVIDIA), Lena Oden (Uni Hagen) (and Simon Garcia (BSC), helping from afar) +* Date: 13 June 2025 +* Occasion: ISC25 Tutorial +* Tutors: Simon Garcia de Gonzalo (SNL), Andreas Herten (JSC), Lena Oden (Uni Hagen), with support by Markus Hrywniak (NVIDIA) and Jiri Kraus (NVIDIA) + ## Setup The tutorial is an interactive tutorial with introducing lectures and practical exercises to apply knowledge. 
The exercises have been derived from the Jacobi solver implementations available in [NVIDIA/multi-gpu-programming-models](https://github.com/NVIDIA/multi-gpu-programming-models). -Curriculum: +Walk-through: + +* Sign up at JuDoor +* Open Jupyter JSC: https://jupyter-jsc.fz-juelich.de +* Create new Jupyter instance on JEDI, using training25XX account, on **LoginNode** +* Source course environment: `source $PROJECT_training25XX/env.sh` +* Sync material: `jsc-material-sync` +* Locally install NVIDIA Nsight Systems: https://developer.nvidia.com/nsight-systems + +Curriculum (Note: square-bracketed sessions are skipped at ISC25 because only ½ day was allocated to the tutorial): 1. Lecture: Tutorial Overview, Introduction to System + Onboarding *Andreas* -2. Lecture: MPI-Distributed Computing with GPUs *Lena* +2. Lecture: MPI-Distributed Computing with GPUs *Simon* 3. Hands-on: Multi-GPU Parallelization -4. Lecture: Performance / Debugging Tools *Markus* -5. Lecture: Optimization Techniques for Multi-GPU Applications *Jiri* +4. [Lecture: Performance / Debugging Tools] +5. Lecture: Optimization Techniques for Multi-GPU Applications *Lena* 6. Hands-on: Overlap Communication and Computation with MPI -7. Lecture: Overview of NCCL and NVSHMEN in MPI *Lena* -8. Hands-on: Using NCCL and NVSHMEM -9. Lecture: Device-initiated Communication with NVSHMEM *Jiri* -10. Hands-on: Using Device-Initiated Communication with NVSHMEM +7. [Lecture: Overview of NCCL and NVSHMEN in MPI] +8. [Hands-on: Using NCCL and NVSHMEM] +9. [Lecture: Device-initiated Communication with NVSHMEM] +10. [Hands-on: Using Device-Initiated Communication with NVSHMEM] 11. Lecture: Conclusion and Outline of Advanced Topics *Andreas* - -## Onboarding - -The supercomputer used for the exercises is [JUWELS Booster](https://apps.fz-juelich.de/jsc/hps/juwels/booster-overview.html), a system located a Jülich Supercomputing Centre (Germany) with about 3700 NVIDIA A100 GPUs. 
- -Visual onboarding instructions can be found in the subfolder of the according lecture, `01b-H-Onboarding/`. Here follows the textual description: - -* Register for an account at [JuDoor](https://judoor.fz-juelich.de/login) -* Sign-up for the [`training2216` project](https://judoor.fz-juelich.de/projects/join/training2216) -* Accept the Usage Agreement of JUWELS -* Wait for wheels to turn as your information is pushed through the systems (about 15 minutes) -* Access JUWELS Booster via [JSC's Jupyter portal](https://jupyter-jsc.fz-juelich.de/) -* Create a Jupyter v2 instance using `LoginNodeBooster` and the `training2216` allocation on JUWELS -* When started, launch a browser-based Shell in Jupyter -* Source the course environment to introduce commands and helper script to environment - ``` - source $PROJECT_training2216/env.sh - ``` -* Sync course material to your home directory with `jsc-material-sync`. - -You can also access JSC's facilities via SSH. In that case you need to add your SSH key through JuDoor. You need to restrict access from certain IPs/IP ranges via the `from` clause, as explained [in the documentation](https://apps.fz-juelich.de/jsc/hps/juwels/access.html#ssh-login). We recommend using Jupyter JSC for its simplicity, especially during such a short day that is the tutorial day. \ No newline at end of file