diff --git a/.github/workflows/lfric_test.yml b/.github/workflows/lfric_test.yml index 8f6c796f31..23d668452e 100644 --- a/.github/workflows/lfric_test.yml +++ b/.github/workflows/lfric_test.yml @@ -75,6 +75,58 @@ jobs: pip install .[test] pip install jinja2 + # PSyclone, compile and run MetOffice gungho_model on GPU + - name: LFRic GungHo with OpenMP offload + run: | + # Set up environment + source /apps/spack/psyclone-spack/spack-repo/share/spack/setup-env.sh + spack load lfric-build-environment%nvhpc + source .runner_venv/bin/activate + export PSYCLONE_LFRIC_DIR=${GITHUB_WORKSPACE}/examples/lfric/scripts + export PSYCLONE_CONFIG_FILE=${PSYCLONE_LFRIC_DIR}/KGOs/lfric_psyclone.cfg + # The LFRic source must be patched to workaround bugs in the NVIDIA + # compiler's namelist handling. + rm -rf ${HOME}/LFRic/gpu_build + mkdir -p ${HOME}/LFRic/gpu_build + cp -r ${HOME}/LFRic/lfric_apps_${LFRIC_APPS_REV} ${HOME}/LFRic/gpu_build/lfric_apps + cp -r ${HOME}/LFRic/lfric_core_50869 ${HOME}/LFRic/gpu_build/lfric + cd ${HOME}/LFRic/gpu_build + patch -p1 < ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_${LFRIC_APPS_REV}_nvidia.patch + # Update the compiler definitions to build for GPU + cp ${PSYCLONE_LFRIC_DIR}/KGOs/nvfortran_acc.mk lfric/infrastructure/build/fortran/nvfortran.mk + cp ${PSYCLONE_LFRIC_DIR}/KGOs/nvc++.mk lfric/infrastructure/build/cxx/. + # Update the PSyclone commands to ensure transformed kernels are written + # to working directory. + cp ${PSYCLONE_LFRIC_DIR}/KGOs/psyclone.mk lfric/infrastructure/build/psyclone/. + # Update dependencies.sh to point to our patched lfric core. 
+ sed -i -e 's/export lfric_core_sources=.*$/export lfric_core_sources\=\/home\/gh_runner\/LFRic\/gpu_build\/lfric/' lfric_apps/dependencies.sh + export LFRIC_DIR=${HOME}/LFRic/gpu_build/lfric_apps + export OPT_DIR=${LFRIC_DIR}/applications/gungho_model/optimisation/psyclone-test + cd ${LFRIC_DIR} + # PSyclone scripts must now be under 'optimisation' and be called 'global.py' + mkdir -p ${OPT_DIR} + cp ${PSYCLONE_LFRIC_DIR}/gpu_offloading.py ${OPT_DIR}/global.py + # Clean previous version and compile again + rm -rf applications/gungho_model/working + LFRIC_OFFLOAD_DIRECTIVES=omp ./build/local_build.py -a gungho_model -p psyclone-test + cd applications/gungho_model/example + cp ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_gungho_configuration_4its.nml configuration.nml + mpirun -n 1 ../bin/gungho_model configuration.nml |& tee output.txt + python ${PSYCLONE_LFRIC_DIR}/compare_ouput.py ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_gungho_configuration_4its_checksums.txt gungho_model-checksums.txt + cat timer.txt + export VAR_TIME=$(grep "gungho_model" timer.txt | cut -d'|' -f5) + export VAR_HALOS=$(grep "gungho_model" halo_calls_counter.txt | cut -d'|' -f5) + echo $GITHUB_REF_NAME $GITHUB_SHA $VAR_TIME $VAR_HALOS >> ${HOME}/store_results/lfric_omp_performance_history + ${HOME}/mongosh-2.1.1-linux-x64/bin/mongosh \ + "mongodb+srv://cluster0.x8ncpxi.mongodb.net/PerformanceMonitoring" \ + --quiet --apiVersion 1 --username ${{ secrets.MONGODB_USERNAME }} \ + --password ${{ secrets.MONGODB_PASSWORD }} \ + --eval 'db.GitHub_CI.insertOne({branch_name: "'"$GITHUB_REF_NAME"'", commit: "'"$GITHUB_SHA"'", + github_job: "'"$GITHUB_RUN_ID"'"-"'"$GITHUB_RUN_ATTEMPT"'", + ci_test: "LFRic OpenMP offloading", lfric_apps_version: '"$LFRIC_APPS_REV"', system: "GlaDos", + compiler:"spack-nvhpc-24.5", date: new Date(), elapsed_time: '"$VAR_TIME"', + num_of_halo_exchanges: '"$VAR_HALOS"'})' + # PSyclone, compile and run MetOffice gungho_model on GPU - name: LFRic GungHo with OpenACC offload run: | @@ -105,10 
+157,10 @@ jobs: cd ${LFRIC_DIR} # PSyclone scripts must now be under 'optimisation' and be called 'global.py' mkdir -p ${OPT_DIR} - cp ${PSYCLONE_LFRIC_DIR}/acc_parallel.py ${OPT_DIR}/global.py + cp ${PSYCLONE_LFRIC_DIR}/gpu_offloading.py ${OPT_DIR}/global.py # Clean previous version and compile again rm -rf applications/gungho_model/working - ./build/local_build.py -a gungho_model -p psyclone-test + LFRIC_OFFLOAD_DIRECTIVES=acc ./build/local_build.py -a gungho_model -p psyclone-test cd applications/gungho_model/example cp ${PSYCLONE_LFRIC_DIR}/KGOs/lfric_gungho_configuration_4its.nml configuration.nml mpirun -n 1 ../bin/gungho_model configuration.nml |& tee output.txt diff --git a/changelog b/changelog index 658a3acbb5..e1090c9cb2 100644 --- a/changelog +++ b/changelog @@ -235,7 +235,10 @@ symbols from their parent scope. 81) PR #2725 to close #717. Removes some TODOs and associated utility code - from the fparser2 frontend that is now unused. + from the fparser2 frontend that is now unused. + + 82) PR #2733 for #2730. Adds OpenMP offloading support for LFRic plus + associated integration test. release 2.5.0 14th of February 2024 diff --git a/examples/lfric/scripts/KGOs/lfric_3269_nvidia.patch b/examples/lfric/scripts/KGOs/lfric_3269_nvidia.patch index d3e178318a..db37ced2e3 100644 --- a/examples/lfric/scripts/KGOs/lfric_3269_nvidia.patch +++ b/examples/lfric/scripts/KGOs/lfric_3269_nvidia.patch @@ -57,65 +57,6 @@ index 19c9cff9..b5cd3014 100644 $(call MESSAGE,Compiled,$<) -diff --git a/lfric/infrastructure/build/cxx/nvc++.mk b/lfric/infrastructure/build/cxx/nvc++.mk -new file mode 100644 -index 00000000..13b17a10 ---- /dev/null -+++ b/lfric/infrastructure/build/cxx/nvc++.mk -@@ -0,0 +1,9 @@ -+############################################################################## -+# (c) Crown copyright 2017 Met Office. All rights reserved. -+# The file LICENCE, distributed with this code, contains details of the terms -+# under which the code may be used. 
-+############################################################################## -+ -+$(info ** Chosen NVC++ compiler) -+ -+CXX_RUNTIME_LIBRARY=stdc++ -diff --git a/lfric/infrastructure/build/fortran/nvfortran.mk b/lfric/infrastructure/build/fortran/nvfortran.mk -new file mode 100644 -index 00000000..cfed52c1 ---- /dev/null -+++ b/lfric/infrastructure/build/fortran/nvfortran.mk -@@ -0,0 +1,38 @@ -+############################################################################## -+# Copyright (c) 2017, Met Office, on behalf of HMSO and Queen's Printer -+# For further details please refer to the file LICENCE.original which you -+# should have received as part of this distribution. -+############################################################################## -+# Various things specific to the Portland Fortran compiler. -+############################################################################## -+# -+# This macro is evaluated now (:= syntax) so it may be used as many times as -+# desired without wasting time rerunning it. 
-+# -+F_MOD_DESTINATION_ARG = -module$(SPACE) -+OPENMP_ARG = -mp -+ -+FFLAGS_COMPILER = -+FFLAGS_NO_OPTIMISATION = -O0 -+FFLAGS_SAFE_OPTIMISATION = -O2 -+FFLAGS_RISKY_OPTIMISATION = -O4 -+FFLAGS_DEBUG = -g -traceback -+FFLAGS_RUNTIME = -Mchkptr -Mchkstk -+# Option for checking code meets Fortran standard (not available for PGI) -+FFLAGS_FORTRAN_STANDARD = -+ -+LDFLAGS_COMPILER = -g -+ -+FPP = cpp -traditional-cpp -+FPPFLAGS = -P -+FC = mpif90 -+ -+# FS#34981 (nvbug 4648082) -+science/src/um/src/atmosphere/large_scale_precipitation/ls_ppnc.o: private FFLAGS_RUNTIME = -Mchkstk -+ -+# FS#35751 -+mesh/create_mesh_mod.o: private FFLAGS_RUNTIME = -Mchkstk -+ -+# 24.3 -+science/src/socrates/src/cosp_github/subsample_and_optics_example/optics/quickbeam_optics/optics_lib.o: private FFLAGS_SAFE_OPTIMISATION = -O1 -+science/src/socrates/src/cosp_github/subsample_and_optics_example/optics/quickbeam_optics/optics_lib.o: private FFLAGS_RISKY_OPTIMISATION = -O1 diff --git a/lfric/infrastructure/build/tools/DependencyRules b/lfric/infrastructure/build/tools/DependencyRules index 9d4db390..e37384fc 100755 --- a/lfric/infrastructure/build/tools/DependencyRules diff --git a/examples/lfric/scripts/KGOs/nvfortran_acc.mk b/examples/lfric/scripts/KGOs/nvfortran_acc.mk index 79df074ed8..34ead62500 100644 --- a/examples/lfric/scripts/KGOs/nvfortran_acc.mk +++ b/examples/lfric/scripts/KGOs/nvfortran_acc.mk @@ -20,9 +20,20 @@ FFLAGS_DEBUG = -g -traceback FFLAGS_RUNTIME = -Mchkptr -Mchkstk # Option for checking code meets Fortran standard (not available for PGI) FFLAGS_FORTRAN_STANDARD = -OPENMP_ARG = -acc=gpu -gpu=managed -mp=multicore -LDFLAGS_COMPILER = -g -acc=gpu -gpu=managed -mp=multicore -cuda +# Flags for OpenMP threading / OpenMP offloading / OpenACC Offloading +# The LFRIC_OFFLOAD_DIRECTIVES env_variable is also queried in the PSyclone +# script to generate matching directives +ifeq ("$(LFRIC_OFFLOAD_DIRECTIVES)", "omp") + OPENMP_ARG = -mp=gpu -gpu=managed + LDFLAGS_COMPILER = 
-mp=gpu -gpu=managed -cuda +else ifeq ("$(LFRIC_OFFLOAD_DIRECTIVES)", "acc") + OPENMP_ARG = -acc=gpu -gpu=managed -mp=multicore + LDFLAGS_COMPILER = -acc=gpu -gpu=managed -mp=multicore -cuda +else + OPENMP_ARG = -mp + LDFLAGS_COMPILER = -mp +endif FPP = cpp -traditional-cpp FPPFLAGS = -P diff --git a/examples/lfric/scripts/Makefile b/examples/lfric/scripts/Makefile index b2703e6172..e39c1f03c0 100644 --- a/examples/lfric/scripts/Makefile +++ b/examples/lfric/scripts/Makefile @@ -45,7 +45,7 @@ transform: ${SCRIPTS} .PHONY: ${SCRIPTS} ${SCRIPTS}: - ${PSYCLONE} -api lfric -s ./$@ ../eg3/solver_mod.x90 -oalg /dev/null -opsy /dev/null + LFRIC_OFFLOAD_DIRECTIVES=acc ${PSYCLONE} -api lfric -s ./$@ ../eg3/solver_mod.x90 -oalg /dev/null -opsy /dev/null compile: transform @echo "No compilation supported for lfric/scripts examples" diff --git a/examples/lfric/scripts/acc_parallel.py b/examples/lfric/scripts/acc_parallel.py deleted file mode 100644 index 198450e91c..0000000000 --- a/examples/lfric/scripts/acc_parallel.py +++ /dev/null @@ -1,150 +0,0 @@ -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2018-2024, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: A. R. Porter, STFC Daresbury Lab -# R. W. Ford, STFC Daresbury Lab -# L. Mosimann, NVIDIA. - -'''PSyclone transformation script for the lfric API to apply -colouring, OpenACC, OpenMP. Also adds redundant computation to the level-1 -halo for setval_* generically. - -''' -from psyclone.domain.lfric import LFRicConstants -from psyclone.psyir.nodes import ACCDirective, Loop -from psyclone.psyir.transformations import ( - ACCKernelsTrans, TransformationError) -from psyclone.transformations import ( - Dynamo0p3ColourTrans, Dynamo0p3OMPLoopTrans, - Dynamo0p3RedundantComputationTrans, OMPParallelTrans, - ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) - - -# Names of any routines that we won't add any OpenACC to. -ACC_EXCLUSIONS = [ -] - - -def trans(psy): - '''Applies PSyclone colouring and OpenACC transformations. Any kernels that - cannot be offloaded to GPU are parallelised using OpenMP on the CPU. Any - setval_* kernels are transformed so as to compute into the L1 halos. 
- - ''' - rtrans = Dynamo0p3RedundantComputationTrans() - ctrans = Dynamo0p3ColourTrans() - otrans = Dynamo0p3OMPLoopTrans() - const = LFRicConstants() - loop_trans = ACCLoopTrans() - ktrans = ACCKernelsTrans() - parallel_trans = ACCParallelTrans(default_present=False) - artrans = ACCRoutineTrans() - oregtrans = OMPParallelTrans() - - print(f"PSy name = '{psy.name}'") - - # Loop over all of the Invokes in the PSy object - for invoke in psy.invokes.invoke_list: - - print("Transforming invoke '{0}' ...".format(invoke.name)) - schedule = invoke.schedule - - # Make setval_* compute redundantly to the level 1 halo if it - # is in its own loop - for loop in schedule.loops(): - if loop.iteration_space == "dof": - if len(loop.kernels()) == 1: - if loop.kernels()[0].name in ["setval_c", "setval_x"]: - rtrans.apply(loop, options={"depth": 1}) - - if psy.name.lower() in ACC_EXCLUSIONS: - print(f"Not adding ACC to invoke in '{psy.name}'") - apply_acc = False - else: - apply_acc = True - - # Keep a record of any kernels we fail to module inline as we can't - # then add ACC ROUTINE to them. - failed_inline = set() - - # Colour loops over cells unless they are on discontinuous - # spaces or over dofs - for loop in schedule.loops(): - if loop.iteration_space == "cell_column": - if apply_acc: - for kern in loop.kernels(): - try: - artrans.apply(kern) - except TransformationError as err: - failed_inline.add(kern.name.lower()) - print(f"Adding ACC Routine to kernel '{kern.name}'" - f" failed:\n{err.value}") - if (loop.field_space.orig_name not in - const.VALID_DISCONTINUOUS_NAMES): - ctrans.apply(loop) - - # Add OpenACC to loops unless they are over colours or are null. 
- schedule = invoke.schedule - for loop in schedule.walk(Loop): - if not apply_acc or any(kern.name.lower() in failed_inline for - kern in loop.kernels()): - print(f"Not adding OpenACC for kernels: " - f"{[kern.name for kern in loop.kernels()]}") - continue - try: - if loop.loop_type == "colours": - pass - if loop.loop_type == "colour": - loop_trans.apply(loop, options={"independent": True}) - parallel_trans.apply(loop.ancestor(ACCDirective)) - if loop.loop_type == "": - loop_trans.apply(loop, options={"independent": True}) - parallel_trans.apply(loop.ancestor(ACCDirective)) - if loop.loop_type == "dof": - # We use ACC KERNELS for dof loops since they can contain - # reductions. - ktrans.apply(loop) - except TransformationError as err: - print(str(err)) - pass - - # Apply OpenMP thread parallelism for any kernels we've not been able - # to offload to GPU. - for loop in schedule.walk(Loop): - if not apply_acc or any(kern.name.lower() in failed_inline for - kern in loop.kernels()): - if loop.loop_type not in ["colours", "null"]: - oregtrans.apply(loop) - otrans.apply(loop, options={"reprod": True}) - - return psy diff --git a/examples/lfric/scripts/gpu_offloading.py b/examples/lfric/scripts/gpu_offloading.py new file mode 100644 index 0000000000..2167119fa1 --- /dev/null +++ b/examples/lfric/scripts/gpu_offloading.py @@ -0,0 +1,196 @@ +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2018-2024, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: A. R. Porter, STFC Daresbury Lab +# R. W. Ford, STFC Daresbury Lab +# S. Siso, STFC Daresbury Lab +# L. Mosimann, NVIDIA. + +'''PSyclone transformation script for LFRic to apply colouring and GPU +offloading. Also adds redundant computation to the level-1 halo for +setval_* generically. 
+ +''' +import os +import sys +from psyclone.domain.lfric import LFRicConstants +from psyclone.psyir.nodes import Directive, Loop +from psyclone.psyir.transformations import ( + ACCKernelsTrans, TransformationError, OMPTargetTrans) +from psyclone.transformations import ( + Dynamo0p3ColourTrans, Dynamo0p3OMPLoopTrans, + Dynamo0p3RedundantComputationTrans, OMPParallelTrans, + ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans, + OMPDeclareTargetTrans, OMPLoopTrans) + + +# Lower-case PSy names (psy.name) for which we won't add GPU offloading +INVOKE_EXCLUSIONS = [ +] + +OFFLOAD_DIRECTIVES = os.getenv('LFRIC_OFFLOAD_DIRECTIVES', "none") + + +def trans(psy): + '''Applies PSyclone colouring and GPU offloading transformations. Any + kernels that cannot be offloaded to GPU are parallelised using OpenMP + on the CPU. Any setval_* kernels are transformed so as to compute + into the L1 halos. + + ''' + rtrans = Dynamo0p3RedundantComputationTrans() + ctrans = Dynamo0p3ColourTrans() + otrans = Dynamo0p3OMPLoopTrans() + const = LFRicConstants() + cpu_parallel = OMPParallelTrans() + + if OFFLOAD_DIRECTIVES == "omp": + # Use OpenMP offloading + loop_offloading_trans = OMPLoopTrans( + omp_directive="teamsdistributeparalleldo", + omp_schedule="none" + ) + # OpenMP does not have a kernels parallelism directive equivalent + # to OpenACC 'kernels' + kernels_trans = None + gpu_region_trans = OMPTargetTrans() + gpu_annotation_trans = OMPDeclareTargetTrans() + elif OFFLOAD_DIRECTIVES == "acc": + # Use OpenACC offloading + loop_offloading_trans = ACCLoopTrans() + kernels_trans = ACCKernelsTrans() + gpu_region_trans = ACCParallelTrans(default_present=False) + gpu_annotation_trans = ACCRoutineTrans() + else: + print(f"The PSyclone transformation script expects the " + f"LFRIC_OFFLOAD_DIRECTIVES to be set to 'omp' or 'acc' " + f"but found '{OFFLOAD_DIRECTIVES}'.") + sys.exit(-1) + + print(f"PSy name = '{psy.name}'") + + # Loop over all of the Invokes in the PSy object + for invoke in psy.invokes.invoke_list:
+ + print("Transforming invoke '{0}' ...".format(invoke.name)) + schedule = invoke.schedule + + # Make setval_* compute redundantly to the level 1 halo if it + # is in its own loop + for loop in schedule.loops(): + if loop.iteration_space == "dof": + if len(loop.kernels()) == 1: + if loop.kernels()[0].name in ["setval_c", "setval_x"]: + rtrans.apply(loop, options={"depth": 1}) + + if psy.name.lower() in INVOKE_EXCLUSIONS: + print(f"Not adding GPU offloading to invoke '{psy.name}'") + offload = False + else: + offload = True + + # Keep a record of any kernels we fail to offload + failed_to_offload = set() + + # Colour loops over cells unless they are on discontinuous spaces + # (alternatively we could annotate the kernels with atomics) + for loop in schedule.loops(): + if loop.iteration_space == "cell_column": + if (loop.field_space.orig_name not in + const.VALID_DISCONTINUOUS_NAMES): + ctrans.apply(loop) + + # Mark Kernels inside the loops over cells as GPU-enabled + # (alternatively we could inline them) + for loop in schedule.loops(): + if loop.iteration_space == "cell_column": + if offload: + for kern in loop.kernels(): + try: + gpu_annotation_trans.apply(kern) + except TransformationError as err: + failed_to_offload.add(kern.name.lower()) + print(f"Failed to annotate '{kern.name}' with " + f"GPU-enabled directive due to:\n" + f"{err.value}") + # For annotated or inlined kernels we could attempt to + # provide compile-time dimensions for the temporary + # arrays and convert to code unsupported intrinsics. + + # Add GPU offloading to loops unless they are over colours or are null. 
+ schedule = invoke.schedule + for loop in schedule.walk(Loop): + kernel_names = [k.name.lower() for k in loop.kernels()] + if offload and all(name not in failed_to_offload for name in + kernel_names): + try: + if loop.loop_type == "colours": + pass + if loop.loop_type == "colour": + loop_offloading_trans.apply( + loop, options={"independent": True}) + gpu_region_trans.apply(loop.ancestor(Directive)) + if loop.loop_type == "": + loop_offloading_trans.apply( + loop, options={"independent": True}) + gpu_region_trans.apply(loop.ancestor(Directive)) + if loop.loop_type == "dof": + # Loops over dofs can contain reductions + if kernels_trans: + # If kernel offloading is available it should + # manage them + kernels_trans.apply(loop) + else: + # Otherwise, if reductions exist, they will + # be detected by the dependency analysis and raise + # a TransformationError that is caught below + loop_offloading_trans.apply( + loop, options={"independent": True}) + gpu_region_trans.apply(loop.ancestor(Directive)) + # Alternatively we could use loop parallelism with + # reduction clauses + print(f"Successfully offloaded loop with {kernel_names}") + except TransformationError as err: + print(f"Failed to offload loop with {kernel_names} " + f"because: {err}") + + # Apply OpenMP thread parallelism for any kernels we've not been able + # to offload to GPU.
+ for loop in schedule.walk(Loop): + if not offload or any(kern.name.lower() in failed_to_offload for + kern in loop.kernels()): + if loop.loop_type not in ["colours", "null"]: + cpu_parallel.apply(loop) + otrans.apply(loop, options={"reprod": True}) + + return psy diff --git a/psyclone.pdf b/psyclone.pdf index 3736751f8f..77b3cdabb3 100644 Binary files a/psyclone.pdf and b/psyclone.pdf differ diff --git a/src/psyclone/f2pygen.py b/src/psyclone/f2pygen.py index f983e8f715..074fc3e76f 100644 --- a/src/psyclone/f2pygen.py +++ b/src/psyclone/f2pygen.py @@ -138,7 +138,8 @@ class OMPDirective(Directive): ''' def __init__(self, root, line, position, dir_type): self._types = ["parallel do", "parallel", "do", "master", "single", - "taskloop", "taskwait", "declare"] + "taskloop", "taskwait", "declare", "target", "teams", + "teams distribute parallel do"] self._positions = ["begin", "end"] super(OMPDirective, self).__init__(root, line, position, dir_type) diff --git a/src/psyclone/psyir/nodes/omp_directives.py b/src/psyclone/psyir/nodes/omp_directives.py index 00318273ca..8001ae1775 100644 --- a/src/psyclone/psyir/nodes/omp_directives.py +++ b/src/psyclone/psyir/nodes/omp_directives.py @@ -1442,7 +1442,10 @@ def gen_code(self, parent): for call in reprod_red_call_list: call.reduction_sum_loop(parent) - self.gen_post_region_code(parent) + # If there are nested OMPRegions, the post region code should be after + # the top-level one + if not self.ancestor(OMPRegionDirective): + self.gen_post_region_code(parent) def lower_to_language_level(self): ''' @@ -2302,7 +2305,7 @@ def gen_code(self, parent): # Add directive to the f2pygen tree parent.add( DirectiveGen( - parent, "omp", "begin", "parallel do", ", ".join( + parent, "omp", "begin", self._directive_string, ", ".join( text for text in [default_str, private_str, fprivate_str, schedule_str, self._reduction_string()] if text))) @@ -2312,10 +2315,23 @@ def gen_code(self, parent): # make sure the directive occurs straight after the 
loop body position = parent.previous_loop() - parent.add(DirectiveGen(parent, *self.end_string().split()), + + # DirectiveGen only accepts 3 terms, e.g. "omp end loop", so for longer + # directives, e.g. "omp end teams distribute parallel do", we split them + # between arguments and content (which is an additional string appended + # at the end) + terms = self.end_string().split() + # If there are fewer than 3 terms the array slices still work as expected + arguments = terms[:3] + content = " ".join(terms[3:]) + + parent.add(DirectiveGen(parent, *arguments, content=content), position=["after", position]) - self.gen_post_region_code(parent) + # If there are nested OMPRegions, the post region code should be after + # the top-level one + if not self.ancestor(OMPRegionDirective): + self.gen_post_region_code(parent) def lower_to_language_level(self): ''' @@ -2402,6 +2418,31 @@ def end_string(self): ''' return "omp end target" + def gen_code(self, parent): + '''Generate the OpenMP Target Directive and any associated code. + + :param parent: the parent Node in the Schedule to which to add our + content.
+ :type parent: sub-class of :py:class:`psyclone.f2pygen.BaseGen` + ''' + # Check the constraints are correct + self.validate_global_constraints() + + # Generate the code for this Directive + parent.add(DirectiveGen(parent, "omp", "begin", "target")) + + # Generate the code for all of this node's children + for child in self.dir_body: + child.gen_code(parent) + + # Generate the end code for this node + parent.add(DirectiveGen(parent, "omp", "end", "target", "")) + + # If there are nested OMPRegions, the post region code should be after + # the top-level one + if not self.ancestor(OMPRegionDirective): + self.gen_post_region_code(parent) + class OMPLoopDirective(OMPRegionDirective): ''' Class for the !$OMP LOOP directive that specifies that the iterations diff --git a/src/psyclone/tests/psyir/nodes/omp_directives_test.py b/src/psyclone/tests/psyir/nodes/omp_directives_test.py index 2c8d2dbfd4..068c71896e 100644 --- a/src/psyclone/tests/psyir/nodes/omp_directives_test.py +++ b/src/psyclone/tests/psyir/nodes/omp_directives_test.py @@ -4669,3 +4669,34 @@ def test_omp_serial_check_dependency_valid_pairing(): assert test_dir._check_dependency_pairing_valid( array_reference1, array_reference2, None, None ) + + +def test_omptarget_gen_code(): + ''' Check that the OMPTarget gen_code produces the right code ''' + _, invoke_info = parse(os.path.join(BASE_PATH, "1_single_invoke.f90"), + api="lfric") + psy = PSyFactory("lfric", distributed_memory=True).create(invoke_info) + schedule = psy.invokes.invoke_list[0].schedule + kern = schedule.children[-1] + + # Add an OMPTarget and move the kernel inside it + target = OMPTargetDirective() + schedule.addchild(target) + target.dir_body.addchild(kern.detach()) + + # Check that the "omp target" is produced, and that the set_dirty is + # generated after it + code = str(psy.gen) + assert """ + !$omp target + DO cell = loop0_start, loop0_stop, 1 + CALL testkern_code(nlayers_f1, a, f1_data, f2_data, m1_data, \ +m2_data, ndf_w1, undf_w1, 
map_w1(:,cell), ndf_w2, undf_w2, map_w2(:,cell), \ +ndf_w3, undf_w3, map_w3(:,cell)) + END DO + !$omp end target + ! + ! Set halos dirty/clean for fields modified in the above loop(s) + ! + CALL f1_proxy%set_dirty() + """ in code diff --git a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py index b5fd9e08b7..14a7b19a01 100644 --- a/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py +++ b/src/psyclone/tests/psyir/transformations/kernel_transformation_test.py @@ -50,7 +50,8 @@ from psyclone.psyir.nodes import Routine, FileContainer, IntrinsicCall, Call from psyclone.psyir.symbols import DataSymbol, INTEGER_TYPE from psyclone.psyir.transformations import TransformationError -from psyclone.transformations import ACCRoutineTrans, Dynamo0p3KernelConstTrans +from psyclone.transformations import ( + ACCRoutineTrans, OMPDeclareTargetTrans, Dynamo0p3KernelConstTrans) from psyclone.tests.gocean_build import GOceanBuild from psyclone.tests.lfric_build import LFRicBuild @@ -449,6 +450,24 @@ def test_gpumixin_validate_no_call(): "(TODO #342)." 
in str(err.value)) +@pytest.mark.parametrize( + "rtrans, expected_directive", + [(ACCRoutineTrans(), "!$acc routine"), + (OMPDeclareTargetTrans(), "!$omp declare target")]) +def test_kernel_gpu_annotation_trans(rtrans, expected_directive, + fortran_writer): + ''' Check that the GPU annotation transformations insert the + proper directive inside PSyKAl kernel code ''' + _, invoke = get_invoke("1_single_invoke.f90", api="lfric", idx=0) + sched = invoke.schedule + kern = sched.coded_kernels()[0] + rtrans.apply(kern) + + # Check that the directive has been added to the kernel code + code = fortran_writer(kern.get_kernel_schedule()) + assert expected_directive in code + + def test_1kern_trans(kernel_outputdir): ''' Check that we generate the correct code when an invoke contains the same kernel more than once but only one of them is transformed. ''' diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py index d2c3678fc5..9904a36fad 100644 --- a/src/psyclone/transformations.py +++ b/src/psyclone/transformations.py @@ -526,19 +526,35 @@ class OMPDeclareTargetTrans(Transformation, MarkRoutineForGPUMixin): ''' def apply(self, node, options=None): - ''' Insert an OMPDeclareTargetDirective inside the provided routine. + ''' Insert an OMPDeclareTargetDirective inside the provided routine or + associated PSyKAl kernel. - :param node: the PSyIR routine to insert the directive into. - :type node: :py:class:`psyclone.psyir.nodes.Routine` + :param node: the kernel or routine which is the target of this + transformation. + :type node: :py:class:`psyclone.psyir.nodes.Routine` | + :py:class:`psyclone.psyGen.Kern` :param options: a dictionary with options for transformations. :type options: Optional[Dict[str, Any]] + :param bool options["force"]: whether to allow routines with + CodeBlocks to run on the GPU. 
''' self.validate(node, options) - for child in node.children: + + if isinstance(node, Kern): + # Flag that the kernel has been modified + node.modified = True + + # Get the schedule representing the kernel subroutine + routine = node.get_kernel_schedule() + else: + routine = node + + for child in routine.children: if isinstance(child, OMPDeclareTargetDirective): return # The routine is already marked with OMPDeclareTarget - node.children.insert(0, OMPDeclareTargetDirective()) + + routine.children.insert(0, OMPDeclareTargetDirective()) def validate(self, node, options=None): ''' Check that an OMPDeclareTargetDirective can be inserted.