Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
33937a4
[cpp17] move CUDA builds from c++14 to c++17 (#333)
valassi Jan 26, 2022
06d674b
[cpp17] require cuda >=11.2 to be able to use c++17 (#333 - see also …
valassi Jan 26, 2022
4739f55
[cpp17] improve a comment about nvcc builds of c++ code with c++17 (#…
valassi Jan 26, 2022
1c34929
[cpp17] use "if constexpr" in MemoryAccessHelpers also in cuda, thank…
valassi Jan 26, 2022
48615c2
[cpp17] enable if constexpr in Bridge.h (needs c++17 and cuda>=11.2)
valassi Jan 26, 2022
db70f9c
[cpp17] bug fix in OMPFLAGS - effectively reenable OpenMP (code was n…
valassi Jan 26, 2022
f4c80a1
[cpp17] remove OMPFLAGS support for gcc<9.3 in Makefile
valassi Jan 26, 2022
9f86299
[cpp17] require gcc>=9.3 (#96) - it was observed in the past that Ope…
valassi Jan 26, 2022
d93bfb9
[cpp17] fix OpenMP MT builds after effectively reenabling OpenMP
valassi Jan 26, 2022
be34933
[cpp17] remove CUFLAGS CXXFLAGS CPPFLAGS from link instructions (keep…
valassi Jan 26, 2022
d4feb80
[cpp17] add back OMPFAGS to linker after removing CXXFLAGS
valassi Jan 26, 2022
5c7ebbe
[cpp17] Use __CUDACC_VER_MAJOR__ instead of __CUDACC__ in getCompiler…
valassi Jan 26, 2022
7415f1f
[cpp17] improve the previous patch: use __NVCC__ instead of __CUDA_MA…
valassi Jan 26, 2022
5245d4b
[cpp17] TEMPORARELY DISABLE OMP (need to reassess MT)
valassi Jan 26, 2022
d2725d6
[cpp17] further use __NVCC__ instead of __CUDACC__ throughout getComp…
valassi Jan 26, 2022
87bde06
[cpp17] complete backport to codegen and regenerate eemumu auto
valassi Jan 26, 2022
e8979bc
[cpp17] BUG FIX IN LOGS! recover gcc102 baselines from c2e67b4a425319…
valassi Jan 26, 2022
157d430
[cpp17] rerun eemumu basic log, check all is ok
valassi Jan 26, 2022
0654f26
[cpp17] add ggtg and ggttggg to sync script
valassi Jan 26, 2022
fd6d8de
[cpp17] regenerate ggtt* auto and resync manu
valassi Jan 26, 2022
b26408b
[cpp17] ** COMPLETE CPP17 ** rerun ggtt/ggttg/ggttgg, all ok
valassi Jan 26, 2022
f2669dd
[cpp17] allow gcc<=9.3 if gcc is used below clang (#96 and #355)
valassi Jan 26, 2022
cb0f512
[cpp17] backport to codegen and regenerate eemumu auto
valassi Jan 26, 2022
95474a6
[cpp17] ** COMPLETE CPP17 ** regenerate gg_tt* with gcc check bug fix…
valassi Jan 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ template <typename T>
void Bridge<T>::gpu_sequence( const T *momenta, T *mes, const bool goodHelOnly )
{
constexpr int neppM = MemoryAccessMomenta::neppM;
if ( neppM == 1 ) // eventually move to "if constexpr" (need c++17, not available in cuda 11.1)
if constexpr ( neppM == 1 ) // needs c++17 and cuda >=11.2 (#333)
{
checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
}
Expand Down Expand Up @@ -297,7 +297,7 @@ void hst_transposeMomentaF2C( const T *in, T *out, const int evt )
constexpr int npar = mgOnGpu::npar;
constexpr int np4 = mgOnGpu::np4;
constexpr int neppM = MemoryAccessMomenta::neppM;
if ( neppM == 1 ) // eventually move to "if constexpr" (need c++17, not available in cuda 11.1)
if constexpr ( neppM == 1 ) // needs c++17 and cuda >=11.2 (#333)
{
memcpy( out, in, evt * npar * np4 * sizeof(T) );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,12 @@ TESTDIR = ../../../../../test
GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/
GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a

# OpenMP flags
# OpenMP flags (on gcc this requires gcc>=9.3, issue #269)
ifneq ($(shell $(CXX) --version | grep ^Intel),)
override OMPFLAGS =
else ifeq ($(shell $(CXX) --version | grep GCC | cut -d' ' -f3 | cut -d'.' -f1),9) # disable OMP on gcc9 (issue #269)
override OMPFLAGS =
OMPFLAGS ?= -fopenmp
endif
###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT)
override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT)
$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

Expand Down Expand Up @@ -99,17 +98,8 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
CUOPTFLAGS = -lineinfo
CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
# TEMPORARY! For as long as cuda 11.0/11.1 are supported (issues #282 and #292)
# (Eventually, use only c++17; previously, c++14 in cuda110/111 and c++17 above; now c++14 everywhere)
###ifeq ($(NVCC_VERSION),11.0)
### CUFLAGS += -std=c++14
###else ifeq ($(NVCC_VERSION),11.1)
### CUFLAGS += -std=c++14
###else
### CUFLAGS += -std=c++17
###endif
CUFLAGS += -std=c++14
###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
# Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
Expand Down Expand Up @@ -323,7 +313,7 @@ $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MODELLIB).a $(cu_objects) $(c
$(NVCC) $< -o $@ $(cu_objects) $(cucxx_objects) $(CUARCHFLAGS) $(LIBFLAGS) $(CULIBFLAGS)

$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(cucxx_objects)
$(CXX) $< -o $@ $(cxx_objects) $(cucxx_objects) $(CPPFLAGS) $(CXXFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
$(CXX) $< -o $@ $(cxx_objects) $(cucxx_objects) $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)

$(BUILDDIR)/testxxx.o: $(GTESTLIBS)
$(BUILDDIR)/testxxx.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include
Expand Down Expand Up @@ -373,11 +363,11 @@ endif
ifeq ($(NVCC),)
# Link only runTest.o
$(testmain): $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects) $(CPPFLAGS) $(CXXFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
$(CXX) -o $@ $(cxx_objects) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
else
# Link both runTest.o and runTest_cu.o
$(testmain): $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(cu_objects) $(GTESTLIBS)
$(NVCC) -o $@ $(cxx_objects) $(cu_objects) $(CPPFLAGS) $(CUFLAGS) -ldl $(LIBFLAGS) $(CULIBFLAGS) -lcuda -lgomp
$(NVCC) -o $@ $(cxx_objects) $(cu_objects) -ldl $(LIBFLAGS) $(CULIBFLAGS) -lcuda -lgomp
endif

$(GTESTLIBS):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,7 @@ class KernelAccessHelper : public MemoryAccessHelper<T>
__host__ __device__ inline
fptype* kernelAccessRecord( fptype* buffer )
{
//if constexpr ( !onDevice ) // FIXME! enable this when we move to nvcc supporting c++17
if ( !onDevice )
if constexpr ( !onDevice ) // requires c++17 also in CUDA (#333)
{
return T::ieventAccessRecord( buffer, 0 );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ namespace mgOnGpu

// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// [NB eventually define this also for code built with nvcc (#318), however this requires c++17 and nvcc >= 11.2]
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
#ifndef __CUDACC__
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif
Expand Down Expand Up @@ -190,4 +190,24 @@ using mgOnGpu::fptype;
// For SANITY CHECKS: check that neppR, neppM, neppV... are powers of two (https://stackoverflow.com/a/108360)
// [A positive integer is a power of two iff clearing its lowest set bit yields zero]
inline constexpr bool ispoweroftwo( int n ){ return ( n > 0 ) && ( ( n & ( n - 1 ) ) == 0 ); }

// Compiler version support (#96): require nvcc from CUDA >= 11.2, e.g. to use C++17 (see #333)
// [NB __CUDACC_VER_MAJOR__/__CUDACC_VER_MINOR__ are defined whenever nvcc drives the build]
#ifdef __NVCC__
#if ( __CUDACC_VER_MAJOR__ < 11 ) || ( __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 2 )
#error Unsupported CUDA version: please use CUDA >= 11.2
#endif
#endif

// Compiler version support (#96): require clang >= 11
#if defined __clang__
#if ( __clang_major__ < 11 )
#error Unsupported clang version: please use clang >= 11
#endif
// Compiler version support (#96): require gcc >= 9.3, e.g. for some OMP issues (see #269)
// [NB skip this check for the gcc toolchain below clang or icx (TEMPORARY? #355)]
#elif defined __GNUC__
#if ( __GNUC__ < 9 ) || ( __GNUC__ == 9 && __GNUC_MINOR__ < 3 )
#error Unsupported gcc version: please use gcc >= 9.3
#endif
#endif

#endif // MGONGPUCONFIG_H
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ namespace mg5amcCpu
{
std::stringstream out;
// CUDA version (NVCC)
#ifdef __CUDACC__
// [Use __NVCC__ instead of __CUDACC__ here!]
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
// [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
#ifdef __NVCC__
#if defined __CUDACC_VER_MAJOR__ && defined __CUDACC_VER_MINOR__ && defined __CUDACC_VER_BUILD__
out << "nvcc " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__;
#else
Expand All @@ -148,7 +151,7 @@ namespace mg5amcCpu
#error "icc is no longer supported: please use icx"
#elif defined __INTEL_LLVM_COMPILER // alternative: __INTEL_CLANG_COMPILER
out << "icx " << __INTEL_LLVM_COMPILER;
#ifdef __CUDACC__
#ifdef __NVCC__
out << ", ";
#else
out << " (";
Expand All @@ -166,7 +169,7 @@ namespace mg5amcCpu
std::array<char, 128> tchainbuf;
while ( fgets( tchainbuf.data(), tchainbuf.size(), tchainpipe.get() ) != nullptr ) tchainout += tchainbuf.data();
tchainout.pop_back(); // remove trailing newline
#if defined __CUDACC__ or defined __INTEL_LLVM_COMPILER
#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER
out << ", gcc " << tchainout;
#else
out << " (gcc " << tchainout << ")";
Expand All @@ -182,7 +185,7 @@ namespace mg5amcCpu
out << "gcc UNKNOWKN";
#endif
#endif
#if defined __CUDACC__ or defined __INTEL_LLVM_COMPILER
#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER
out << ")";
#endif
return out.str();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -887,9 +887,9 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name):
// - private: give each thread its own copy, without initialising
// - firstprivate: give each thread its own copy, and initialise with value from outside
#ifdef MGONGPU_CPPSIMD
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,isAligned_allMEs) private (amp_sv,w_sv,jamp_sv)
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,amp_fp,w_fp,isAligned_allMEs) private (amp_sv,w_sv,jamp_sv)
#else
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV) private (amp_sv,w_sv,jamp_sv)
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,amp_fp,w_fp) private (amp_sv,w_sv,jamp_sv)
#endif
#endif
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
Expand Down
12 changes: 11 additions & 1 deletion epochX/cudacpp/CODEGEN/syncManu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

eemumu=0
ggtt=0
ggttg=0
ggttgg=0
ggttggg=0

function usage()
{
Expand All @@ -17,18 +19,26 @@ while [ "$1" != "" ]; do
elif [ "$1" == "-ggtt" ]; then
ggtt=1
shift
elif [ "$1" == "-ggttg" ]; then
ggttg=1
shift
elif [ "$1" == "-ggttgg" ]; then
ggttgg=1
shift
elif [ "$1" == "-ggttggg" ]; then
ggttggg=1
shift
else
usage
fi
done

# Check that at least one process has been selected
processes=
if [ "${ggttgg}" == "1" ]; then processes="gg_ttgg $processes"; fi
if [ "${ggtt}" == "1" ]; then processes="gg_tt $processes"; fi
if [ "${ggttg}" == "1" ]; then processes="gg_ttg $processes"; fi
if [ "${ggttgg}" == "1" ]; then processes="gg_ttgg $processes"; fi
if [ "${ggttggg}" == "1" ]; then processes="gg_ttggg $processes"; fi
if [ "${eemumu}" == "1" ]; then processes="ee_mumu $processes"; fi
if [ "${processes}" == "" ]; then usage; fi

Expand Down
8 changes: 4 additions & 4 deletions epochX/cudacpp/ee_mumu.auto/CODEGEN_cudacpp_ee_mumu_log.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ generate e+ e- > mu+ mu-
No model currently active, so we import the Standard Model
INFO: load particles
INFO: load vertices
DEBUG: model prefixing takes 0.006892681121826172 
DEBUG: model prefixing takes 0.006869077682495117 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
INFO: Change particles name to pass to MG5 convention
DEBUG: True [misc.py at line 2192] 
Expand Down Expand Up @@ -120,6 +120,6 @@ INFO: /data/avalassi/GPU2020/MG5aMC/2.7.0_gpu/CODEGEN_cudacpp_ee_mumu/src/. and
DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 178] 
quit

real 0m3.732s
user 0m0.933s
sys 0m0.140s
real 0m3.952s
user 0m0.946s
sys 0m0.125s
4 changes: 2 additions & 2 deletions epochX/cudacpp/ee_mumu.auto/SubProcesses/Bridge.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ template <typename T>
void Bridge<T>::gpu_sequence( const T *momenta, T *mes, const bool goodHelOnly )
{
constexpr int neppM = MemoryAccessMomenta::neppM;
if ( neppM == 1 ) // eventually move to "if constexpr" (need c++17, not available in cuda 11.1)
if constexpr ( neppM == 1 ) // needs c++17 and cuda >=11.2 (#333)
{
checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
}
Expand Down Expand Up @@ -297,7 +297,7 @@ void hst_transposeMomentaF2C( const T *in, T *out, const int evt )
constexpr int npar = mgOnGpu::npar;
constexpr int np4 = mgOnGpu::np4;
constexpr int neppM = MemoryAccessMomenta::neppM;
if ( neppM == 1 ) // eventually move to "if constexpr" (need c++17, not available in cuda 11.1)
if constexpr ( neppM == 1 ) // needs c++17 and cuda >=11.2 (#333)
{
memcpy( out, in, evt * npar * np4 * sizeof(T) );
}
Expand Down
26 changes: 8 additions & 18 deletions epochX/cudacpp/ee_mumu.auto/SubProcesses/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,12 @@ TESTDIR = ../../../../../test
GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/
GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a

# OpenMP flags
# OpenMP flags (on gcc this requires gcc>=9.3, issue #269)
ifneq ($(shell $(CXX) --version | grep ^Intel),)
override OMPFLAGS =
else ifeq ($(shell $(CXX) --version | grep GCC | cut -d' ' -f3 | cut -d'.' -f1),9) # disable OMP on gcc9 (issue #269)
override OMPFLAGS =
OMPFLAGS ?= -fopenmp
endif
###OMPFLAGS ?= -fopenmp # TEMPORARELY DISABLE OMP (need to reassess MT)
override OMPFLAGS = # TEMPORARELY DISABLE OMP (need to reassess MT)
$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

Expand Down Expand Up @@ -99,17 +98,8 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
CUOPTFLAGS = -lineinfo
CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
# TEMPORARY! For as long as cuda 11.0/11.1 are supported (issues #282 and #292)
# (Eventually, use only c++17; previously, c++14 in cuda110/111 and c++17 above; now c++14 everywhere)
###ifeq ($(NVCC_VERSION),11.0)
### CUFLAGS += -std=c++14
###else ifeq ($(NVCC_VERSION),11.1)
### CUFLAGS += -std=c++14
###else
### CUFLAGS += -std=c++17
###endif
CUFLAGS += -std=c++14
###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1)
CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h
# Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12)
###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12)
###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12)
Expand Down Expand Up @@ -323,7 +313,7 @@ $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MODELLIB).a $(cu_objects) $(c
$(NVCC) $< -o $@ $(cu_objects) $(cucxx_objects) $(CUARCHFLAGS) $(LIBFLAGS) $(CULIBFLAGS)

$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(cucxx_objects)
$(CXX) $< -o $@ $(cxx_objects) $(cucxx_objects) $(CPPFLAGS) $(CXXFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
$(CXX) $< -o $@ $(cxx_objects) $(cucxx_objects) $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)

$(BUILDDIR)/testxxx.o: $(GTESTLIBS)
$(BUILDDIR)/testxxx.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include
Expand Down Expand Up @@ -373,11 +363,11 @@ endif
ifeq ($(NVCC),)
# Link only runTest.o
$(testmain): $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects) $(CPPFLAGS) $(CXXFLAGS) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
$(CXX) -o $@ $(cxx_objects) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS)
else
# Link both runTest.o and runTest_cu.o
$(testmain): $(LIBDIR)/lib$(MODELLIB).a $(cxx_objects) $(cu_objects) $(GTESTLIBS)
$(NVCC) -o $@ $(cxx_objects) $(cu_objects) $(CPPFLAGS) $(CUFLAGS) -ldl $(LIBFLAGS) $(CULIBFLAGS) -lcuda -lgomp
$(NVCC) -o $@ $(cxx_objects) $(cu_objects) -ldl $(LIBFLAGS) $(CULIBFLAGS) -lcuda -lgomp
endif

$(GTESTLIBS):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,7 @@ class KernelAccessHelper : public MemoryAccessHelper<T>
__host__ __device__ inline
fptype* kernelAccessRecord( fptype* buffer )
{
//if constexpr ( !onDevice ) // FIXME! enable this when we move to nvcc supporting c++17
if ( !onDevice )
if constexpr ( !onDevice ) // requires c++17 also in CUDA (#333)
{
return T::ieventAccessRecord( buffer, 0 );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ namespace mg5amcCpu
// - private: give each thread its own copy, without initialising
// - firstprivate: give each thread its own copy, and initialise with value from outside
#ifdef MGONGPU_CPPSIMD
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,isAligned_allMEs) private (amp_sv,w_sv,jamp_sv)
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,amp_fp,w_fp,isAligned_allMEs) private (amp_sv,w_sv,jamp_sv)
#else
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV) private (amp_sv,w_sv,jamp_sv)
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,amp_fp,w_fp) private (amp_sv,w_sv,jamp_sv)
#endif
#endif
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
Expand Down Expand Up @@ -356,7 +356,10 @@ namespace mg5amcCpu
{
std::stringstream out;
// CUDA version (NVCC)
#ifdef __CUDACC__
// [Use __NVCC__ instead of __CUDACC__ here!]
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
// [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
#ifdef __NVCC__
#if defined __CUDACC_VER_MAJOR__ && defined __CUDACC_VER_MINOR__ && defined __CUDACC_VER_BUILD__
out << "nvcc " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__;
#else
Expand All @@ -369,7 +372,7 @@ namespace mg5amcCpu
#error "icc is no longer supported: please use icx"
#elif defined __INTEL_LLVM_COMPILER // alternative: __INTEL_CLANG_COMPILER
out << "icx " << __INTEL_LLVM_COMPILER;
#ifdef __CUDACC__
#ifdef __NVCC__
out << ", ";
#else
out << " (";
Expand All @@ -387,7 +390,7 @@ namespace mg5amcCpu
std::array<char, 128> tchainbuf;
while ( fgets( tchainbuf.data(), tchainbuf.size(), tchainpipe.get() ) != nullptr ) tchainout += tchainbuf.data();
tchainout.pop_back(); // remove trailing newline
#if defined __CUDACC__ or defined __INTEL_LLVM_COMPILER
#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER
out << ", gcc " << tchainout;
#else
out << " (gcc " << tchainout << ")";
Expand All @@ -403,7 +406,7 @@ namespace mg5amcCpu
out << "gcc UNKNOWKN";
#endif
#endif
#if defined __CUDACC__ or defined __INTEL_LLVM_COMPILER
#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER
out << ")";
#endif
return out.str();
Expand Down
Loading