madgraph5 · roiser · Jun 10, 2024 · Sep 14, 2023 · Sep 22, 2023 · Oct 5, 2023
@@ -2,9 +2,9 @@ name: C/C++ CI
 
 on:
   push:
-    branches: [ master ]
+    branches: [ master, master_june24 ]
   pull_request:
-    branches: [ master ]
+    branches: [ master, master_june24 ]
 
 jobs:
   debug_builds:

@@ -16,7 +16,7 @@ on:
 
   # Trigger the all-processes workflow for pull requests to master
   pull_request:
-    branches: [ master ]
+    branches: [ master, master_june24 ]
 
   # Trigger the all-processes workflow when new changes to the workflow are pushed
   # (NB: this is now disabled to avoid triggering two jobs when pushing to a branch for which a PR is opened)

@@ -35,7 +35,7 @@ on:
         ###type: string
         type: choice
         # FIXME? Can the list of supported processes be specified only once in oneprocess.yml or allprocesses.yml?
-        options: [gg_tt.mad, gg_ttg.mad, gg_ttgg.mad, gg_ttggg.mad, ee_mumu.mad, nobm_pp_ttW.mad]
+        options: [gg_tt.mad, gg_ttg.mad, gg_ttgg.mad, gg_ttggg.mad, ee_mumu.mad, nobm_pp_ttW.mad, susy_gg_tt.mad]
 
 #----------------------------------------------------------------------------------------------------------------------------------
 

diff --git a/.gitmodules b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "MG5aMC/mg5amcnlo"]
 	path = MG5aMC/mg5amcnlo
 	url = https://github.com/mg5amcnlo/mg5amcnlo
-	branch = gpucpp
+	branch = gpucpp_wrap
diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
@@ -1,161 +1,3 @@
-diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
-index 4fbb8e6ba..f9e2335de 100644
---- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
-+++ a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f
-@@ -484,23 +484,140 @@ C
-       INTEGER VECSIZE_USED
-
-       INTEGER IVEC
--
--
-+      INTEGER IEXT
-+
-+      INTEGER                    ISUM_HEL
-+      LOGICAL                    MULTI_CHANNEL
-+      COMMON/TO_MATRIX/ISUM_HEL, MULTI_CHANNEL
-+
-+      LOGICAL FIRST_CHID
-+      SAVE FIRST_CHID
-+      DATA FIRST_CHID/.TRUE./
-+      
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      INCLUDE 'coupl.inc' ! for ALL_G
-+      INCLUDE 'fbridge.inc'
-+      INCLUDE 'fbridge_common.inc'
-+      INCLUDE 'genps.inc'
-+      INCLUDE 'run.inc'
-+      DOUBLE PRECISION OUT2(VECSIZE_MEMMAX)
-+      INTEGER SELECTED_HEL2(VECSIZE_MEMMAX)
-+      INTEGER SELECTED_COL2(VECSIZE_MEMMAX)
-+      DOUBLE PRECISION CBYF1
-+      INTEGER*4 NGOODHEL, NTOTHEL
-+
-+      INTEGER*4 NWARNINGS
-+      SAVE NWARNINGS
-+      DATA NWARNINGS/0/
-+      
-+      LOGICAL FIRST
-+      SAVE FIRST
-+      DATA FIRST/.TRUE./
-+      
-+      IF( FBRIDGE_MODE .LE. 0 ) THEN ! (FortranOnly=0 or BothQuiet=-1 or BothDebug=-2)
-+#endif
-+        call counters_smatrix1multi_start( -1, VECSIZE_USED ) ! fortran=-1
- !$OMP PARALLEL
- !$OMP DO
--      DO IVEC=1, VECSIZE_USED
--        CALL SMATRIX1(P_MULTI(0,1,IVEC),
--     &	                         hel_rand(IVEC),
--     &                           col_rand(IVEC),
--     &				 channel,
--     &                           IVEC,
--     &				 out(IVEC),
--     &				 selected_hel(IVEC),
--     &				 selected_col(IVEC)
--     &				 )
--      ENDDO
-+        DO IVEC=1, VECSIZE_USED
-+          CALL SMATRIX1(P_MULTI(0,1,IVEC),
-+     &      hel_rand(IVEC),
-+     &      col_rand(IVEC),
-+     &      channel,
-+     &      IVEC,
-+     &      out(IVEC),
-+     &      selected_hel(IVEC),
-+     &      selected_col(IVEC)
-+     &      )
-+        ENDDO
- !$OMP END DO
- !$OMP END PARALLEL
-+        call counters_smatrix1multi_stop( -1 ) ! fortran=-1
-+#ifdef MG5AMC_MEEXPORTER_CUDACPP
-+      ENDIF
-+
-+      IF( FBRIDGE_MODE .EQ. 1 .OR. FBRIDGE_MODE .LT. 0 ) THEN ! (CppOnly=1 or BothQuiet=-1 or BothDebug=-2)
-+        IF( LIMHEL.NE.0 ) THEN
-+          WRITE(6,*) 'ERROR! The cudacpp bridge only supports LIMHEL=0'
-+          STOP
-+        ENDIF
-+        IF ( FIRST ) THEN ! exclude first pass (helicity filtering) from timers (#461)
-+          CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled for helicity filtering
-+     &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 )
-+          FIRST = .FALSE.
-+c         ! This is a workaround for https://github.com/oliviermattelaer/mg5amc_test/issues/22 (see PR #486)
-+          IF( FBRIDGE_MODE .EQ. 1 ) THEN ! (CppOnly=1 : SMATRIX1 is not called at all)
-+            CALL RESET_CUMULATIVE_VARIABLE() ! mimic 'avoid bias of the initialization' within SMATRIX1
-+          ENDIF
-+          CALL FBRIDGEGETNGOODHEL(FBRIDGE_PBRIDGE,NGOODHEL,NTOTHEL)
-+          IF( NTOTHEL .NE. NCOMB ) THEN
-+            WRITE(6,*) 'ERROR! Cudacpp/Fortran mismatch',
-+     &        ' in total number of helicities', NTOTHEL, NCOMB
-+            STOP
-+          ENDIF
-+          WRITE (6,*) 'NGOODHEL =', NGOODHEL
-+          WRITE (6,*) 'NCOMB =', NCOMB
-+        ENDIF
-+        call counters_smatrix1multi_start( 0, VECSIZE_USED ) ! cudacpp=0
-+        IF ( .NOT. MULTI_CHANNEL ) THEN
-+          CALL FBRIDGESEQUENCE_NOMULTICHANNEL( FBRIDGE_PBRIDGE, ! multi channel disabled
-+     &      P_MULTI, ALL_G, HEL_RAND, COL_RAND, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 )
-+        ELSE
-+          IF( SDE_STRAT.NE.1 ) THEN
-+            WRITE(6,*) 'ERROR! The cudacpp bridge requires SDE=1' ! multi channel single-diagram enhancement strategy
-+            STOP
-+          ENDIF
-+          CALL FBRIDGESEQUENCE(FBRIDGE_PBRIDGE, P_MULTI, ALL_G,
-+     &      HEL_RAND, COL_RAND, CHANNEL, OUT2,
-+     &      SELECTED_HEL2, SELECTED_COL2 ) ! 1-N: multi channel enabled
-+        ENDIF
-+        call counters_smatrix1multi_stop( 0 ) ! cudacpp=0
-+      ENDIF
-+
-+      IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
-+        DO IVEC=1, VECSIZE_USED
-+          CBYF1 = OUT2(IVEC)/OUT(IVEC) - 1
-+          FBRIDGE_NCBYF1 = FBRIDGE_NCBYF1 + 1
-+          FBRIDGE_CBYF1SUM = FBRIDGE_CBYF1SUM + CBYF1
-+          FBRIDGE_CBYF1SUM2 = FBRIDGE_CBYF1SUM2 + CBYF1 * CBYF1
-+          IF( CBYF1 .GT. FBRIDGE_CBYF1MAX ) FBRIDGE_CBYF1MAX = CBYF1
-+          IF( CBYF1 .LT. FBRIDGE_CBYF1MIN ) FBRIDGE_CBYF1MIN = CBYF1
-+          IF( FBRIDGE_MODE .EQ. -2 ) THEN ! (BothDebug=-2)
-+            WRITE (*,'(I4,2E16.8,F23.11,I3,I3,I4,I4)')
-+     &        IVEC, OUT(IVEC), OUT2(IVEC), 1+CBYF1,
-+     &        SELECTED_HEL(IVEC), SELECTED_HEL2(IVEC),
-+     &        SELECTED_COL(IVEC), SELECTED_COL2(IVEC)
-+          ENDIF
-+          IF( ABS(CBYF1).GT.5E-5 .AND. NWARNINGS.LT.20 ) THEN
-+            NWARNINGS = NWARNINGS + 1
-+            WRITE (*,'(A,I4,A,I4,2E16.8,F23.11)')
-+     &        'WARNING! (', NWARNINGS, '/20) Deviation more than 5E-5',
-+     &        IVEC, OUT(IVEC), OUT2(IVEC), 1+CBYF1
-+          ENDIF
-+        END DO
-+      ENDIF
-+
-+      IF( FBRIDGE_MODE .EQ. 1 .OR. FBRIDGE_MODE .LT. 0 ) THEN ! (CppOnly=1 or BothQuiet=-1 or BothDebug=-2)
-+        DO IVEC=1, VECSIZE_USED
-+          OUT(IVEC) = OUT2(IVEC) ! use the cudacpp ME instead of the fortran ME!
-+          SELECTED_HEL(IVEC) = SELECTED_HEL2(IVEC) ! use the cudacpp helicity instead of the fortran helicity!
-+          SELECTED_COL(IVEC) = SELECTED_COL2(IVEC) ! use the cudacpp color instead of the fortran color!
-+        END DO
-+      ENDIF
-+#endif
-+
-+      IF ( FIRST_CHID ) THEN
-+        IF ( MULTI_CHANNEL ) THEN
-+          WRITE (*,*) 'MULTI_CHANNEL = TRUE'
-+        ELSE
-+          WRITE (*,*) 'MULTI_CHANNEL = FALSE'
-+        ENDIF
-+        WRITE (*,*) 'CHANNEL_ID =', CHANNEL
-+        FIRST_CHID = .FALSE.
-+      ENDIF
-+
-       RETURN
-       END
-
 diff --git b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f
 index 71fbf2b25..0f1d199fc 100644
 --- b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py
@@ -67,6 +67,8 @@ def reset_simd(self, old_value, new_value, name):
         if name == "vector_size" and new_value <= int(old_value):
             # code can handle the new size -> do not recompile
             return
+
+        # OK, we need to force recompilation of the cpp part
         Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source')
         subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 
@@ -103,7 +105,7 @@ def default_setup(self):
     def write_one_include_file(self, output_dir, incname, output_file=None):
         """write one include file at the time"""
         if incname == "vector.inc":
-            if 'vector_size' not in self.user_set: return
+            if 'vector_size' not in self.user_set and 'wrap_size' not in self.user_set: return
             if output_file is None: vectorinc=pjoin(output_dir,incname)
             else: vectorinc=output_file
             with open(vectorinc+'.new','w') as fileout:

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -107,7 +107,7 @@ namespace mg5amcCpu
      * @param gs the pointer to the input Gs (running QCD coupling constant alphas)
      * @param rndhel the pointer to the input random numbers for helicity selection
      * @param rndcol the pointer to the input random numbers for color selection
-     * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
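+     * @param channelIds the pointer to the input channel ids, one per event (Feynman diagram to enhance in multi-channel mode if 1 to n); if null, the stored channel ids are left unchanged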
      * @param mes the pointer to the output matrix elements
      * @param goodHelOnly quit after computing good helicities?
      * @param selhel the pointer to the output selected helicities
@@ -117,7 +117,7 @@ namespace mg5amcCpu
                        const FORTRANFPTYPE* gs,
                        const FORTRANFPTYPE* rndhel,
                        const FORTRANFPTYPE* rndcol,
-                       const unsigned int channelId,
+                       const unsigned int* channelIds,
                        FORTRANFPTYPE* mes,
                        int* selhel,
                        int* selcol,
@@ -130,7 +130,7 @@ namespace mg5amcCpu
      * @param gs the pointer to the input Gs (running QCD coupling constant alphas)
      * @param rndhel the pointer to the input random numbers for helicity selection
      * @param rndcol the pointer to the input random numbers for color selection
-     * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
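+     * @param channelIds the pointer to the input channel ids, one per event (Feynman diagram to enhance in multi-channel mode if 1 to n); if null, the stored channel ids are left unchanged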
      * @param mes the pointer to the output matrix elements
      * @param selhel the pointer to the output selected helicities
      * @param selcol the pointer to the output selected colors
@@ -140,7 +140,7 @@ namespace mg5amcCpu
                        const FORTRANFPTYPE* gs,
                        const FORTRANFPTYPE* rndhel,
                        const FORTRANFPTYPE* rndcol,
-                       const unsigned int channelId,
+                       const unsigned int* channelIds,
                        FORTRANFPTYPE* mes,
                        int* selhel,
                        int* selcol,
@@ -168,12 +168,14 @@ namespace mg5amcCpu
     DeviceBufferMatrixElements m_devMEs;
     DeviceBufferSelectedHelicity m_devSelHel;
     DeviceBufferSelectedColor m_devSelCol;
+    DeviceBufferChannelIds m_devChanIds;
     PinnedHostBufferGs m_hstGs;
     PinnedHostBufferRndNumHelicity m_hstRndHel;
     PinnedHostBufferRndNumColor m_hstRndCol;
     PinnedHostBufferMatrixElements m_hstMEs;
     PinnedHostBufferSelectedHelicity m_hstSelHel;
     PinnedHostBufferSelectedColor m_hstSelCol;
+    PinnedHostBufferChannelIds m_hstChanIds;
     std::unique_ptr<MatrixElementKernelDevice> m_pmek;
     //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT)
     static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT)
@@ -185,6 +187,7 @@ namespace mg5amcCpu
     HostBufferMatrixElements m_hstMEs;
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
+    HostBufferChannelIds m_hstChanIds;
     std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
@@ -227,6 +230,7 @@ namespace mg5amcCpu
     , m_devMEs( m_nevt )
     , m_devSelHel( m_nevt )
     , m_devSelCol( m_nevt )
+    , m_devChanIds( m_nevt )
 #else
     , m_hstMomentaC( m_nevt )
 #endif
@@ -236,11 +240,15 @@ namespace mg5amcCpu
     , m_hstMEs( m_nevt )
     , m_hstSelHel( m_nevt )
     , m_hstSelCol( m_nevt )
+    , m_hstChanIds( m_nevt )
     , m_pmek( nullptr )
   {
     if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" );
     if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" );
 #ifdef MGONGPUCPP_GPUIMPL
+    // This memory is allocated with cuda/hipMallocHost. The documentation does not guarantee
+    // that it is zero-initialized, but we rely on that later on in sigmaKin, so zero it explicitly
+    std::fill_n( m_hstChanIds.data(), m_nevt, 0 );
     if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -252,10 +260,10 @@ namespace mg5amcCpu
     }
     std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads
               << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
-    m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) );
+    m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChanIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) );
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
-    m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+    m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChanIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
@@ -297,7 +305,7 @@ namespace mg5amcCpu
                                             const FORTRANFPTYPE* gs,
                                             const FORTRANFPTYPE* rndhel,
                                             const FORTRANFPTYPE* rndcol,
-                                            const unsigned int channelId,
+                                            const unsigned int* channelIds,
                                             FORTRANFPTYPE* mes,
                                             int* selhel,
                                             int* selcol,
@@ -327,6 +335,7 @@ namespace mg5amcCpu
       std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() );
       std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() );
     }
+    if( channelIds ) memcpy( m_hstChanIds.data(), channelIds, m_nevt * sizeof( unsigned int ) );
     copyDeviceFromHost( m_devGs, m_hstGs );
     copyDeviceFromHost( m_devRndHel, m_hstRndHel );
     copyDeviceFromHost( m_devRndCol, m_hstRndCol );
@@ -336,7 +345,8 @@ namespace mg5amcCpu
       if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" );
     }
     if( goodHelOnly ) return;
-    m_pmek->computeMatrixElements( channelId );
+    copyDeviceFromHost( m_devChanIds, m_hstChanIds );
+    m_pmek->computeMatrixElements();
     copyHostFromDevice( m_hstMEs, m_devMEs );
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
     copyHostFromDevice( m_hstSelHel, m_devSelHel );
@@ -362,7 +372,7 @@ namespace mg5amcCpu
                                             const FORTRANFPTYPE* gs,
                                             const FORTRANFPTYPE* rndhel,
                                             const FORTRANFPTYPE* rndcol,
-                                            const unsigned int channelId,
+                                            const unsigned int* channelIds,
                                             FORTRANFPTYPE* mes,
                                             int* selhel,
                                             int* selcol,
@@ -387,7 +397,8 @@ namespace mg5amcCpu
       if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" );
     }
     if( goodHelOnly ) return;
-    m_pmek->computeMatrixElements( channelId );
+    if( channelIds ) memcpy( m_hstChanIds.data(), channelIds, m_nevt * sizeof( unsigned int ) );
+    m_pmek->computeMatrixElements();
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {