madgraph5 · valassi · May 23, 2023 · May 23, 2023 · May 23, 2023 · May 23, 2023
diff --git a/...CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h b/...CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumbers.h
@@ -1,88 +1,95 @@
-/*
- * CommonRandomNumbers.h
- *
- *  Created on: 04.11.2020
- *      Author: Stephan Hageboeck
- */
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef COMMONRANDOMNUMBERS_H_
-#define COMMONRANDOMNUMBERS_H_
+#define COMMONRANDOMNUMBERS_H_ 1
 
-#include <vector>
+#include <future>
 #include <random>
 #include <thread>
-#include <future>
-
-namespace CommonRandomNumbers {
-
-/// Create `n` random numbers using simple c++ engine.
-template<typename T>
-std::vector<T> generate(std::size_t n, std::minstd_rand::result_type seed = 1337) {
-  std::vector<T> result;
-  result.reserve(n);
-
-  std::minstd_rand generator(seed);
-  std::uniform_real_distribution<T> distribution(0.0, 1.0);
+#include <vector>
 
-  for (std::size_t i=0; i<n; ++i) {
-    result.push_back(distribution(generator));
-  }
+namespace CommonRandomNumbers
+{
 
-  return result;
-}
+  /// Create `n` random numbers using simple c++ engine.
+  template<typename T>
+  std::vector<T> generate( std::size_t n, std::minstd_rand::result_type seed = 1337 )
+  {
+    std::vector<T> result;
+    result.reserve( n );
 
+    std::minstd_rand generator( seed );
+    std::uniform_real_distribution<T> distribution( 0.0, 1.0 );
 
-/// Create `nBlock` blocks of random numbers.
-/// Each block uses a generator that's seeded with `seed + blockIndex`, and blocks are generated in parallel.
-template<typename T>
-std::vector<std::vector<T>> generateParallel(std::size_t nPerBlock, std::size_t nBlock, std::minstd_rand::result_type seed = 1337) {
-  std::vector<std::vector<T>> results(nBlock);
-  std::vector<std::thread> threads;
-  const auto partPerThread = nBlock/std::thread::hardware_concurrency() + (nBlock % std::thread::hardware_concurrency() != 0);
-
-  auto makeBlock = [nPerBlock,nBlock,seed,&results](std::size_t partitionBegin, std::size_t partitionEnd) {
-    for (std::size_t partition = partitionBegin; partition < partitionEnd && partition < nBlock; ++partition) {
-      results[partition] = generate<T>(nPerBlock, seed + partition);
+    for( std::size_t i = 0; i < n; ++i )
+    {
+      result.push_back( distribution( generator ) );
     }
-  };
 
-  for (unsigned int threadId = 0; threadId < std::thread::hardware_concurrency(); ++threadId) {
-    threads.emplace_back(makeBlock, threadId * partPerThread, (threadId+1) * partPerThread);
+    return result;
   }
 
-  for (auto& thread : threads) {
-    thread.join();
-  }
+  /// Create `nBlock` blocks of random numbers.
+  /// Each block uses a generator that's seeded with `seed + blockIndex`, and blocks are generated in parallel.
+  template<typename T>
+  std::vector<std::vector<T>> generateParallel( std::size_t nPerBlock, std::size_t nBlock, std::minstd_rand::result_type seed = 1337 )
+  {
+    std::vector<std::vector<T>> results( nBlock );
+    std::vector<std::thread> threads;
+    const auto partPerThread = nBlock / std::thread::hardware_concurrency() + ( nBlock % std::thread::hardware_concurrency() != 0 );
+
+    auto makeBlock = [nPerBlock, nBlock, seed, &results]( std::size_t partitionBegin, std::size_t partitionEnd )
+    {
+      for( std::size_t partition = partitionBegin; partition < partitionEnd && partition < nBlock; ++partition )
+      {
+        results[partition] = generate<T>( nPerBlock, seed + partition );
+      }
+    };
+
+    for( unsigned int threadId = 0; threadId < std::thread::hardware_concurrency(); ++threadId )
+    {
+      threads.emplace_back( makeBlock, threadId * partPerThread, ( threadId + 1 ) * partPerThread );
+    }
 
-  return results;
-}
+    for( auto& thread: threads )
+    {
+      thread.join();
+    }
 
+    return results;
+  }
 
-/// Starts asynchronous generation of random numbers. This uses as many threads as cores, and generates blocks of random numbers.
-/// These become available at unspecified times, but the blocks 0, 1, 2, ... are generated first.
-/// Each block is seeded with seed + blockIndex to generate stable sequences.
-/// \param[in/out] promises Vector of promise objects storing blocks of random numbers.
-/// \param[in] nPerBlock Configures number of entries generated per block.
-/// \param[in] nBlock Configures the number of blocks generated.
-/// \param[in] nThread Optional concurrency.
-/// \param[in] seed Optional seed.
-template<typename T>
-void startGenerateAsync(std::vector<std::promise<std::vector<T>>>& promises, std::size_t nPerBlock, std::size_t nBlock,
-    unsigned int nThread = std::thread::hardware_concurrency(), std::minstd_rand::result_type seed = 1337) {
-  promises.resize(nBlock);
-  std::vector<std::thread> threads;
-
-  auto makeBlocks = [=,&promises](std::size_t threadID) {
-    for (std::size_t partition = threadID; partition < nBlock; partition += nThread) {
-      auto values = generate<T>(nPerBlock, seed + partition);
-      promises[partition].set_value(std::move(values));
+  /// Starts asynchronous generation of random numbers. This uses as many threads as cores, and generates blocks of random numbers.
+  /// These become available at unspecified times, but the blocks 0, 1, 2, ... are generated first.
+  /// Each block is seeded with seed + blockIndex to generate stable sequences.
+  /// \param[in/out] promises Vector of promise objects storing blocks of random numbers.
+  /// \param[in] nPerBlock Configures number of entries generated per block.
+  /// \param[in] nBlock Configures the number of blocks generated.
+  /// \param[in] nThread Optional concurrency.
+  /// \param[in] seed Optional seed.
+  template<typename T>
+  void startGenerateAsync( std::vector<std::promise<std::vector<T>>>& promises, std::size_t nPerBlock, std::size_t nBlock, unsigned int nThread = std::thread::hardware_concurrency(), std::minstd_rand::result_type seed = 1337 )
+  {
+    promises.resize( nBlock );
+    std::vector<std::thread> threads;
+
+    auto makeBlocks = [=, &promises]( std::size_t threadID )
+    {
+      for( std::size_t partition = threadID; partition < nBlock; partition += nThread )
+      {
+        auto values = generate<T>( nPerBlock, seed + partition );
+        promises[partition].set_value( std::move( values ) );
+      }
+    };
+
+    for( unsigned int threadId = 0; threadId < nThread; ++threadId )
+    {
+      std::thread( makeBlocks, threadId ).detach();
     }
-  };
-
-  for (unsigned int threadId = 0; threadId < nThread; ++threadId) {
-    std::thread(makeBlocks, threadId).detach();
   }
-}
 
 }
 

diff --git a/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/...hX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -39,10 +39,6 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
-# Dependency on tools directory
-TOOLSDIR = ../../../../../tools
-INCFLAGS += -I$(TOOLSDIR)
-
 # Dependency on test directory
 TESTDIR  = ../../../../../test
 GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/

diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.00481867790222168 [0m
+[1;32mDEBUG: model prefixing  takes 0.004959821701049805 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -176,7 +176,7 @@ INFO: Creating files in directory P1_epem_mupmum
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.__init__ [1;30m[model_handling.py at line 1027][0m [0m
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1028][0m [0m
 [1;32mDEBUG:  proc_id = [0m 1 [1;30m[model_handling.py at line 1033][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fb98cd4f430> [1;30m[export_v4.py at line 6163][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_SA_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fe3be9aa430> [1;30m[export_v4.py at line 6163][0m [0m
 INFO: Creating files in directory . 
 [1;32mDEBUG:  Entering PLUGIN_OneProcessExporter.generate_process_files [1;30m[model_handling.py at line 1281][0m [0m
 [1;32mDEBUG:  self.include_multi_channel is already defined: this is madevent+second_exporter mode [1;30m[model_handling.py at line 1283][0m [0m
@@ -208,20 +208,20 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;34mWARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton [0m
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.088 s
+Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s
+Wrote files for 8 helas calls in 0.087 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
-ALOHA: aloha creates 3 routines in  0.173 s
+ALOHA: aloha creates 3 routines in  0.174 s
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 187][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 7 routines in  0.216 s
+ALOHA: aloha creates 7 routines in  0.224 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -259,6 +259,6 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.847s
-user	0m1.529s
-sys	0m0.217s
+real	0m1.810s
+user	0m1.562s
+sys	0m0.223s
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumbers.h
@@ -0,0 +1,96 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef COMMONRANDOMNUMBERS_H_
+#define COMMONRANDOMNUMBERS_H_ 1
+
+#include <future>
+#include <random>
+#include <thread>
+#include <vector>
+
+namespace CommonRandomNumbers
+{
+
+  /// Create `n` pseudo-random numbers of type `T` with std::minstd_rand seeded by `seed` (default 1337) and std::uniform_real_distribution<T>( 0.0, 1.0 ); `T` must be a floating-point type accepted by that distribution.
+  template<typename T>
+  std::vector<T> generate( std::size_t n, std::minstd_rand::result_type seed = 1337 )
+  {
+    std::vector<T> result;
+    result.reserve( n ); // reserve capacity for all n values before the push_back loop
+
+    std::minstd_rand generator( seed ); // a fresh engine per call: equal seeds restart the same engine sequence
+    std::uniform_real_distribution<T> distribution( 0.0, 1.0 );
+
+    for( std::size_t i = 0; i < n; ++i )
+    {
+      result.push_back( distribution( generator ) );
+    }
+
+    return result; // exactly n elements (empty when n == 0)
+  }
+
+  /// Create `nBlock` blocks of `nPerBlock` random numbers each; block `i` is generate<T>( nPerBlock, seed + i ), so its content does not depend on how blocks are split over threads.
+  /// Blocks are filled by std::thread workers, each handling a contiguous range of block indices, and all workers are joined before returning. NOTE(review): std::thread::hardware_concurrency() is allowed to return 0, which would make the division below divide by zero - confirm this cannot happen on supported platforms.
+  template<typename T>
+  std::vector<std::vector<T>> generateParallel( std::size_t nPerBlock, std::size_t nBlock, std::minstd_rand::result_type seed = 1337 )
+  {
+    std::vector<std::vector<T>> results( nBlock );
+    std::vector<std::thread> threads;
+    const auto partPerThread = nBlock / std::thread::hardware_concurrency() + ( nBlock % std::thread::hardware_concurrency() != 0 ); // blocks per thread, rounded up: the bool remainder test adds 1 when nBlock is not an exact multiple
+
+    auto makeBlock = [nPerBlock, nBlock, seed, &results]( std::size_t partitionBegin, std::size_t partitionEnd ) // each worker writes only results[partitionBegin..partitionEnd), and the ranges passed below do not overlap
+    {
+      for( std::size_t partition = partitionBegin; partition < partitionEnd && partition < nBlock; ++partition ) // the extra "partition < nBlock" bound clips ranges that run past the last block
+      {
+        results[partition] = generate<T>( nPerBlock, seed + partition );
+      }
+    };
+
+    for( unsigned int threadId = 0; threadId < std::thread::hardware_concurrency(); ++threadId ) // one worker per reported hardware thread; workers whose range starts at or beyond nBlock do nothing
+    {
+      threads.emplace_back( makeBlock, threadId * partPerThread, ( threadId + 1 ) * partPerThread );
+    }
+
+    for( auto& thread: threads )
+    {
+      thread.join(); // wait for every worker, so results is completely filled before it is returned
+    }
+
+    return results;
+  }
+
+  /// Starts asynchronous generation of random numbers: `nThread` detached worker threads (default: std::thread::hardware_concurrency()) generate blocks of random numbers, and this function returns without waiting for them.
+  /// Worker `k` fills blocks k, k + nThread, k + 2 * nThread, ... in that order, so blocks become available at unspecified times, but each worker produces its low-index blocks first.
+  /// Block `i` is generate<T>( nPerBlock, seed + i ), so each block's sequence is stable and independent of `nThread`.
+  /// \param[in,out] promises Vector of promise objects storing blocks of random numbers; it is resized to `nBlock` and then referenced (not copied) by the detached workers, so it must stay alive and must not be resized until every promise has been set.
+  /// \param[in] nPerBlock Configures number of entries generated per block.
+  /// \param[in] nBlock Configures the number of blocks generated.
+  /// \param[in] nThread Optional concurrency (number of worker threads); if it is 0, no worker is started and no promise is ever set.
+  /// \param[in] seed Optional seed.
+  template<typename T>
+  void startGenerateAsync( std::vector<std::promise<std::vector<T>>>& promises, std::size_t nPerBlock, std::size_t nBlock, unsigned int nThread = std::thread::hardware_concurrency(), std::minstd_rand::result_type seed = 1337 )
+  {
+    promises.resize( nBlock );
+    std::vector<std::thread> threads; // not used anywhere in this function
+
+    auto makeBlocks = [=, &promises]( std::size_t threadID ) // copies nPerBlock, nBlock, nThread and seed into the closure, but keeps only a reference to promises
+    {
+      for( std::size_t partition = threadID; partition < nBlock; partition += nThread )
+      {
+        auto values = generate<T>( nPerBlock, seed + partition );
+        promises[partition].set_value( std::move( values ) ); // publish the finished block through its promise
+      }
+    };
+
+    for( unsigned int threadId = 0; threadId < nThread; ++threadId )
+    {
+      std::thread( makeBlocks, threadId ).detach(); // detached: the thread is not joined here and keeps running after this function returns
+    }
+  }
+
+}
+
+#endif /* COMMONRANDOMNUMBERS_H_ */
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CommonRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CommonRandomNumbers.h
@@ -0,0 +1 @@
+../CommonRandomNumbers.h
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -39,10 +39,6 @@ MG5AMC_COMMONLIB = mg5amc_common
 LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 INCFLAGS += -I../../src
 
-# Dependency on tools directory
-TOOLSDIR = ../../../../../tools
-INCFLAGS += -I$(TOOLSDIR)
-
 # Dependency on test directory
 TESTDIR  = ../../../../../test
 GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/

diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0048007965087890625 [0m
+[1;32mDEBUG: model prefixing  takes 0.004700660705566406 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -201,7 +201,7 @@ ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 4 routines in  0.331 s
+ALOHA: aloha creates 4 routines in  0.232 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -230,6 +230,6 @@ INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_ee_mumu/src/.
 [1;32mDEBUG:  Entering PLUGIN_ProcessExporter.finalize [1;30m[output.py at line 196][0m [0m
 quit
 
-real	0m0.795s
-user	0m0.591s
-sys	0m0.051s
+real	0m1.242s
+user	0m0.577s
+sys	0m0.061s