
Commit

Merge branch 'UpdateBenchmarks' into 'master'
Update benchmarks

See merge request walberla/walberla!685
Philipp Suffa committed Oct 8, 2024
2 parents 0c0092f + a74df61 commit fb98460
Showing 94 changed files with 5,231 additions and 5,245 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -10,6 +10,7 @@ qrc_*

# CLion indexing
*.uuid
.fleet


# Generated files
@@ -32,11 +33,13 @@ qrc_*
# Visual Studio Code
/.vscode

# Zed
/.cache*

# CLion
*.idea
*.clion*


# QtCreator
CMakeLists.txt.user.*

7 changes: 1 addition & 6 deletions apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -11,11 +11,6 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
NonUniformGridCPUBoundaryCollection.h
NonUniformGridCPUInfoHeader.h)

waLBerla_add_executable( NAME NonUniformGridGenerator
FILES NonUniformGridGenerator.cpp LdcSetup.h
DEPENDS blockforest core field python_coupling )


waLBerla_add_executable( NAME NonUniformGridCPU
FILES NonUniformGridCPU.cpp LdcSetup.h
FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
144 changes: 144 additions & 0 deletions apps/benchmarks/NonUniformGridCPU/GridGeneration.h
@@ -0,0 +1,144 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once

#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"

#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"

#include <string>

#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"

using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;

using namespace walberla;

void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")

Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");

const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);

if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());

const LDC ldc(refinementDepth);

auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);

if(mpi::MPIManager::instance()->numProcesses() > 1)
return;

{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}

if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}

if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}

const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);

const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);

const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;

WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")

WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}

void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1)
{
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good())
{
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else
{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else
{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
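
As a side note on the statistics block in createSetupBlockForest above, the following is a minimal back-of-envelope sketch (not part of this commit) of the same memory estimate. The concrete inputs (D3Q19 stencil, in-place esopull streaming, float64 values, 64^3 cells per block, 512 blocks) are illustrative assumptions only:

# Illustrative re-computation of the memory estimate printed by createSetupBlockForest.
# All numeric inputs are assumptions made for the sake of the example.
Q = 19                        # D3Q19 stencil
inplace = True                # esopull streams in place, so a single PDF field suffices
pdfs_per_cell = Q if inplace else 2 * Q
values_per_cell = pdfs_per_cell + 3 + 1      # + velocity field (3) + density field (1)
bytes_per_value = 8                          # float64

cells_per_block = 64 ** 3
number_of_blocks = 512                       # assumed total over all levels

total_cells = number_of_blocks * cells_per_block
expected_memory_gb = total_cells * values_per_cell * bytes_per_value * 1e-9
print(f"expected memory demand: {expected_memory_gb:.1f} GB")   # roughly 24.7 GB here
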
10 changes: 2 additions & 8 deletions apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -48,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();

const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );

const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );

const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );

for(auto & block : forest)
{
31 changes: 22 additions & 9 deletions apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
@@ -38,6 +38,7 @@

#include <cmath>

#include "GridGeneration.h"
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
@@ -77,23 +78,25 @@ int main(int argc, char** argv)

auto config = *cfg;
logging::configureLogging(config);

auto domainSetup = config->getOneBlock("DomainSetup");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);

const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));

auto domainSetup = config->getOneBlock("DomainSetup");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");

// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
shared_ptr< BlockForest > bfs;
createBlockForest(bfs, domainSetup, blockForestSetup);

if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
{
WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
return EXIT_SUCCESS;
}

WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
auto blocks =
std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
blocks->createCellBoundingBoxes();
@@ -173,13 +176,21 @@ int main(int argc, char** argv)
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);

auto finalDomain = blocks->getDomain();
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
vtkOutput->addCellDataWriter(velWriter);

if (parameters.getParameter< bool >("writeOnlySlice", true)){
const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
}

vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
sweepCollection.calculateMacroscopicParameters(&block);
@@ -236,6 +247,8 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess",
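
For orientation, here is a rough sketch (not part of this commit) of how the performance figures exposed to the Python callback above relate to one another. The numbers are made-up example values, and the per-level timestep scaling that a refined grid introduces is ignored:

# Illustrative MLUPS arithmetic; all inputs are example values, not measurements.
number_of_cells = 134_217_728        # as exposed via "numberOfCells"
timesteps = 100
runtime_seconds = 42.0
processes = 64
cores = 512

mlups = number_of_cells * timesteps / runtime_seconds / 1e6   # million lattice updates per second
mlups_per_process = mlups / processes
mlups_per_core = mlups / cores
print(f"{mlups:.0f} MLUPS total, {mlups_per_core:.2f} MLUPS per core")
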
14 changes: 10 additions & 4 deletions apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
@@ -23,17 +23,23 @@

with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
cpu_vec = {"instruction_set": None}

streaming_pattern = 'aa'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT

fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()

assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")

@@ -50,12 +56,12 @@
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.CPU)
target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)

infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
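
The new cpu_vec = {"instruction_set": None} dictionary above leaves explicit SIMD code generation disabled and defers vectorization to the compiler. Purely as an illustrative variant (not part of this commit; the exact keys should be checked against the installed pystencils version), an instruction set can be requested through the same dictionary:

# Hypothetical alternative to the cpu_vec dictionary above: ask pystencils to
# emit AVX intrinsics instead of relying on compiler auto-vectorization.
cpu_vec = {
    "instruction_set": "avx",           # e.g. "sse", "avx", "avx512"
    "assume_inner_stride_one": True,    # the fzyx layout gives unit stride along x
}
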
