
Commit

Merge branch 'UpdateBenchmarks' into 'master'
Update benchmarks

See merge request walberla/walberla!685
Philipp Suffa committed Oct 8, 2024
2 parents 0c0092f + a74df61 commit fb98460
Showing 94 changed files with 5,231 additions and 5,245 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -10,6 +10,7 @@ qrc_*

# CLion indexing
*.uuid
.fleet


# Generated files
@@ -32,11 +33,13 @@ qrc_*
# Visual Studio Code
/.vscode

# Zed
/.cache*

# CLion
*.idea
*.clion*


# QtCreator
CMakeLists.txt.user.*

7 changes: 1 addition & 6 deletions apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -11,11 +11,6 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
NonUniformGridCPUBoundaryCollection.h
NonUniformGridCPUInfoHeader.h)

waLBerla_add_executable( NAME NonUniformGridGenerator
FILES NonUniformGridGenerator.cpp LdcSetup.h
DEPENDS blockforest core field python_coupling )


waLBerla_add_executable( NAME NonUniformGridCPU
FILES NonUniformGridCPU.cpp LdcSetup.h
FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
144 changes: 144 additions & 0 deletions apps/benchmarks/NonUniformGridCPU/GridGeneration.h
@@ -0,0 +1,144 @@
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once

#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"

#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"

#include <string>

#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"

using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;

using namespace walberla;

void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")

Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");

const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);

if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());

const LDC ldc(refinementDepth);

auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);

if(mpi::MPIManager::instance()->numProcesses() > 1)
return;

{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}

if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}

if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}

const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);

const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);

const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;

WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")

WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}

void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1)
{
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good())
{
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else
{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else
{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
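
As a side note on the statistics block in createSetupBlockForest above, the following is a minimal back-of-envelope sketch (not part of this commit) of the same memory estimate. The concrete inputs (D3Q19 stencil, in-place esopull streaming, float64 values, 64^3 cells per block, 512 blocks) are illustrative assumptions only:

# Illustrative re-computation of the memory estimate printed by createSetupBlockForest.
# All numeric inputs are assumptions made for the sake of the example.
Q = 19                        # D3Q19 stencil
inplace = True                # esopull streams in place, so a single PDF field suffices
pdfs_per_cell = Q if inplace else 2 * Q
values_per_cell = pdfs_per_cell + 3 + 1      # + velocity field (3) + density field (1)
bytes_per_value = 8                          # float64

cells_per_block = 64 ** 3
number_of_blocks = 512                       # assumed total over all levels

total_cells = number_of_blocks * cells_per_block
expected_memory_gb = total_cells * values_per_cell * bytes_per_value * 1e-9
print(f"expected memory demand: {expected_memory_gb:.1f} GB")   # roughly 24.7 GB here
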
10 changes: 2 additions & 8 deletions apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -48,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();

const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );

const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );

const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );

for(auto & block : forest)
{
31 changes: 22 additions & 9 deletions apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
@@ -38,6 +38,7 @@

#include <cmath>

#include "GridGeneration.h"
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
@@ -77,23 +78,25 @@ int main(int argc, char** argv)

auto config = *cfg;
logging::configureLogging(config);

auto domainSetup = config->getOneBlock("DomainSetup");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);

const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));

auto domainSetup = config->getOneBlock("DomainSetup");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");

// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
shared_ptr< BlockForest > bfs;
createBlockForest(bfs, domainSetup, blockForestSetup);

if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
{
WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
return EXIT_SUCCESS;
}

WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
auto blocks =
std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
blocks->createCellBoundingBoxes();
@@ -173,13 +176,21 @@ int main(int argc, char** argv)
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);

auto finalDomain = blocks->getDomain();
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
vtkOutput->addCellDataWriter(velWriter);

if (parameters.getParameter< bool >("writeOnlySlice", true)){
const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
}

vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
sweepCollection.calculateMacroscopicParameters(&block);
@@ -236,6 +247,8 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess",
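
For orientation, here is a rough sketch (not part of this commit) of how the performance figures exposed to the Python callback above relate to one another. The numbers are made-up example values, and the per-level timestep scaling that a refined grid introduces is ignored:

# Illustrative MLUPS arithmetic; all inputs are example values, not measurements.
number_of_cells = 134_217_728        # as exposed via "numberOfCells"
timesteps = 100
runtime_seconds = 42.0
processes = 64
cores = 512

mlups = number_of_cells * timesteps / runtime_seconds / 1e6   # million lattice updates per second
mlups_per_process = mlups / processes
mlups_per_core = mlups / cores
print(f"{mlups:.0f} MLUPS total, {mlups_per_core:.2f} MLUPS per core")
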
14 changes: 10 additions & 4 deletions apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
@@ -23,17 +23,23 @@

with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
cpu_vec = {"instruction_set": None}

streaming_pattern = 'aa'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT

fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()

assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")

@@ -50,12 +56,12 @@
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.CPU)
target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)

infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
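
The new cpu_vec = {"instruction_set": None} dictionary above leaves explicit SIMD code generation disabled and defers vectorization to the compiler. Purely as an illustrative variant (not part of this commit; the exact keys should be checked against the installed pystencils version), an instruction set can be requested through the same dictionary:

# Hypothetical alternative to the cpu_vec dictionary above: ask pystencils to
# emit AVX intrinsics instead of relying on compiler auto-vectorization.
cpu_vec = {
    "instruction_set": "avx",           # e.g. "sse", "avx", "avx512"
    "assume_inner_stride_one": True,    # the fzyx layout gives unit stride along x
}
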
