-
Notifications
You must be signed in to change notification settings - Fork 13
Subgroup2 Benchmark #190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Subgroup2 Benchmark #190
Conversation
{ | ||
NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; | ||
|
||
uint32_t subgroupSize; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
redundant, you know and will set it up-front
// Thin wrappers around the `nbl::hlsl` binary operators. Each wrapper adds:
//  - `base_t`       : the wrapped operator type,
//  - `BindingIndex` : the index used to select this operation's entry in the
//                     `output` buffer array (`output[binop<T>::BindingIndex]`),
//  - `name`         : a printable label, compiled only when `__HLSL_VERSION` is
//                     not defined (i.e. not seen by the HLSL compiler).
// Every label matches the operator that the wrapper actually derives from.

// Bitwise AND, output slot 0.
template<typename T>
struct bit_and : nbl::hlsl::bit_and<T>
{
    using base_t = nbl::hlsl::bit_and<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "bit_and";
#endif
};

// Bitwise OR, output slot 1.
template<typename T>
struct bit_or : nbl::hlsl::bit_or<T>
{
    using base_t = nbl::hlsl::bit_or<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
#ifndef __HLSL_VERSION
    // Was "bit_xor": the label was swapped with the one in `bit_xor`.
    static inline constexpr const char* name = "bit_or";
#endif
};

// Bitwise XOR, output slot 2.
template<typename T>
struct bit_xor : nbl::hlsl::bit_xor<T>
{
    using base_t = nbl::hlsl::bit_xor<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
#ifndef __HLSL_VERSION
    // Was "bit_or": the label was swapped with the one in `bit_or`.
    static inline constexpr const char* name = "bit_xor";
#endif
};

// Addition, output slot 3.
template<typename T>
struct plus : nbl::hlsl::plus<T>
{
    using base_t = nbl::hlsl::plus<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "plus";
#endif
};

// Multiplication, output slot 4.
template<typename T>
struct multiplies : nbl::hlsl::multiplies<T>
{
    using base_t = nbl::hlsl::multiplies<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "multiplies";
#endif
};

// Minimum, output slot 5.
template<typename T>
struct minimum : nbl::hlsl::minimum<T>
{
    using base_t = nbl::hlsl::minimum<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "minimum";
#endif
};

// Maximum, output slot 6.
template<typename T>
struct maximum : nbl::hlsl::maximum<T>
{
    using base_t = nbl::hlsl::maximum<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "maximum";
#endif
};

// Ballot variant, output slot 7. It reuses `nbl::hlsl::plus` as its operator
// and is labelled "bitcount".
template<typename T>
struct ballot : nbl::hlsl::plus<T>
{
    using base_t = nbl::hlsl::plus<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
#ifndef __HLSL_VERSION
    static inline constexpr const char* name = "bitcount";
#endif
};
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for the benchmark, let's only benchmark `plus` and `ballot`
static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
{ | ||
using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>; | ||
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make the "use-native" come from a define too
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cause you want 2 different pipelines and 2 different test runs for each
[unroll] | ||
for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) | ||
{ | ||
sourceVal[i] = inputValue[idx + i]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
eeh you've left perf on the table, your subgroup is now doing a heavily strided load
you need to load differently:
sourceVal[i] = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE*ITEMS_PER_INVOCATION+((SubgroupID*ITEMS_PER_INVOCATION+i)<<SubgroupSizeLog2)+SubgroupInvocationIndex;
so consecutive invocations load consecutive memory locations during a lock-step
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or make the inputValue
a ByteBuffer and do a templated Load from it of a whole type_t
for (uint32_t i = 0; i < NUM_LOOPS; i++) | ||
value = func(value); | ||
|
||
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you're storing to the wrong place, compute the index same way you compute for inputValue
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and don't store the whole vector;
store a scalar in an unrolled loop, the same way you load the input
subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just bench the plus
// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way | ||
uint32_t globalIndex(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rename to `globalFirstItemIndex` to mean that it's the first index accessed by the invocation, and add a comment that to get the next item one does not step by 1 but by SubgroupSize
// Runs one subgroup arithmetic operation on `sourceVal` and writes the result
// into the output buffer selected by `binop<T>::BindingIndex`.
//
// Template parameters:
//   binop - operator wrapper template; supplies `base_t` and `BindingIndex`
//   T     - scalar type the operator is instantiated with
//   N     - forwarded to `ArithmeticParams` (callers in this file pass ITEMS_PER_INVOCATION)
//
// Buffer layout written here: a `uint32_t` holding `gl_SubgroupSize()` at byte
// offset 0, followed by one `type_t` per `globalIndex()`.
// NOTE(review): `operation_t`, `canStore()` and `globalIndex()` are defined
// outside this excerpt - confirm their semantics before changing the offsets.
template<template<class> class binop, typename T, uint32_t N> | ||
static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
{ | ||
// TODO static assert vector<T, N> == type_t | ||
//using type_t = vector<T, N>; | ||
using config_t = nbl::hlsl::subgroup::Configuration<SUBGROUP_SIZE_LOG2>; | ||
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>; | ||
|
||
// only the invocation whose globalIndex() is 0 writes the subgroup-size header
if (globalIndex()==0u) | ||
output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize()); | ||
|
||
operation_t<params_t> func; | ||
// `func(sourceVal)` is evaluated only inside this branch, as the Store argument
if (canStore()) | ||
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); | ||
} | ||
|
||
|
||
// Loads this invocation's input from `inputValue`, runs `subtest` for seven
// operators (bit_and, bit_xor, bit_or, plus, multiplies, minimum, maximum)
// on it, and returns the loaded value unchanged.
// NOTE(review): the load reads ITEMS_PER_INVOCATION consecutive elements
// starting at globalIndex()*ITEMS_PER_INVOCATION, while `subtest` stores at
// sizeof(type_t)*globalIndex(); confirm both agree with the intended
// per-subgroup addressing when ITEMS_PER_INVOCATION > 1.
type_t test() | ||
{ | ||
// index of the first input element read by this invocation
const uint32_t idx = globalIndex() * ITEMS_PER_INVOCATION; | ||
type_t sourceVal; | ||
#if ITEMS_PER_INVOCATION > 1 | ||
// vector case: fill each component from consecutive input elements
[unroll] | ||
for (uint32_t i = 0; i < ITEMS_PER_INVOCATION; i++) | ||
{ | ||
sourceVal[i] = inputValue[idx + i]; | ||
} | ||
#else | ||
// single-item case: one direct load
sourceVal = inputValue[idx]; | ||
#endif | ||
|
||
// every operator consumes the same loaded input
subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
return sourceVal; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's delete all the testing code (it's a bit messy), and move it into the Arithmetic Unit Test example where swapchains for NSight are not needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
your indexing for input/output is all wrong anyway and should fail for any ItemsPerInvocation>1
// create 8 buffers for 8 operations | ||
for (auto i=0u; i<OutputBufferCount; i++) | ||
{ | ||
IGPUBuffer::SCreationParams params = {}; | ||
params.size = sizeof(uint32_t) + gpuinputDataBuffer->getSize(); | ||
params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; | ||
|
||
outputBuffers[i] = m_device->createBuffer(std::move(params)); | ||
auto mreq = outputBuffers[i]->getMemoryReqs(); | ||
mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); | ||
assert(mreq.memoryTypeBits); | ||
|
||
auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); | ||
assert(bufferMem.isValid()); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do just one, and only benchmark add/plus
// create dummy image | ||
dummyImg = m_device->createImage({ | ||
{ | ||
.type = IGPUImage::ET_2D, | ||
.samples = asset::ICPUImage::ESCF_1_BIT, | ||
.format = asset::EF_R16G16B16A16_SFLOAT, | ||
.extent = {WIN_W, WIN_H, 1}, | ||
.mipLevels = 1, | ||
.arrayLayers = 1, | ||
.flags = IImage::ECF_NONE, | ||
.usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT | ||
} | ||
}); | ||
if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid()) | ||
return logFail("Could not create HDR Image"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you don't need to create a dummy image to write into descriptor set, write the swapchain image instead
infos[i].info.buffer = { 0u,buff->getSize() }; | ||
infos[i].desc = std::move(buff); // save an atomic in the refcount | ||
} | ||
// write swapchain image descriptor in loop |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no, write them all into the descriptor set at once at the start; it's okay not to use them and for them to be in the wrong layout as long as they're not used
// create Descriptor Set Layout | ||
smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout; | ||
{ | ||
IGPUDescriptorSetLayout::SBinding binding[2]; | ||
for (uint32_t i = 0u; i < 2; i++) | ||
binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; | ||
binding[1].count = OutputBufferCount; | ||
dsLayout = m_device->createDescriptorSetLayout(binding); | ||
} | ||
|
||
// set and transient pool | ||
auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); | ||
testDs = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); | ||
{ | ||
IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; | ||
infos[0].desc = gpuinputDataBuffer; | ||
infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; | ||
for (uint32_t i = 1u; i <= OutputBufferCount; i++) | ||
{ | ||
auto buff = outputBuffers[i - 1]; | ||
infos[i].info.buffer = { 0u,buff->getSize() }; | ||
infos[i].desc = std::move(buff); // save an atomic in the refcount | ||
} | ||
|
||
IGPUDescriptorSet::SWriteDescriptorSet writes[2]; | ||
for (uint32_t i=0u; i<2; i++) | ||
writes[i] = {testDs.get(),i,0u,1u,infos+i}; | ||
writes[1].count = OutputBufferCount; | ||
|
||
m_device->updateDescriptorSets(2, writes, 0u, nullptr); | ||
} | ||
testPplnLayout = m_device->createPipelineLayout({}, std::move(dsLayout)); | ||
|
||
|
||
smart_refctd_ptr<IGPUDescriptorSetLayout> benchLayout; | ||
{ | ||
IGPUDescriptorSetLayout::SBinding binding[3]; | ||
for (uint32_t i = 0u; i < 2; i++) | ||
binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; | ||
binding[1].count = OutputBufferCount; | ||
binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; | ||
benchLayout = m_device->createDescriptorSetLayout(binding); | ||
} | ||
|
||
benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }); | ||
benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
delete testing code, or use one descriptor layout and pipeline layout to rule them all
|
||
virtual bool onAppTerminated() override | ||
{ | ||
delete[] inputData; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you don't do testing, you can discard the input data as soon as input buffer is made, OR even better yet
Use a Hash from your path tracer / BxDF Math PR and skip having an input buffer entirely!
std::array<smart_refctd_ptr<IGPUCommandBuffer>, MaxFramesInFlight> m_cmdBufs; | ||
ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; | ||
|
||
smart_refctd_ptr<InputSystem> m_inputSystem; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you shouldn't be getting any events
73_ArithmeticBench/main.cpp
Outdated
|
||
|
||
template<template<class> class Arithmetic> | ||
bool runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unused variable subgroupSizeLog2
73_ArithmeticBench/main.cpp
Outdated
const uint32_t workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); | ||
|
||
cmdbuf->bindComputePipeline(set.pipeline.get()); | ||
cmdbuf->bindDescriptorSets(EPBP_COMPUTE, set.pipeline->getLayout(), 0u, 1u, &benchDs.get()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the set you can bind once at start of commandbuffer if you never change the layout
73_ArithmeticBench/main.cpp
Outdated
template<template<class> class Arithmetic> | ||
BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr<const ICPUShader>&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) | ||
{ | ||
std::string arith_name = Arithmetic<bit_xor<uint32_t>>::name; // TODO all operations |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
only plus
73_ArithmeticBench/main.cpp
Outdated
options.spirvOptimizer = nullptr; | ||
//#ifndef _NBL_DEBUG | ||
// ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; | ||
// auto opt = make_smart_refctd_ptr<ISPIRVOptimizer>(std::span<ISPIRVOptimizer::E_OPTIMIZER_PASS>(&optPasses, 1)); | ||
// options.spirvOptimizer = opt.get(); | ||
//#endif | ||
options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you must use zero debug flags and a SPIR-V optimizer to get representative perf, just ask @Fletterio about his FFT examples
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's in the createShader
method in main.cpp
of example 28, but to get a standard optimizer + strip debug info you just provide an optimizer with a single strip debug info flag to the compiler.
I don't remember exactly whether this invokes other optimizations or just the strip debug
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should invoke the standard passes specified in the SPIRV compiler repo when you run with -O, but in that regard I think the intent of the optimizer is busted (providing a custom optimizer likely is intended to disable all other passes by default)
auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get(); | ||
cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); | ||
cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
best practice is to have a pool per frame in flight and reset the pool instead of the commandbuffer (and create the pool & commandbuffer without the individually resettable capability)
// bind dummy image | ||
IGPUImageView::SCreationParams viewParams = { | ||
.flags = IGPUImageView::ECF_NONE, | ||
.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, | ||
.image = dummyImg, | ||
.viewType = IGPUImageView::ET_2D, | ||
.format = dummyImg->getCreationParameters().format | ||
}; | ||
auto dummyImgView = m_device->createImageView(std::move(viewParams)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do not create an image view every frame
// barrier transition to GENERAL | ||
{ | ||
IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; | ||
imageBarriers[0].barrier = { | ||
.dep = { | ||
.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, | ||
.srcAccessMask = ACCESS_FLAGS::NONE, | ||
.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, | ||
.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ||
} | ||
}; | ||
imageBarriers[0].image = dummyImg.get(); | ||
imageBarriers[0].subresourceRange = { | ||
.aspectMask = IImage::EAF_COLOR_BIT, | ||
.baseMipLevel = 0u, | ||
.levelCount = 1u, | ||
.baseArrayLayer = 0u, | ||
.layerCount = 1u | ||
}; | ||
imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; | ||
imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; | ||
|
||
cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you don't actually touch the image, you don't need to transition it (you may need to transition right after creation / first frame so validation layer doesn't complain about it being in UNDEFINED layout)
video::IGPUDescriptorSet::SDescriptorInfo dsInfo; | ||
dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; | ||
dsInfo.desc = dummyImgView; | ||
|
||
IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] = | ||
{ | ||
{ | ||
.dstSet = benchDs.get(), | ||
.binding = 2u, | ||
.arrayElement = 0u, | ||
.count = 1u, | ||
.info = &dsInfo, | ||
} | ||
}; | ||
m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't write descriptor set every frame, and write the swapchain images instead
const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; | ||
const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; | ||
|
||
const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unused var
73_ArithmeticBench/main.cpp
Outdated
|
||
const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); | ||
|
||
bool passed = true; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you don't check, so maybe make runBenchmark
return void?
73_ArithmeticBench/main.cpp
Outdated
passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[0], elementCount, SubgroupSizeLog2); | ||
passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[1], elementCount, SubgroupSizeLog2); | ||
passed = runBenchmark<emulatedScanInclusive>(cmdbuf, benchSets[2], elementCount, SubgroupSizeLog2); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you don't need the CPU validator emulatedScanInclusive
for anything, untemplate the method
// blit | ||
{ | ||
IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; | ||
imageBarriers[0].barrier = { | ||
.dep = { | ||
.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, | ||
.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, | ||
.dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, | ||
.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT | ||
} | ||
}; | ||
imageBarriers[0].image = dummyImg.get(); | ||
imageBarriers[0].subresourceRange = { | ||
.aspectMask = IImage::EAF_COLOR_BIT, | ||
.baseMipLevel = 0u, | ||
.levelCount = 1u, | ||
.baseArrayLayer = 0u, | ||
.layerCount = 1u | ||
}; | ||
imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; | ||
imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; | ||
|
||
imageBarriers[1].barrier = { | ||
.dep = { | ||
.srcStageMask = PIPELINE_STAGE_FLAGS::NONE, | ||
.srcAccessMask = ACCESS_FLAGS::NONE, | ||
.dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, | ||
.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT | ||
} | ||
}; | ||
imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); | ||
imageBarriers[1].subresourceRange = { | ||
.aspectMask = IImage::EAF_COLOR_BIT, | ||
.baseMipLevel = 0u, | ||
.levelCount = 1u, | ||
.baseArrayLayer = 0u, | ||
.layerCount = 1u | ||
}; | ||
imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; | ||
imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; | ||
|
||
cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); | ||
} | ||
|
||
{ | ||
IGPUCommandBuffer::SImageBlit regions[] = { { | ||
.srcMinCoord = {0,0,0}, | ||
.srcMaxCoord = {WIN_W,WIN_H,1}, | ||
.dstMinCoord = {0,0,0}, | ||
.dstMaxCoord = {WIN_W,WIN_H,1}, | ||
.layerCount = 1, | ||
.srcBaseLayer = 0, | ||
.dstBaseLayer = 0, | ||
.srcMipLevel = 0, | ||
.dstMipLevel = 0, | ||
.aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT | ||
} }; | ||
|
||
auto srcImg = dummyImg.get(); | ||
auto scRes = static_cast<CDefaultSwapchainFramebuffers*>(m_surface->getSwapchainResources()); | ||
auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); | ||
|
||
cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
blit is not needed
// barrier transition to PRESENT | ||
{ | ||
IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; | ||
imageBarriers[0].barrier = { | ||
.dep = { | ||
.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, | ||
.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, | ||
.dstStageMask = PIPELINE_STAGE_FLAGS::NONE, | ||
.dstAccessMask = ACCESS_FLAGS::NONE | ||
} | ||
}; | ||
imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); | ||
imageBarriers[0].subresourceRange = { | ||
.aspectMask = IImage::EAF_COLOR_BIT, | ||
.baseMipLevel = 0u, | ||
.levelCount = 1u, | ||
.baseArrayLayer = 0u, | ||
.layerCount = 1u | ||
}; | ||
imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; | ||
imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; | ||
|
||
cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
transition once to PRESENT and never touch later on
std::string caption = "[Nabla Engine] Geometry Creator"; | ||
{ | ||
caption += ", displaying [all objects]"; | ||
m_window->setCaption(caption); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't change captions
// options.spirvOptimizer = opt.get(); | ||
//#endif | ||
#ifndef _NBL_DEBUG | ||
ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
check if adding hardcore O3 passes + aggressive inlining makes any perf diff
73_ArithmeticBench/main.cpp
Outdated
uint32_t ItemsPerInvocation = 1u; | ||
uint32_t ItemsPerInvocation = 4u; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
test all item counts in the loop (so 3x4 tests)
Should close, replace with #192 |
No description provided.