-
Notifications
You must be signed in to change notification settings - Fork 13
Subgroup2 Benchmark #190
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Subgroup2 Benchmark #190
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
8090a2d
initial benchmark example copy
keptsecret 3a2ff14
test subgroup2 funcs correct
keptsecret dd021a0
fix test
keptsecret ca21941
benchmarking shader + pipeline working
keptsecret 0bb41db
begin adding fake frames for nsight profiler
keptsecret 24a93bb
merge master, fix conflicts
keptsecret 17dda8e
re-numbered example to avoid duplicate
keptsecret 3d4e0f2
fake frames for nsight
keptsecret 0192999
use correct shader, spirv line dbinfo for nsight
keptsecret 8c9d55e
support for 1 item per invoc
keptsecret 07d6980
handle when items per invoc =1
keptsecret be756d5
minor fixes
keptsecret 1963b51
changes in Param, Config usage
keptsecret 99cf5d8
coalesced load/store data
keptsecret 1d5e433
Merge branch 'master' into scan_perf_bench
keptsecret a3bb526
fixed some bugs
keptsecret 355c605
disable test by default
keptsecret 6b57674
refactor to load data as vectors, consecutive uints
keptsecret f717206
Merge branch 'master' into scan_perf_bench
keptsecret File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
|
||
# Pull in the shared `common` CMake module; RES is empty/false when it could not be found. | ||
include(common RESULT_VARIABLE RES) | ||
if(NOT RES) | ||
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") | ||
endif() | ||
|
||
# Create the example's executable target; only the PCH target argument is non-empty. | ||
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") | ||
|
||
# Optionally embed everything under app_resources/ into the executable as builtin resources. | ||
if(NBL_EMBED_BUILTIN_RESOURCES) | ||
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) | ||
set(RESOURCE_DIR "app_resources") | ||
|
||
# Absolute paths: where resources are searched for, and where generated sources/headers go. | ||
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) | ||
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) | ||
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) | ||
|
||
# Collect every file below the resource directory (paths relative to it) and register each one. | ||
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") | ||
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) | ||
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") | ||
endforeach() | ||
|
||
# Generate the builtin-resource target under the nbl::this_example::builtin namespace. | ||
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") | ||
|
||
# Make the embedded resources available to the executable. | ||
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) | ||
endif() |
54 changes: 54 additions & 0 deletions
54
73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#pragma shader_stage(compute) | ||
|
||
#define operation_t nbl::hlsl::OPERATION | ||
|
||
#include "shaderCommon.hlsl" | ||
|
||
// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders | ||
[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy | ||
|
||
// Flat index of this invocation: the workgroup's base offset plus the subgroup-contiguous index inside the workgroup. | ||
uint32_t globalIndex() | ||
{ | ||
const uint32_t workgroupOffset = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE; | ||
return workgroupOffset+nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
} | ||
|
||
// Every invocation of this shader is allowed to write its result. | ||
bool canStore() | ||
{ | ||
return true; | ||
} | ||
|
||
#ifndef NUM_LOOPS | ||
#error "Define NUM_LOOPS!" | ||
#endif | ||
|
||
// Benchmarks one binary operation: applies `operation_t<params_t>` NUM_LOOPS times, feeding each | ||
// result back in as the next input, then writes the final value to that operation's output buffer. | ||
// binop - functor wrapper; provides `base_t` for the arithmetic params and `BindingIndex` for the output slot | ||
// T, N - element type and item count forwarded to `binop<T>` and `ArithmeticParams` | ||
// sourceVal - value the loop starts from | ||
template<template<class> class binop, typename T, uint32_t N> | ||
static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
{ | ||
using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>; | ||
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>; | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
type_t value = sourceVal; | ||
|
||
||
operation_t<params_t> func; | ||
// [unroll] | ||
for (uint32_t i = 0; i < NUM_LOOPS; i++) | ||
value = func(value); | ||
|
||
||
// the first uint32_t of the buffer is skipped; each invocation then owns one type_t slot at globalIndex() | ||
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// Loads this invocation's input once and runs the benchmark loop for each of the seven binary operations. | ||
void benchmark() | ||
{ | ||
const uint32_t idx = globalIndex(); | ||
type_t sourceVal = inputValue[idx]; | ||
|
||
||
// every subbench starts from the same sourceVal and writes to its own output binding | ||
subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
// Compute entry point: WORKGROUP_SIZE invocations along X, each running the full benchmark. | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
benchmark(); | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
#include "nbl/builtin/hlsl/functional.hlsl" | ||
|
||
// Layout of one output buffer: a single uint32_t `subgroupSize` followed by `ScanElementCount` uint32_t values. | ||
// kScanElementCount - number of uint32_t elements in `data` (defaults to 1024*1024) | ||
template<uint32_t kScanElementCount=1024*1024> | ||
struct Output | ||
{ | ||
NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; | ||
|
||
||
// leading header word | ||
uint32_t subgroupSize; | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// payload that follows the header | ||
uint32_t data[ScanElementCount]; | ||
}; | ||
|
||
// Example-local wrapper around nbl::hlsl::bit_and<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (0) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL. | ||
template<typename T> | ||
struct bit_and : nbl::hlsl::bit_and<T> | ||
{ | ||
using base_t = nbl::hlsl::bit_and<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "bit_and"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::bit_or<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (1) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL; it must match the wrapped operation. | ||
template<typename T> | ||
struct bit_or : nbl::hlsl::bit_or<T> | ||
{ | ||
using base_t = nbl::hlsl::bit_or<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "bit_or"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::bit_xor<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (2) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL; it must match the wrapped operation. | ||
template<typename T> | ||
struct bit_xor : nbl::hlsl::bit_xor<T> | ||
{ | ||
using base_t = nbl::hlsl::bit_xor<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "bit_xor"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::plus<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (3) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL. | ||
template<typename T> | ||
struct plus : nbl::hlsl::plus<T> | ||
{ | ||
using base_t = nbl::hlsl::plus<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "plus"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::multiplies<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (4) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL. | ||
template<typename T> | ||
struct multiplies : nbl::hlsl::multiplies<T> | ||
{ | ||
using base_t = nbl::hlsl::multiplies<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "multiplies"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::minimum<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (5) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL. | ||
template<typename T> | ||
struct minimum : nbl::hlsl::minimum<T> | ||
{ | ||
using base_t = nbl::hlsl::minimum<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "minimum"; | ||
#endif | ||
}; | ||
// Example-local wrapper around nbl::hlsl::maximum<T>. | ||
// `base_t` names the wrapped functor, `BindingIndex` (6) is the slot used when indexing the `output` array, | ||
// and `name` is a label that only exists when not compiling as HLSL. | ||
template<typename T> | ||
struct maximum : nbl::hlsl::maximum<T> | ||
{ | ||
using base_t = nbl::hlsl::maximum<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "maximum"; | ||
#endif | ||
}; | ||
|
||
// Tag for the ballot bit-count results: built on nbl::hlsl::plus<T>, uses output slot 7, | ||
// and carries the non-HLSL label "bitcount". | ||
template<typename T> | ||
struct ballot : nbl::hlsl::plus<T> | ||
{ | ||
using base_t = nbl::hlsl::plus<T>; | ||
|
||
||
NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; | ||
#ifndef __HLSL_VERSION | ||
static inline constexpr const char* name = "bitcount"; | ||
#endif | ||
}; | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#include "nbl/builtin/hlsl/subgroup/basic.hlsl" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#include "common.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup/basic.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" | ||
|
||
// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 | ||
// Workgroup dimensions for this shader: WORKGROUP_SIZE invocations along X, one along Y and Z. | ||
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() | ||
{ | ||
return uint32_t3(WORKGROUP_SIZE,1,1); | ||
} | ||
|
||
#ifndef ITEMS_PER_INVOCATION | ||
#error "Define ITEMS_PER_INVOCATION!" | ||
#endif | ||
|
||
typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t; | ||
|
||
// unfortunately DXC chokes on descriptors as static members | ||
// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 | ||
[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue; | ||
[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; | ||
|
||
// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way | ||
uint32_t globalIndex(); | ||
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs | ||
bool canStore(); | ||
|
||
#ifndef OPERATION | ||
#error "Define OPERATION!" | ||
#endif | ||
|
||
#ifndef SUBGROUP_SIZE_LOG2 | ||
#error "Define SUBGROUP_SIZE_LOG2!" | ||
#endif | ||
// Runs one binary operation once on `sourceVal` and records the result for later checking. | ||
// binop - functor wrapper; provides `base_t` for the arithmetic params and `BindingIndex` for the output slot | ||
// T, N - element type and item count forwarded to `binop<T>` and `ArithmeticParams` | ||
// sourceVal - this invocation's input value | ||
template<template<class> class binop, typename T, uint32_t N> | ||
static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
{ | ||
// TODO static assert vector<T, N> == type_t | ||
//using type_t = vector<T, N>; | ||
using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>; | ||
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>; | ||
|
||
||
// invocation 0 writes the subgroup size into the first uint32_t of the buffer | ||
if (globalIndex()==0u) | ||
output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize()); | ||
|
||
||
operation_t<params_t> func; | ||
// results follow the header word, one type_t slot per globalIndex(); func is only evaluated when canStore() holds | ||
if (canStore()) | ||
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); | ||
} | ||
|
||
|
||
// Loads this invocation's input, runs every binary-operation subtest on it, and returns the unmodified input. | ||
type_t test() | ||
{ | ||
const uint32_t idx = globalIndex(); | ||
type_t sourceVal = inputValue[idx]; | ||
|
||
||
// each subtest writes to the output binding selected by its functor wrapper | ||
subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(sourceVal); | ||
return sourceVal; | ||
} | ||
|
||
#include "nbl/builtin/hlsl/workgroup/basic.hlsl" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#pragma shader_stage(compute) | ||
|
||
#define operation_t nbl::hlsl::OPERATION | ||
|
||
#include "shaderCommon.hlsl" | ||
|
||
// Flat index of this invocation: WORKGROUP_SIZE entries per workgroup, ordered by subgroup-contiguous index. | ||
uint32_t globalIndex() | ||
{ | ||
const uint32_t firstIndexOfWorkgroup = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE; | ||
return firstIndexOfWorkgroup+nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
} | ||
|
||
// No invocation is masked out in the subgroup test: all of them may store. | ||
bool canStore() | ||
{ | ||
return true; | ||
} | ||
|
||
// Compute entry point: WORKGROUP_SIZE invocations along X, each running all subtests; test()'s return value is unused here. | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
test(); | ||
} |
107 changes: 107 additions & 0 deletions
107
73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#pragma shader_stage(compute) | ||
|
||
|
||
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" | ||
|
||
static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value; | ||
static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value; | ||
static const uint32_t ScratchSz = ArithmeticSz+BallotSz; | ||
|
||
// TODO: Can we make it a static variable in the ScratchProxy struct? | ||
groupshared uint32_t scratch[ScratchSz]; | ||
|
||
|
||
#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" | ||
|
||
|
||
// Accessor over the `scratch` groupshared array. Every index is relative to `offset`, so | ||
// differently-offset proxies address disjoint parts of the same array. | ||
template<uint16_t offset> | ||
struct ScratchProxy | ||
{ | ||
// reads scratch[ix+offset] into `value` | ||
void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) | ||
{ | ||
value = scratch[ix+offset]; | ||
} | ||
// writes `value` to scratch[ix+offset] | ||
void set(const uint32_t ix, const uint32_t value) | ||
{ | ||
scratch[ix+offset] = value; | ||
} | ||
|
||
||
// atomically ORs `value` into scratch[ix+offset] and forwards atomicOr's return value; | ||
// the offset must be applied here too, otherwise the write lands outside this proxy's region | ||
uint32_t atomicOr(const uint32_t ix, const uint32_t value) | ||
{ | ||
return nbl::hlsl::glsl::atomicOr(scratch[ix+offset],value); | ||
} | ||
|
||
||
// workgroup-wide barrier used between dependent accesses to the scratch memory | ||
void workgroupExecutionAndMemoryBarrier() | ||
{ | ||
nbl::hlsl::glsl::barrier(); | ||
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above | ||
} | ||
}; | ||
|
||
static ScratchProxy<0> arithmeticAccessor; | ||
|
||
|
||
#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" | ||
|
||
|
||
// Adapter that runs the workgroup-level OPERATION for `Binop` over ITEMS_PER_WG items, | ||
// using the shared `arithmeticAccessor` as scratch memory. | ||
template<class Binop, class device_capabilities> | ||
struct operation_t | ||
{ | ||
using type_t = typename Binop::type_t; | ||
|
||
||
// Applies the operation to `value` and returns its result. | ||
type_t operator()(type_t value) | ||
{ | ||
type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor); | ||
// barrier after the call, before returning: the same accessor is aliased across Binops | ||
// NOTE(review): presumably this keeps the next operation from reusing the scratch too early - confirm | ||
arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); | ||
return retval; | ||
} | ||
}; | ||
|
||
|
||
#include "shaderCommon.hlsl" | ||
|
||
static ScratchProxy<ArithmeticSz> ballotAccessor; | ||
|
||
|
||
// Flat index of this invocation: ITEMS_PER_WG entries per workgroup, ordered by subgroup-contiguous index. | ||
uint32_t globalIndex() | ||
{ | ||
const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG; | ||
return workgroupBase+nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
} | ||
|
||
// Only invocations whose subgroup-contiguous index is below ITEMS_PER_WG may write a result. | ||
bool canStore() | ||
{ | ||
const bool indexWithinItems = nbl::hlsl::workgroup::SubgroupContiguousIndex()<ITEMS_PER_WG; | ||
return indexWithinItems; | ||
} | ||
|
||
// Compute entry point of the workgroup test: runs the arithmetic subtests, then ballots the low bit of | ||
// the input and stores the matching ballot bit count into the `ballot` output binding. | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
const type_t sourceVal = test(); | ||
// invocation 0 writes the subgroup size into the first uint32_t of the ballot buffer | ||
if (globalIndex()==0u) | ||
output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize()); | ||
|
||
||
// we can only ballot booleans, so low bit | ||
// NOTE(review): sourceVal is a type_t (declared as a vector in shaderCommon.hlsl); confirm the bool() conversion is intended for more than one item | ||
nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor); | ||
// need to barrier between ballot and usages of a ballot by myself | ||
ballotAccessor.workgroupExecutionAndMemoryBarrier(); | ||
|
||
||
// sentinel value; every reachable branch below overwrites it | ||
uint32_t destVal = 0xdeadbeefu; | ||
// picks the bit-count flavour matching OPERATION by comparing template instantiations with is_same | ||
// NOTE(review): OPERATION is given two template arguments here but three inside operation_t - confirm both forms are valid | ||
#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value | ||
#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities | ||
if (CONSTEXPR_OP_TYPE_TEST(reduction)) | ||
destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor); | ||
else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) | ||
destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor); | ||
else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) | ||
destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor); | ||
else | ||
{ | ||
assert(false); | ||
} | ||
#undef BALLOT_TEMPLATE_ARGS | ||
#undef CONSTEXPR_OP_TYPE_TEST | ||
|
||
||
// result goes after the header word, one type_t slot per globalIndex(); destVal itself is a single uint32_t | ||
if (canStore()) | ||
output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.