Skip to content

Subgroup2 Benchmark #190

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions 73_ArithmeticBench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

# Build script for the 73_ArithmeticBench example.
# Load the shared `common` module; RES is set by include() to the resolved file, or NOTFOUND.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Project macro that sets up the executable target (presumably defines EXECUTABLE_NAME, which is used below — confirm in common.cmake).
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optionally embed everything under app_resources/ into the binary as builtin resources.
if(NBL_EMBED_BUILTIN_RESOURCES)
# Name of the helper target that carries the embedded resource data.
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute paths: where resources are searched for, and where generated sources/headers are written.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Collect every file under the resource directory (paths relative to it); CONFIGURE_DEPENDS re-checks the glob at build time.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
# Register each file in the RESOURCES_TO_EMBED list.
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Create the resource target under the nbl::this_example::builtin namespace string.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Attach the embedded resources to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
54 changes: 54 additions & 0 deletions 73_ArithmeticBench/app_resources/benchmarkSubgroup.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#pragma shader_stage(compute)

#define operation_t nbl::hlsl::OPERATION

#include "shaderCommon.hlsl"

// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
[[vk::binding(2, 0)]] RWTexture2D<float32_t4> outImage; // dummy

// Flat index of this invocation across the dispatch: each workgroup owns a
// WORKGROUP_SIZE-wide window, addressed inside by the subgroup-contiguous index.
uint32_t globalIndex()
{
    const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE;
    const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex();
    return workgroupBase+indexInWorkgroup;
}

// Every invocation may store: globalIndex() above strides workgroups by WORKGROUP_SIZE, so output slots do not collide.
bool canStore() {return true;}

#ifndef NUM_LOOPS
#error "Define NUM_LOOPS!"
#endif

// Benchmarks one binary operation: applies the subgroup operation selected by
// `operation_t` NUM_LOOPS times, feeding each result back in as the next input,
// then writes the final value to the output buffer chosen by binop<T>::BindingIndex.
//   binop     - wrapper from common.hlsl supplying `base_t` and `BindingIndex`
//   T, N      - scalar type and items per invocation forwarded to ArithmeticParams
//   sourceVal - this invocation's input value
template<template<class> class binop, typename T, uint32_t N>
static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal)
{
    using subgroup_config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
    using arithmetic_params_t = nbl::hlsl::subgroup2::ArithmeticParams<subgroup_config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;

    operation_t<arithmetic_params_t> op;

    // chain the results so every iteration depends on the previous one
    type_t accumulator = sourceVal;
    // [unroll]
    for (uint32_t iteration = 0; iteration < NUM_LOOPS; iteration++)
    {
        accumulator = op(accumulator);
    }

    // results live after a leading uint32_t slot in each output buffer
    const uint32_t byteOffset = sizeof(uint32_t) + sizeof(type_t) * globalIndex();
    output[binop<T>::BindingIndex].template Store<type_t>(byteOffset, accumulator);
}

// Loads this invocation's input once and runs the benchmark loop for each of
// the seven binary operations on it.
void benchmark()
{
    const type_t seed = inputValue[globalIndex()];

    subbench<bit_and, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<bit_or, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<plus, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<multiplies, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<minimum, uint32_t, ITEMS_PER_INVOCATION>(seed);
    subbench<maximum, uint32_t, ITEMS_PER_INVOCATION>(seed);
}

// Compute entry point: one-dimensional workgroups of WORKGROUP_SIZE invocations, each running the benchmark.
[numthreads(WORKGROUP_SIZE,1,1)]
void main()
{
benchmark();
}
95 changes: 95 additions & 0 deletions 73_ArithmeticBench/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#include "nbl/builtin/hlsl/cpp_compat.hlsl"
#include "nbl/builtin/hlsl/functional.hlsl"

// Layout of one output buffer: a leading subgroup-size word followed by
// ScanElementCount data words. This matches what the shaders in this example
// write: subgroup size at byte 0, results starting at sizeof(uint32_t).
template<uint32_t kScanElementCount=1024*1024>
struct Output
{
// Number of uint32_t result slots following the header word.
NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;

uint32_t subgroupSize;
uint32_t data[ScanElementCount];
};

// Wrapper around nbl::hlsl::bit_and<T> that adds example metadata:
// `base_t` (the wrapped functor), `BindingIndex` (used by the shaders to index
// the `output` buffer array) and, outside HLSL, a human-readable `name`.
template<typename T>
struct bit_and : nbl::hlsl::bit_and<T>
{
using base_t = nbl::hlsl::bit_and<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
// C++-only label (presumably for host-side reporting — not used in the visible shaders)
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "bit_and";
#endif
};
// Wrapper around nbl::hlsl::bit_or<T> that adds example metadata:
// `base_t` (the wrapped functor), `BindingIndex` (used by the shaders to index
// the `output` buffer array) and, outside HLSL, a human-readable `name`.
template<typename T>
struct bit_or : nbl::hlsl::bit_or<T>
{
    using base_t = nbl::hlsl::bit_or<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
#ifndef __HLSL_VERSION
    // the label must name this functor; it previously read "bit_xor"
    static inline constexpr const char* name = "bit_or";
#endif
};
// Wrapper around nbl::hlsl::bit_xor<T> that adds example metadata:
// `base_t` (the wrapped functor), `BindingIndex` (used by the shaders to index
// the `output` buffer array) and, outside HLSL, a human-readable `name`.
template<typename T>
struct bit_xor : nbl::hlsl::bit_xor<T>
{
    using base_t = nbl::hlsl::bit_xor<T>;

    NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2;
#ifndef __HLSL_VERSION
    // the label must name this functor; it previously read "bit_or"
    static inline constexpr const char* name = "bit_xor";
#endif
};
// Wrapper around nbl::hlsl::plus<T>; results go to output buffer 3.
template<typename T>
struct plus : nbl::hlsl::plus<T>
{
using base_t = nbl::hlsl::plus<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3;
// C++-only label
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "plus";
#endif
};
// Wrapper around nbl::hlsl::multiplies<T>; results go to output buffer 4.
template<typename T>
struct multiplies : nbl::hlsl::multiplies<T>
{
using base_t = nbl::hlsl::multiplies<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4;
// C++-only label
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "multiplies";
#endif
};
// Wrapper around nbl::hlsl::minimum<T>; results go to output buffer 5.
template<typename T>
struct minimum : nbl::hlsl::minimum<T>
{
using base_t = nbl::hlsl::minimum<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5;
// C++-only label
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "minimum";
#endif
};
// Wrapper around nbl::hlsl::maximum<T>; results go to output buffer 6.
template<typename T>
struct maximum : nbl::hlsl::maximum<T>
{
using base_t = nbl::hlsl::maximum<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6;
// C++-only label
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "maximum";
#endif
};

// Tag for the ballot bit-count test: it derives from nbl::hlsl::plus<T> and
// owns output buffer 7. The workgroup test shader uses only its BindingIndex.
template<typename T>
struct ballot : nbl::hlsl::plus<T>
{
using base_t = nbl::hlsl::plus<T>;

NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7;
// C++-only label; note it reads "bitcount" rather than the struct's name
#ifndef __HLSL_VERSION
static inline constexpr const char* name = "bitcount";
#endif
};

#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
68 changes: 68 additions & 0 deletions 73_ArithmeticBench/app_resources/shaderCommon.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include "common.hlsl"

#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl"
#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"

#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"

// Definition of gl_WorkGroupSize() for this shader: a 1D workgroup of WORKGROUP_SIZE invocations.
// Provided here because of the DXC issue linked below.
// https://github.com/microsoft/DirectXShaderCompiler/issues/6144
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);}

#ifndef ITEMS_PER_INVOCATION
#error "Define ITEMS_PER_INVOCATION!"
#endif

typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t;

// Descriptors are declared at global scope because
// unfortunately DXC chokes on descriptors as static members
// https://github.com/microsoft/DirectXShaderCompiler/issues/5940
// binding 0: one type_t input value per invocation, read at globalIndex()
[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue;
// binding 1: eight output buffers, selected by the BindingIndex (0..7) of the wrappers in common.hlsl
[[vk::binding(1, 0)]] RWByteAddressBuffer output[8];

// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way
uint32_t globalIndex();
// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs
bool canStore();

#ifndef OPERATION
#error "Define OPERATION!"
#endif

#ifndef SUBGROUP_SIZE_LOG2
#error "Define SUBGROUP_SIZE_LOG2!"
#endif
// Runs the operation selected by `operation_t` once for one binary operation
// and writes the result to the output buffer chosen by binop<T>::BindingIndex.
// The invocation with globalIndex()==0 also writes the subgroup size at byte 0.
//   binop     - wrapper from common.hlsl supplying `base_t` and `BindingIndex`
//   T, N      - scalar type and items per invocation forwarded to ArithmeticParams
//   sourceVal - this invocation's input value
template<template<class> class binop, typename T, uint32_t N>
static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
{
// TODO static assert vector<T, N> == type_t
//using type_t = vector<T, N>;
using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>;
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>;

// header word: subgroup size, written once per dispatch
if (globalIndex()==0u)
output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());

operation_t<params_t> func;
// NOTE(review): func(sourceVal) is evaluated only inside the canStore() branch; if an
// includer's operation_t issues barriers and canStore() is not uniform, the
// operation would run in divergent control flow — confirm this is intended.
if (canStore())
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal));
}


// Loads this invocation's input, runs the single-shot test for each of the
// seven binary operations on it, and returns the loaded input to the caller.
type_t test()
{
    const type_t loaded = inputValue[globalIndex()];

    subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(loaded);
    subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(loaded);

    return loaded;
}

#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
18 changes: 18 additions & 0 deletions 73_ArithmeticBench/app_resources/testSubgroup.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma shader_stage(compute)

#define operation_t nbl::hlsl::OPERATION

#include "shaderCommon.hlsl"

// Flat index of this invocation across the dispatch: each workgroup owns a
// WORKGROUP_SIZE-wide window, addressed inside by the subgroup-contiguous index.
uint32_t globalIndex()
{
    const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE;
    const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex();
    return workgroupBase+indexInWorkgroup;
}

// Every invocation may store: globalIndex() above strides workgroups by WORKGROUP_SIZE, so output slots do not collide.
bool canStore() {return true;}

// Compute entry point: one-dimensional workgroups of WORKGROUP_SIZE invocations; the value returned by test() is unused here.
[numthreads(WORKGROUP_SIZE,1,1)]
void main()
{
test();
}
107 changes: 107 additions & 0 deletions 73_ArithmeticBench/app_resources/testWorkgroup.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#pragma shader_stage(compute)


#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl"

// Shared scratch sizing: the arithmetic region comes first, the ballot region follows it.
// Sizes are whatever the library's scratch_size_* traits report for ITEMS_PER_WG items.
static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic<ITEMS_PER_WG>::value;
static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot<ITEMS_PER_WG>::value;
static const uint32_t ScratchSz = ArithmeticSz+BallotSz;

// TODO: Can we make it a static variable in the ScratchProxy struct?
// Single groupshared array backing both regions; accessed through ScratchProxy<offset>.
groupshared uint32_t scratch[ScratchSz];


#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"


// Accessor over a window of the groupshared `scratch` array that begins at
// element `offset`. Every member addresses scratch[ix+offset], so proxies with
// different offsets operate on different regions of the same array.
template<uint16_t offset>
struct ScratchProxy
{
    // Reads element `ix` of this proxy's window into `value`.
    void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value)
    {
        value = scratch[ix+offset];
    }
    // Writes `value` to element `ix` of this proxy's window.
    void set(const uint32_t ix, const uint32_t value)
    {
        scratch[ix+offset] = value;
    }

    // Atomically ORs `value` into element `ix` of this proxy's window and
    // returns what nbl::hlsl::glsl::atomicOr returns.
    uint32_t atomicOr(const uint32_t ix, const uint32_t value)
    {
        // apply `offset` like get/set do; indexing scratch[ix] directly would
        // make a non-zero-offset proxy modify another proxy's region
        return nbl::hlsl::glsl::atomicOr(scratch[ix+offset],value);
    }

    // Workgroup-wide execution barrier; also orders the shared-memory accesses.
    void workgroupExecutionAndMemoryBarrier()
    {
        nbl::hlsl::glsl::barrier();
        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
    }
};

static ScratchProxy<0> arithmeticAccessor;


#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl"


// Functor adapter around the workgroup operation named by the OPERATION macro:
// runs it for ITEMS_PER_WG items using the shared scratch through
// `arithmeticAccessor`, then issues a workgroup barrier before returning.
template<class Binop, class device_capabilities>
struct operation_t
{
using type_t = typename Binop::type_t;

type_t operator()(type_t value)
{
type_t retval = nbl::hlsl::OPERATION<Binop,ITEMS_PER_WG,device_capabilities>::template __call<ScratchProxy<0> >(value,arithmeticAccessor);
// barrier before returning: every Binop shares the same accessor (scratch region),
// so the next operation must not start until this one is finished with it
arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
return retval;
}
};


#include "shaderCommon.hlsl"

static ScratchProxy<ArithmeticSz> ballotAccessor;


// Flat index of this invocation's item: each workgroup owns an
// ITEMS_PER_WG-wide window, addressed inside by the subgroup-contiguous index.
uint32_t globalIndex()
{
    const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG;
    const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex();
    return workgroupBase+indexInWorkgroup;
}

// Only invocations whose in-workgroup index falls inside the ITEMS_PER_WG
// window may store; beyond it, globalIndex() lands in the next workgroup's range.
bool canStore()
{
    const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex();
    return indexInWorkgroup<ITEMS_PER_WG;
}

// Compute entry point for the workgroup test: runs the arithmetic tests via test(),
// then performs a workgroup ballot on the input's low bit and stores the matching
// ballot bit-count variant into the `ballot` output buffer.
[numthreads(WORKGROUP_SIZE,1,1)]
void main()
{
const type_t sourceVal = test();
// header word of the ballot output buffer: subgroup size, written by a single invocation
if (globalIndex()==0u)
output[ballot<type_t>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize());

// we can only ballot booleans, so low bit
// NOTE(review): type_t is declared as a vector in shaderCommon.hlsl; confirm bool(vector) is valid when ITEMS_PER_INVOCATION>1
nbl::hlsl::workgroup::ballot<ScratchProxy<ArithmeticSz> >(bool(sourceVal & 0x1u), ballotAccessor);
// need to barrier between ballot and usages of a ballot by myself
ballotAccessor.workgroupExecutionAndMemoryBarrier();

// sentinel value, overwritten by whichever branch below matches
uint32_t destVal = 0xdeadbeefu;
// Compile-time check of which workgroup operation the OPERATION macro names: compares
// OPERATION against workgroup::IS_OP instantiated with the same placeholder arguments.
#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same<nbl::hlsl::OPERATION<nbl::hlsl::bit_xor<float>,0x45>,nbl::hlsl::workgroup::IS_OP<nbl::hlsl::bit_xor<float>,0x45> >::value
#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities
// pick the ballot bit-count flavour that corresponds to the operation under test
if (CONSTEXPR_OP_TYPE_TEST(reduction))
destVal = nbl::hlsl::workgroup::ballotBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan))
destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan))
destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount<BALLOT_TEMPLATE_ARGS>(ballotAccessor,arithmeticAccessor);
else
{
// OPERATION is none of the three known workgroup operations
assert(false);
}
#undef BALLOT_TEMPLATE_ARGS
#undef CONSTEXPR_OP_TYPE_TEST

// NOTE(review): destVal is a uint32_t stored through Store<type_t>; this relies on an implicit conversion — confirm
if (canStore())
output[ballot<type_t>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal);
}
Loading