Block Chain Scan #884

Open
wants to merge 17 commits into base: improve-workgroup-scan-2

2 changes: 1 addition & 1 deletion examples_tests
Submodule examples_tests updated 30 files
+0 −0 14_ComputeScan/CMakeLists.txt
+97 −0 14_ComputeScan/app_resources/common.hlsl
+253 −0 14_ComputeScan/app_resources/testScans.comp.hlsl
+0 −0 14_ComputeScan/config.json.template
+512 −0 14_ComputeScan/main.cpp
+0 −0 14_ComputeScan/pipeline.groovy
+0 −30 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+0 −55 23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+0 −76 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+0 −535 23_Arithmetic2UnitTest/main.cpp
+0 −0 23_ArithmeticUnitTest/CMakeLists.txt
+7 −7 23_ArithmeticUnitTest/app_resources/common.hlsl
+55 −0 23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl
+18 −0 23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
+107 −0 23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
+0 −0 23_ArithmeticUnitTest/config.json.template
+462 −0 23_ArithmeticUnitTest/main.cpp
+0 −0 23_ArithmeticUnitTest/pipeline.groovy
+10 −10 28_FFTBloom/app_resources/fft_common.hlsl
+0 −52 29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+0 −123 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+0 −35 29_Arithmetic2Bench/app_resources/common.hlsl
+0 −37 29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+0 −726 29_Arithmetic2Bench/main.cpp
+0 −50 29_Arithmetic2Bench/pipeline.groovy
+2 −2 CMakeLists.txt
+0 −121 common/include/WorkgroupDataAccessors.hlsl
+0 −6 old_to_refactor/14_ComputeScan/CMakeLists.txt
+0 −28 old_to_refactor/14_ComputeScan/config.json.template
+0 −234 old_to_refactor/14_ComputeScan/main.cpp
33 changes: 33 additions & 0 deletions include/nbl/builtin/hlsl/scan/arithmetic.hlsl
@@ -0,0 +1,33 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_

#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities=void>
Member Author: we should make a fake device feature called forwardProgressGuarantees which is basically always false.
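
A minimal sketch of what such a fake capability trait could look like (the struct name and member are assumptions, not part of this diff):

// sketch only: an always-false stand-in that can be passed as device_capabilities
struct fake_forward_progress_capabilities
{
    NBL_CONSTEXPR_STATIC_INLINE bool forwardProgressGuarantees = false;
};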

struct reduction
{
using scalar_t = typename BinOp::type_t;

template<class ReadOnlyDataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
{
impl::reduce<Config, BinOp, ForwardProgressGuarantees, device_capabilities> fn;
fn.template __call<ReadOnlyDataAccessor,OutputAccessor,StatusAccessor,ScratchAccessor>(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);
}
};

}
}
}

#endif
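
For context, a hypothetical use of the reduction entry point above (the configuration values and accessor objects are illustrative, not taken from this PR):

// illustrative only: invoking scan::reduction from a compute shader
using config_t = nbl::hlsl::scan::ScanConfiguration<7 /*WorkgroupSizeLog2*/, 5 /*SubgroupSizeLog2*/, 1 /*ItemsPerInvocation*/>;
using reduction_t = nbl::hlsl::scan::reduction<config_t, nbl::hlsl::plus<uint32_t>, false /*ForwardProgressGuarantees*/>;
// dataAccessor, outputAccessor, statusAccessor and sharedMemScratchAccessor are user-provided accessors
reduction_t::__call(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);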
233 changes: 233 additions & 0 deletions include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -0,0 +1,233 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_

#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
struct ScanConfiguration
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;

using arith_config_t = workgroup2::ArithmeticConfiguration<WorkgroupSizeLog2, SubgroupSizeLog2, ItemsPerInvocation>;
NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount;
};

namespace impl
{

template<typename T> // only uint32_t or uint64_t for now?
struct Constants
{
NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
};
Member Author, commenting on lines +33 to +40: you can use enum class if you update DXC, btw also with : uint16_t or : uint64_t; not everything needs to be a crazy template.
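
A minimal sketch of the scoped-enum alternative (assuming a DXC new enough to accept an explicit underlying type in HLSL; the enum name and the uint32_t choice are illustrative, not part of this diff):

// sketch only
enum class status_t : uint32_t
{
    NOT_READY    = 0u,
    LOCAL_COUNT  = 0x1u << 30,                  // bit sizeof(uint32_t)*8-2
    GLOBAL_COUNT = 0x1u << 31,                  // bit sizeof(uint32_t)*8-1
    STATUS_MASK  = (0x1u << 30) | (0x1u << 31)
};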


// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
struct reduce;

#define SPECIALIZE(BINOP,ATOMIC_OP) template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>\
struct reduce<Config, BINOP<T>, ForwardProgressGuarantees, device_capabilities>\
{\
using scalar_t = T;\
using arith_config_t = typename Config::arith_config_t;\
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BINOP<T>, device_capabilities>;\
\
template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>\
void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
{\
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);\
\
const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
if (lastInvocation)\
{\
spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
}\
}\
}

SPECIALIZE(bit_and,atomicAnd);
SPECIALIZE(bit_or,atomicOr);
SPECIALIZE(bit_xor,atomicXor);

SPECIALIZE(plus,atomicIAdd);
// there's no atomic multiply so we use a CAS loop

SPECIALIZE(minimum,atomicUMin);
SPECIALIZE(maximum,atomicUMax);

#undef SPECIALIZE

template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
struct reduce<Config, multiplies<T>, ForwardProgressGuarantees, device_capabilities>
{
using scalar_t = T;
using arith_config_t = typename Config::arith_config_t;
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, multiplies<T>, device_capabilities>;

template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
{
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);

const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
if (lastInvocation)
{
{
scalar_t actual, expected;
actual = multiplies<T>::identity;
do
{
expected = actual;
scalar_t newVal = expected * localReduction;
actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
} while (expected != actual);
}
spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
}
}
};

// TODO: change this to scan, it totally won't work for reduce anyways
template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
struct scan
{
using scalar_t = typename BinOp::type_t;
using constants_t = Constants<scalar_t>;
using config_t = Config;
using arith_config_t = typename Config::arith_config_t;
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
using binop_t = BinOp;

template<class DataAccessor, class ScratchAccessor>
scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
Member Author, commenting on lines +121 to +122: you have the readonly accessor to get your element, and you have the scratch memory accessor (for the workgroup scans/reductions) but you don't have:
  • an accessor for the Device-Scope scratch
  • an accessor for where to store the reduction result (reduction is special compared to a scan, you can't get the result right away)
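
A hedged sketch of the signature those extra accessors imply (accessor type and parameter names are illustrative, not part of this diff):

// sketch only
// template<class ReadOnlyDataAccessor, class OutputAccessor, class DeviceScratchAccessor, class ScratchAccessor>
// scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) reductionResultAccessor,
//                 NBL_REF_ARG(DeviceScratchAccessor) deviceScratchAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)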

{
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
bda::__ptr<scalar_t> scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups]

const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
if (lastInvocation)
{
bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
}

binop_t binop;
scalar_t prefix = scalar_t(0);
// decoupled lookback
if (ForwardProgressGuarantees)
{
if (lastInvocation) // don't make whole block work and do busy stuff
{
// for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost
for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
Member Author, commenting on lines +139 to +142: actually using the whole workgroup or at least subgroup (benchmark it) would be much faster here, so each invocation checks a workgroup, and you can use workgroup2::reduce with a Max binop to find the first preceding workgroup with a ready GLOBAL scan value. You'd also be able to accumulate the prefix faster over the ones which have LOCAL ready.
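
A hedged sketch of that cooperative lookback (kept in comments as pseudocode; the virtualWorkgroupIndex parameter and the exact reduction helpers are assumptions, not part of this diff):

// sketch only: each invocation probes one predecessor per window iteration;
// bounds checks and the NOT_READY retry are omitted for brevity
//   const uint16_t id = workgroup::SubgroupContiguousIndex();
//   const uint32_t probeID = virtualWorkgroupIndex - 1u - id;
//   scalar_t status = spirv::atomicLoad<scalar_t>((scratch + probeID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
//   // a workgroup-wide max-reduction over the probe indices whose GLOBAL bit is set yields
//   // the nearest predecessor with a final inclusive prefix; the LOCAL-ready entries past it
//   // contribute their partial reductions to `prefix` in one more workgroup reduction.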

{
const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
Member Author, commenting on lines +142 to +144: don't use gl_WorkGroupID, ask for the virtualWorkgroupIndex in the function call.
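
A hedged sketch of that change (the added parameter is illustrative, not part of this diff):

// sketch only
// scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor, const uint32_t virtualWorkgroupIndex)
// ...
// const uint32_t prevID = virtualWorkgroupIndex - i;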

scalar_t value = constants_t::NOT_READY;
{
// spin until something is ready
while (value == constants_t::NOT_READY)
{
bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
value = spirv::atomicLoad<scalar_t>(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
}
prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

// last was actually a global sum, we have the prefix, we can quit
if (value & constants_t::GLOBAL_COUNT)
break;
}
}
prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1);
}
else
{
// for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--)
for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
{
const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
scalar_t value = scalar_t(0);
if (lastInvocation)
{
bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
value = spirv::atomicLoad<scalar_t>(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);

if (value & constants_t::STATUS_MASK)
{
prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

if (value & constants_t::GLOBAL_COUNT)
break;
}
else // can't wait/spin, have to do it ourselves
{
sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();

DataAccessor prevDataAccessor = DataAccessor::create(prevID);
prevDataAccessor.begin(); // prepare data accessor if needed (e.g. preload)
const scalar_t prevReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(prevDataAccessor, sharedMemScratchAccessor);

// if DoAndRaceStore, stores in place of prev workgroup id as well
// bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
// if (lastInvocation)
// spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);

prefix = binop(prevReduction, prefix);
}
}
}

const scalar_t globalReduction = binop(prefix,localReduction);
// TODO globalReduction value changing in following block somehow, double check
if (lastInvocation)
{
bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
}

// get last item from scratch
const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1;
bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
scalar_t value = constants_t::NOT_READY;
if (lastInvocation)
{
// wait until last workgroup does reduction
while (!(value & constants_t::GLOBAL_COUNT))
{
value = spirv::atomicLoad<scalar_t>(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
}
value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
return value & (~constants_t::STATUS_MASK);
}
};

}

}
}
}

#endif
66 changes: 0 additions & 66 deletions include/nbl/builtin/hlsl/scan/declarations.hlsl

This file was deleted.
