Block Chain Scan #884

Open
wants to merge 17 commits into base: improve-workgroup-scan-2

2 changes: 1 addition & 1 deletion examples_tests
Submodule examples_tests updated 30 files
+0 −0 14_ComputeScan/CMakeLists.txt
+97 −0 14_ComputeScan/app_resources/common.hlsl
+253 −0 14_ComputeScan/app_resources/testScans.comp.hlsl
+0 −0 14_ComputeScan/config.json.template
+512 −0 14_ComputeScan/main.cpp
+0 −0 14_ComputeScan/pipeline.groovy
+0 −30 23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
+0 −55 23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
+0 −76 23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
+0 −535 23_Arithmetic2UnitTest/main.cpp
+0 −0 23_ArithmeticUnitTest/CMakeLists.txt
+7 −7 23_ArithmeticUnitTest/app_resources/common.hlsl
+55 −0 23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl
+18 −0 23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl
+107 −0 23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl
+0 −0 23_ArithmeticUnitTest/config.json.template
+462 −0 23_ArithmeticUnitTest/main.cpp
+0 −0 23_ArithmeticUnitTest/pipeline.groovy
+10 −10 28_FFTBloom/app_resources/fft_common.hlsl
+0 −52 29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl
+0 −123 29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl
+0 −35 29_Arithmetic2Bench/app_resources/common.hlsl
+0 −37 29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
+0 −726 29_Arithmetic2Bench/main.cpp
+0 −50 29_Arithmetic2Bench/pipeline.groovy
+2 −2 CMakeLists.txt
+0 −121 common/include/WorkgroupDataAccessors.hlsl
+0 −6 old_to_refactor/14_ComputeScan/CMakeLists.txt
+0 −28 old_to_refactor/14_ComputeScan/config.json.template
+0 −234 old_to_refactor/14_ComputeScan/main.cpp
33 changes: 33 additions & 0 deletions include/nbl/builtin/hlsl/scan/arithmetic.hlsl
@@ -0,0 +1,33 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_INCLUDED_

#include "nbl/builtin/hlsl/scan/arithmetic_impl.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities=void>
Member Author: we should make a fake device feature called forwardProgressGuarantees which is basically always false.
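
A minimal sketch of what such a fake capability trait could look like (the struct name and member are assumptions, not part of this diff):

// sketch only: an always-false stand-in that can be passed as device_capabilities
struct fake_forward_progress_capabilities
{
    NBL_CONSTEXPR_STATIC_INLINE bool forwardProgressGuarantees = false;
};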

struct reduction
{
using scalar_t = typename BinOp::type_t;

template<class ReadOnlyDataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
static void __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
{
impl::reduce<Config, BinOp, ForwardProgressGuarantees, device_capabilities> fn;
fn.template __call<ReadOnlyDataAccessor,OutputAccessor,StatusAccessor,ScratchAccessor>(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);
}
};

}
}
}

#endif
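
For context, a hypothetical use of the reduction entry point above (the configuration values and accessor objects are illustrative, not taken from this PR):

// illustrative only: invoking scan::reduction from a compute shader
using config_t = nbl::hlsl::scan::ScanConfiguration<7 /*WorkgroupSizeLog2*/, 5 /*SubgroupSizeLog2*/, 1 /*ItemsPerInvocation*/>;
using reduction_t = nbl::hlsl::scan::reduction<config_t, nbl::hlsl::plus<uint32_t>, false /*ForwardProgressGuarantees*/>;
// dataAccessor, outputAccessor, statusAccessor and sharedMemScratchAccessor are user-provided accessors
reduction_t::__call(dataAccessor, outputAccessor, statusAccessor, sharedMemScratchAccessor);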
233 changes: 233 additions & 0 deletions include/nbl/builtin/hlsl/scan/arithmetic_impl.hlsl
@@ -0,0 +1,233 @@
// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_
#define _NBL_BUILTIN_HLSL_SCAN_ARITHMETIC_IMPL_INCLUDED_

#include "nbl/builtin/hlsl/bda/__ptr.hlsl"
#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<uint16_t _WorkgroupSizeLog2, uint16_t _SubgroupSizeLog2, uint16_t _ItemsPerInvocation>
struct ScanConfiguration
{
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSizeLog2 = _WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(0x1u) << WorkgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSizeLog2 = _SubgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = uint16_t(0x1u) << SubgroupSizeLog2;
NBL_CONSTEXPR_STATIC_INLINE uint16_t ItemsPerInvocation = _ItemsPerInvocation;

using arith_config_t = workgroup2::ArithmeticConfiguration<WorkgroupSizeLog2, SubgroupSizeLog2, ItemsPerInvocation>;
NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedScratchElementCount = arith_config_t::SharedScratchElementCount;
};

namespace impl
{

template<typename T> // only uint32_t or uint64_t for now?
struct Constants
{
NBL_CONSTEXPR_STATIC_INLINE T NOT_READY = 0;
NBL_CONSTEXPR_STATIC_INLINE T LOCAL_COUNT = T(0x1u) << (sizeof(T)*8-2);
NBL_CONSTEXPR_STATIC_INLINE T GLOBAL_COUNT = T(0x1u) << (sizeof(T)*8-1);
NBL_CONSTEXPR_STATIC_INLINE T STATUS_MASK = LOCAL_COUNT | GLOBAL_COUNT;
};
Member Author, commenting on lines +33 to +40: you can use enum class if you update DXC, btw also with : uint16_t or : uint64_t; not everything needs to be a crazy template.
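
A minimal sketch of the scoped-enum alternative (assuming a DXC new enough to accept an explicit underlying type in HLSL; the enum name and the uint32_t choice are illustrative, not part of this diff):

// sketch only
enum class status_t : uint32_t
{
    NOT_READY    = 0u,
    LOCAL_COUNT  = 0x1u << 30,                  // bit sizeof(uint32_t)*8-2
    GLOBAL_COUNT = 0x1u << 31,                  // bit sizeof(uint32_t)*8-1
    STATUS_MASK  = (0x1u << 30) | (0x1u << 31)
};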


// NOTE: there doesn't seem to be a way to set OpMemoryModel yet: https://github.com/microsoft/DirectXShaderCompiler/issues/7180
// MakeAvailable semantic requires memory model set to Vulkan instead of GLSL450 currently
template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
struct reduce;

#define SPECIALIZE(BINOP,ATOMIC_OP) template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>\
struct reduce<Config, BINOP<T>, ForwardProgressGuarantees, device_capabilities>\
{\
using scalar_t = T;\
using arith_config_t = typename Config::arith_config_t;\
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BINOP<T>, device_capabilities>;\
\
template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>\
void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)\
{\
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);\
\
const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);\
if (lastInvocation)\
{\
spirv::ATOMIC_OP(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, localReduction);\
spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);\
}\
}\
}

SPECIALIZE(bit_and,atomicAnd);
SPECIALIZE(bit_or,atomicOr);
SPECIALIZE(bit_xor,atomicXor);

SPECIALIZE(plus,atomicIAdd);
// there's no atomic multiply so we use a CAS loop

SPECIALIZE(minimum,atomicUMin);
SPECIALIZE(maximum,atomicUMax);

#undef SPECIALIZE

template<class Config, typename T, bool ForwardProgressGuarantees, class device_capabilities>
struct reduce<Config, multiplies<T>, ForwardProgressGuarantees, device_capabilities>
{
using scalar_t = T;
using arith_config_t = typename Config::arith_config_t;
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, multiplies<T>, device_capabilities>;

template<class DataAccessor, class OutputAccessor, class StatusAccessor, class ScratchAccessor>
void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) outputAccessor, NBL_REF_ARG(StatusAccessor) statusAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
{
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);

const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
if (lastInvocation)
{
{
scalar_t actual, expected;
actual = multiplies<T>::identity;
do
{
expected = actual;
scalar_t newVal = expected * localReduction;
actual = spirv::atomicCompareExchange(outputAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, spv::MemorySemanticsAcquireMask, newVal, expected);
} while (expected != actual);
}
spirv::atomicIAdd(statusAccessor.getPtr().deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask/*|spv::MemorySemanticsMakeAvailableMask*/, 1u);
}
}
};

// TODO: change this to scan, it totally won't work for reduce anyways
template<class Config, class BinOp, bool ForwardProgressGuarantees, class device_capabilities>
struct scan
{
using scalar_t = typename BinOp::type_t;
using constants_t = Constants<scalar_t>;
using config_t = Config;
using arith_config_t = typename Config::arith_config_t;
using workgroup_reduce_t = workgroup2::reduction<arith_config_t, BinOp, device_capabilities>;
using binop_t = BinOp;

template<class DataAccessor, class ScratchAccessor>
scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)
Member Author, commenting on lines +121 to +122: you have the readonly accessor to get your element, and you have the scratch memory accessor (for the workgroup scans/reductions) but you don't have:
  • an accessor for the Device-Scope scratch
  • an accessor for where to store the reduction result (reduction is special compared to a scan, you can't get the result right away)
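
A hedged sketch of the signature those extra accessors imply (accessor type and parameter names are illustrative, not part of this diff):

// sketch only
// template<class ReadOnlyDataAccessor, class OutputAccessor, class DeviceScratchAccessor, class ScratchAccessor>
// scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(OutputAccessor) reductionResultAccessor,
//                 NBL_REF_ARG(DeviceScratchAccessor) deviceScratchAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor)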

{
const scalar_t localReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(dataAccessor, sharedMemScratchAccessor);
bda::__ptr<scalar_t> scratch = dataAccessor.getScratchPtr(); // scratch data should be at least T[NumWorkgroups]

const bool lastInvocation = (workgroup::SubgroupContiguousIndex() == Config::WorkgroupSize-1);
if (lastInvocation)
{
bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, localReduction|constants_t::LOCAL_COUNT);
}

binop_t binop;
scalar_t prefix = scalar_t(0);
// decoupled lookback
if (ForwardProgressGuarantees)
{
if (lastInvocation) // don't make whole block work and do busy stuff
{
// for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--) // won't run properly this way for some reason, results in device lost
for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
Member Author, commenting on lines +139 to +142: actually using the whole workgroup or at least subgroup (benchmark it) would be much faster here, so each invocation checks a workgroup, and you can use workgroup2::reduce with a Max binop to find the first preceding workgroup with a ready GLOBAL scan value. You'd also be able to accumulate the prefix faster over the ones which have LOCAL ready.
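
A hedged sketch of that cooperative lookback (kept in comments as pseudocode; the virtualWorkgroupIndex parameter and the exact reduction helpers are assumptions, not part of this diff):

// sketch only: each invocation probes one predecessor per window iteration;
// bounds checks and the NOT_READY retry are omitted for brevity
//   const uint16_t id = workgroup::SubgroupContiguousIndex();
//   const uint32_t probeID = virtualWorkgroupIndex - 1u - id;
//   scalar_t status = spirv::atomicLoad<scalar_t>((scratch + probeID).deref().__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
//   // a workgroup-wide max-reduction over the probe indices whose GLOBAL bit is set yields
//   // the nearest predecessor with a final inclusive prefix; the LOCAL-ready entries past it
//   // contribute their partial reductions to `prefix` in one more workgroup reduction.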

{
const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
Member Author, commenting on lines +142 to +144: don't use gl_WorkGroupID, ask for the virtualWorkgroupIndex in the function call.
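
A hedged sketch of that change (the added parameter is illustrative, not part of this diff):

// sketch only
// scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) sharedMemScratchAccessor, const uint32_t virtualWorkgroupIndex)
// ...
// const uint32_t prevID = virtualWorkgroupIndex - i;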

scalar_t value = constants_t::NOT_READY;
{
// spin until something is ready
while (value == constants_t::NOT_READY)
{
bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
value = spirv::atomicLoad<scalar_t>(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
}
prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

// last was actually a global sum, we have the prefix, we can quit
if (value & constants_t::GLOBAL_COUNT)
break;
}
}
prefix = workgroup::Broadcast(prefix, sharedMemScratchAccessor, Config::WorkgroupSize-1);
}
else
{
// for (uint32_t prevID = glsl::gl_WorkGroupID().x-1; prevID >= 0u; prevID--)
for (uint32_t i = 1; i <= glsl::gl_WorkGroupID().x; i++)
{
const uint32_t prevID = glsl::gl_WorkGroupID().x-i;
scalar_t value = scalar_t(0);
if (lastInvocation)
{
bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
value = spirv::atomicLoad<scalar_t>(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);

if (value & constants_t::STATUS_MASK)
{
prefix = binop(value & (~constants_t::STATUS_MASK), prefix);

if (value & constants_t::GLOBAL_COUNT)
break;
}
else // can't wait/spin, have to do it ourselves
{
sharedMemScratchAccessor.workgroupExecutionAndMemoryBarrier();

DataAccessor prevDataAccessor = DataAccessor::create(prevID);
prevDataAccessor.begin(); // prepare data accessor if needed (e.g. preload)
const scalar_t prevReduction = workgroup_reduce_t::template __call<DataAccessor, ScratchAccessor>(prevDataAccessor, sharedMemScratchAccessor);

// if DoAndRaceStore, stores in place of prev workgroup id as well
// bda::__ref<scalar_t> scratchPrev = (scratch + prevID).deref();
// if (lastInvocation)
// spirv::atomicUMax(scratchPrev.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, prevReduction|constants_t::LOCAL_COUNT);

prefix = binop(prevReduction, prefix);
}
}
}

const scalar_t globalReduction = binop(prefix,localReduction);
// TODO globalReduction value changing in following block somehow, double check
if (lastInvocation)
{
bda::__ref<scalar_t> scratchId = (scratch + glsl::gl_WorkGroupID().x).deref();
spirv::atomicStore(scratchId.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsReleaseMask, globalReduction|constants_t::GLOBAL_COUNT);
}

// get last item from scratch
const uint32_t lastWorkgroup = glsl::gl_NumWorkGroups().x - 1;
bda::__ref<scalar_t> scratchLast = (scratch + lastWorkgroup).deref();
scalar_t value = constants_t::NOT_READY;
if (lastInvocation)
{
// wait until last workgroup does reduction
while (!(value & constants_t::GLOBAL_COUNT))
{
value = spirv::atomicLoad<scalar_t>(scratchLast.__get_spv_ptr(), spv::ScopeDevice, spv::MemorySemanticsAcquireMask);
}
}
value = workgroup::Broadcast(value, sharedMemScratchAccessor, Config::WorkgroupSize-1);
return value & (~constants_t::STATUS_MASK);
}
};

}

}
}
}

#endif
66 changes: 0 additions & 66 deletions include/nbl/builtin/hlsl/scan/declarations.hlsl

This file was deleted.
