diff --git a/util/test/demos/CMakeLists.txt b/util/test/demos/CMakeLists.txt index 25427a8e6..840aeb722 100644 --- a/util/test/demos/CMakeLists.txt +++ b/util/test/demos/CMakeLists.txt @@ -165,7 +165,8 @@ set(VULKAN_SRC vk/vk_validation_use.cpp vk/vk_vertex_attr_zoo.cpp vk/vk_video_textures.cpp - vk/vk_vs_max_desc_set.cpp) + vk/vk_vs_max_desc_set.cpp + vk/vk_workgroup_zoo.cpp) set(OPENGL_SRC 3rdparty/glad/glad.c diff --git a/util/test/demos/d3d12/d3d12_subgroup_zoo.cpp b/util/test/demos/d3d12/d3d12_subgroup_zoo.cpp index b4acf17fe..0e15f12e1 100644 --- a/util/test/demos/d3d12/d3d12_subgroup_zoo.cpp +++ b/util/test/demos/d3d12/d3d12_subgroup_zoo.cpp @@ -57,7 +57,7 @@ RWStructuredBuffer outbuf : register(u0); static uint3 tid; -void SetOuput(float4 data) +void SetOutput(float4 data) { outbuf[root_test * 1024 + tid.y * GROUP_SIZE_X + tid.x] = data; } @@ -221,7 +221,7 @@ void main(uint3 inTid : SV_DispatchThreadID) uint id = WaveGetLaneIndex(); - SetOuput(id); + SetOutput(id); if(IsTest(0)) { @@ -297,7 +297,7 @@ void main(uint3 inTid : SV_DispatchThreadID) if (id < 10) { data.x = WaveActiveSum(id+10); - SetOuput(data); + SetOutput(data); return; } data.x = WaveActiveSum(id); @@ -402,7 +402,7 @@ void main(uint3 inTid : SV_DispatchThreadID) data.w = float(WaveActiveAllEqual(test4).w); } } - SetOuput(data); + SetOutput(data); } )EOSHADER"; @@ -417,7 +417,7 @@ void main(uint3 inTid : SV_DispatchThreadID) uint id = WaveGetLaneIndex(); - SetOuput(id); + SetOutput(id); if(IsTest(0)) { @@ -439,7 +439,7 @@ void main(uint3 inTid : SV_DispatchThreadID) data.z = WaveMultiPrefixBitOr(id, mask); data.w = WaveMultiPrefixBitXor(id, mask); } - SetOuput(data); + SetOutput(data); } )EOSHADER"; @@ -548,12 +548,7 @@ void main(uint3 inTid : SV_DispatchThreadID) ID3D12PipelineStatePtr comppipe65[ARRAY_COUNT(compsize)]; std::string defines60; - defines60 += fmt::format("#define COMP_TESTS {}\n", numCompTests60); - defines60 += "\n"; - std::string defines65; - defines65 += fmt::format("#define COMP_TESTS {}\n", numCompTests65); - defines65 += "\n"; bool supportSM65 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_5) && m_DXILSupport; bool supportSM67 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_7) && m_DXILSupport; diff --git a/util/test/demos/d3d12/d3d12_workgroup_zoo.cpp b/util/test/demos/d3d12/d3d12_workgroup_zoo.cpp new file mode 100644 index 000000000..1a2d3291c --- /dev/null +++ b/util/test/demos/d3d12/d3d12_workgroup_zoo.cpp @@ -0,0 +1,418 @@ +/****************************************************************************** + * The MIT License (MIT) + * + * Copyright (c) 2019-2025 Baldur Karlsson + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + ******************************************************************************/ + +#include "3rdparty/fmt/core.h" +#include "d3d12_test.h" + +RD_TEST(D3D12_Workgroup_Zoo, D3D12GraphicsTest) +{ + static constexpr const char *Description = + "Test of behaviour around workgroup operations in shaders."; + + const std::string common = R"EOSHADER( + +cbuffer rootconsts : register(b0) +{ + uint root_test; +} + +#define IsTest(x) (root_test == x) + +)EOSHADER"; + + const std::string compCommon = common + R"EOSHADER( + +RWStructuredBuffer outbuf : register(u0); + +static uint3 tid; + +groupshared uint4 gsmUint4[1024]; + +void SetOutput(float4 data) +{ + outbuf[root_test * 1024 + tid.y * GROUP_SIZE_X + tid.x] = data; +} + +)EOSHADER"; + + const std::string comp = compCommon + R"EOSHADER( + +float4 funcD(uint id) +{ + return WaveActiveSum(id/2).xxxx; +} + +float4 nestedFunc(uint id) +{ + float4 ret = funcD(id/3); + ret.w = WaveActiveSum(id); + return ret; +} + +float4 funcA(uint id) +{ + return nestedFunc(id*2); +} + +float4 funcB(uint id) +{ + return nestedFunc(id*4); +} + +float4 funcTest(uint id) +{ + if ((id % 2) == 0) + { + return 0.xxxx; + } + else + { + float value = WaveActiveSum(id); + if (id < 10) + { + return value.xxxx; + } + value += WaveActiveSum(id/2); + return value.xxxx; + } +} + +[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)] +void main(uint3 inTid : SV_DispatchThreadID) +{ + tid = inTid; + float4 data = 0.0f.xxxx; + uint id = WaveGetLaneIndex(); + gsmUint4[id] = id; + SetOutput(data); + + if(IsTest(0)) + { + data.x = id; + } + else if(IsTest(1)) + { + data.x = WaveActiveSum(id); + } + else if(IsTest(2)) + { + // Diverged threads which reconverge + if (id < 10) + { + // active threads 0-9 + data.x = WaveActiveSum(id); + + if ((id % 2) == 0) + data.y = WaveActiveSum(id); + else + data.y = WaveActiveSum(id); + + data.x += WaveActiveSum(id); + } + else + { + // active threads 10... + data.x = WaveActiveSum(id); + } + data.y = WaveActiveSum(id); + } + else if(IsTest(3)) + { + // Converged threads calling a function + data = funcTest(id); + data.y = WaveActiveSum(id); + } + else if(IsTest(4)) + { + // Converged threads calling a function which has a nested function call in it + data = nestedFunc(id); + data.y = WaveActiveSum(id); + } + else if(IsTest(5)) + { + // Diverged threads calling the same function + if (id < 10) + { + data = funcD(id); + } + else + { + data = funcD(id); + } + data.y = WaveActiveSum(id); + } + else if(IsTest(6)) + { + // Diverged threads calling the same function which has a nested function call in it + if (id < 10) + { + data = funcA(id); + } + else + { + data = funcB(id); + } + data.y = WaveActiveSum(id); + } + else if(IsTest(7)) + { + // Diverged threads which early exit + if (id < 10) + { + data.x = WaveActiveSum(id+10); + SetOutput(data); + return; + } + data.x = WaveActiveSum(id); + } + else if(IsTest(8)) + { + // Loops with different number of iterations per thread + for (uint i = 0; i < id; i++) + { + data.x += WaveActiveSum(id); + } + } + else if(IsTest(9)) + { + // Query functions : unit tests + data.x = float(WaveGetLaneCount()); + data.y = float(WaveGetLaneIndex()); + data.z = float(WaveIsFirstLane()); + } + else if(IsTest(10)) + { + // Vote functions : unit tests + data.x = float(WaveActiveAnyTrue(id*2 > id+10)); + data.y = float(WaveActiveAllTrue(id < WaveGetLaneCount())); + if (id > 10) + { + data.z = float(WaveActiveAllTrue(id > 10)); + uint4 ballot = WaveActiveBallot(id > 20); + data.w = countbits(ballot.x) + countbits(ballot.y) + countbits(ballot.z) + countbits(ballot.w); + } + else + { + data.z = float(WaveActiveAllTrue(id > 3)); + uint4 ballot = WaveActiveBallot(id > 4); + data.w = countbits(ballot.x) + countbits(ballot.y) + countbits(ballot.z) + countbits(ballot.w); + } + } + else if(IsTest(11)) + { + // Broadcast functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = WaveReadLaneFirst(id); + data.y = WaveReadLaneAt(id, 5); + data.z = WaveReadLaneAt(id, id); + data.w = WaveReadLaneAt(data.x, 2+id%3); + } + } + else if(IsTest(12)) + { + // Scan and Prefix functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = WavePrefixCountBits(id > 4); + data.y = WavePrefixCountBits(id > 10); + data.z = WavePrefixSum(data.x); + data.w = WavePrefixProduct(1 + data.y); + } + else + { + data.x = WavePrefixCountBits(id > 23); + data.y = WavePrefixCountBits(id < 1); + data.z = WavePrefixSum(data.x); + data.w = WavePrefixSum(data.y); + } + } + else if(IsTest(13)) + { + // Reduction functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = float(WaveActiveMax(id)); + data.y = float(WaveActiveMin(id)); + data.z = float(WaveActiveProduct(id)); + data.w = float(WaveActiveSum(id)); + } + } + else if(IsTest(14)) + { + // Reduction functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = float(WaveActiveCountBits(id > 23)); + data.y = float(WaveActiveBitAnd(id)); + data.z = float(WaveActiveBitOr(id)); + data.w = float(WaveActiveBitXor(id)); + } + } + else if(IsTest(15)) + { + // Reduction functions : unit tests + if (id > 13) + { + bool test1 = (id > 15).x; + bool2 test2 = bool2(test1, (id < 23)); + bool3 test3 = bool3(test1, (id < 23), (id >= 25)); + bool4 test4 = bool4(test1, (id < 23), (id >= 25), (id >= 28)); + + data.x = float(WaveActiveAllEqual(test1).x); + data.y = float(WaveActiveAllEqual(test2).y); + data.z = float(WaveActiveAllEqual(test3).z); + data.w = float(WaveActiveAllEqual(test4).w); + } + } + + SetOutput(data); +} + +)EOSHADER"; + + void Prepare(int argc, char **argv) + { + D3D12GraphicsTest::Prepare(argc, argv); + + if(opts1.WaveLaneCountMax < 16) + Avail = "Subgroup size is less than 16"; + + bool supportSM60 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_0) && m_DXILSupport; + if(!supportSM60) + Avail = "SM 6.0 not supported"; + } + + int main() + { + // initialise, create window, create device, etc + if(!Init()) + return 3; + + ID3D12RootSignaturePtr sig = MakeSig({constParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0, 1), + uavParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0)}); + + const uint32_t imgDim = 128; + + ID3D12ResourcePtr fltTex = MakeTexture(DXGI_FORMAT_R32G32B32A32_FLOAT, imgDim, imgDim) + .RTV() + .InitialState(D3D12_RESOURCE_STATE_RENDER_TARGET); + fltTex->SetName(L"fltTex"); + D3D12_CPU_DESCRIPTOR_HANDLE fltRTV = MakeRTV(fltTex).CreateCPU(0); + D3D12_GPU_DESCRIPTOR_HANDLE fltSRV = MakeSRV(fltTex).CreateGPU(8); + + int32_t numCompTests = 0; + + size_t pos = 0; + while(pos != std::string::npos) + { + pos = comp.find("IsTest(", pos); + if(pos == std::string::npos) + break; + pos += sizeof("IsTest(") - 1; + numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1); + } + + struct + { + int x, y; + } compsize[] = { + {70, 1}, + }; + std::string comppipe_name[ARRAY_COUNT(compsize)]; + ID3D12PipelineStatePtr comppipe[ARRAY_COUNT(compsize)]; + + std::string defines; + + for(int i = 0; i < ARRAY_COUNT(comppipe); i++) + { + std::string sizedefine; + sizedefine = fmt::format("#define GROUP_SIZE_X {}\n#define GROUP_SIZE_Y {}\n", compsize[i].x, + compsize[i].y); + comppipe_name[i] = fmt::format("{}x{}", compsize[i].x, compsize[i].y); + + comppipe[i] = + MakePSO().RootSig(sig).CS(Compile(defines + sizedefine + comp, "main", "cs_6_0")); + comppipe[i]->SetName(UTF82Wide(comppipe_name[i]).c_str()); + } + + ID3D12ResourcePtr bufOut = MakeBuffer().Size(sizeof(Vec4f) * 1024 * numCompTests).UAV(); + D3D12ViewCreator uavView = + MakeUAV(bufOut).Format(DXGI_FORMAT_R32_UINT).NumElements(4 * 1024 * numCompTests); + D3D12_CPU_DESCRIPTOR_HANDLE uavcpu = uavView.CreateClearCPU(10); + D3D12_GPU_DESCRIPTOR_HANDLE uavgpu = uavView.CreateGPU(10); + + bufOut->SetName(L"bufOut"); + + while(Running()) + { + ID3D12GraphicsCommandListPtr cmd = GetCommandBuffer(); + + Reset(cmd); + + cmd->SetDescriptorHeaps(1, &m_CBVUAVSRV.GetInterfacePtr()); + + ID3D12ResourcePtr bb = StartUsingBackbuffer(cmd, D3D12_RESOURCE_STATE_RENDER_TARGET); + + ClearRenderTargetView(cmd, BBRTV, {0.2f, 0.2f, 0.2f, 1.0f}); + + pushMarker(cmd, "Compute Tests"); + + for(size_t p = 0; p < ARRAY_COUNT(comppipe); p++) + { + ResourceBarrier(cmd); + + UINT zero[4] = {}; + cmd->ClearUnorderedAccessViewUint(uavgpu, uavcpu, bufOut, zero, 0, NULL); + + ResourceBarrier(cmd); + pushMarker(cmd, comppipe_name[p]); + + cmd->SetPipelineState(comppipe[p]); + cmd->SetComputeRootSignature(sig); + cmd->SetComputeRootUnorderedAccessView(1, bufOut->GetGPUVirtualAddress()); + + for(int i = 0; i < numCompTests; i++) + { + cmd->SetComputeRoot32BitConstant(0, i, 0); + cmd->Dispatch(1, 1, 1); + } + + popMarker(cmd); + } + + popMarker(cmd); + + FinishUsingBackbuffer(cmd, D3D12_RESOURCE_STATE_RENDER_TARGET); + + cmd->Close(); + + SubmitAndPresent({cmd}); + } + + return 0; + } +}; + +REGISTER_TEST(); diff --git a/util/test/demos/demos.vcxproj b/util/test/demos/demos.vcxproj index d4f024d66..29e9cba16 100644 --- a/util/test/demos/demos.vcxproj +++ b/util/test/demos/demos.vcxproj @@ -232,6 +232,7 @@ + @@ -374,6 +375,7 @@ + diff --git a/util/test/demos/demos.vcxproj.filters b/util/test/demos/demos.vcxproj.filters index 241b29aae..69a29736e 100644 --- a/util/test/demos/demos.vcxproj.filters +++ b/util/test/demos/demos.vcxproj.filters @@ -718,6 +718,12 @@ D3D12\demos + + Vulkan\demos + + + D3D12\demos + diff --git a/util/test/demos/vk/vk_subgroup_zoo.cpp b/util/test/demos/vk/vk_subgroup_zoo.cpp index b48064ee4..abe1c2914 100644 --- a/util/test/demos/vk/vk_subgroup_zoo.cpp +++ b/util/test/demos/vk/vk_subgroup_zoo.cpp @@ -191,7 +191,7 @@ vec4 funcTest(uint id) } } -void SetOuput(vec4 data) +void SetOutput(vec4 data) { outbuf.data[push.test].vals[gl_LocalInvocationID.y * GROUP_SIZE_X + gl_LocalInvocationID.x] = data; } @@ -199,7 +199,7 @@ void main() { vec4 data = vec4(0); uint id = gl_SubgroupInvocationID; - SetOuput(data); + SetOutput(data); if(IsTest(0)) { @@ -275,7 +275,7 @@ void main() if (id < 10) { data.x = subgroupAdd(id+10); - SetOuput(data); + SetOutput(data); return; } data.x = subgroupAdd(id); @@ -380,7 +380,7 @@ void main() data.w = float(subgroupAllEqual(id >= 28)); } } - SetOuput(data); + SetOutput(data); } )EOSHADER"; diff --git a/util/test/demos/vk/vk_workgroup_zoo.cpp b/util/test/demos/vk/vk_workgroup_zoo.cpp new file mode 100644 index 000000000..b3855e2fe --- /dev/null +++ b/util/test/demos/vk/vk_workgroup_zoo.cpp @@ -0,0 +1,489 @@ +/****************************************************************************** + * The MIT License (MIT) + * + * Copyright (c) 2019-2025 Baldur Karlsson + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + ******************************************************************************/ + +#include "3rdparty/fmt/core.h" +#include "vk_test.h" + +RD_TEST(VK_Workgroup_Zoo, VulkanGraphicsTest) +{ + static constexpr const char *Description = + "Test of behaviour around workgroup operations in shaders."; + + const std::string common = R"EOSHADER( + +#version 460 core +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable + +#if FEAT_SHUFFLE +#extension GL_KHR_shader_subgroup_shuffle : enable +#endif + +#if FEAT_SHUFFLE_RELATIVE +#extension GL_KHR_shader_subgroup_shuffle_relative : enable +#endif + +#if FEAT_CLUSTERED +#extension GL_KHR_shader_subgroup_clustered : enable +#endif + +#if FEAT_QUAD +#extension GL_KHR_shader_subgroup_quad : enable +#endif + +#if FEAT_ROTATE || FEAT_ROTATE_CLUSTERED +#extension GL_KHR_shader_subgroup_rotate : enable +#endif + +layout(push_constant) uniform PushData +{ + uint test; +} push; + +#define IsTest(x) (push.test == x) + +)EOSHADER"; + + const std::string comp = common + R"EOSHADER( + +shared uvec4 gsmUint4[COMP_TESTS]; + +struct Output +{ + vec4 vals[1024]; +}; + +layout(binding = 0, std430) buffer outbuftype { + Output data[COMP_TESTS]; +} outbuf; + +layout(local_size_x = GROUP_SIZE_X, local_size_y = GROUP_SIZE_Y, local_size_z = 1) in; + +vec4 funcD(uint id) +{ + return vec4(subgroupAdd(id/2)); +} + +vec4 nestedFunc(uint id) +{ + vec4 ret = funcD(id/3); + ret.w = subgroupAdd(id); + return ret; +} + +vec4 funcA(uint id) +{ + return nestedFunc(id*2); +} + +vec4 funcB(uint id) +{ + return nestedFunc(id*4); +} + +vec4 funcTest(uint id) +{ + if ((id % 2) == 0) + { + return vec4(0); + } + else + { + float value = subgroupAdd(id); + if (id < 10) + { + return vec4(value); + } + value += subgroupAdd(id/2); + return vec4(value); + } +} + +void SetOutput(vec4 data) +{ + outbuf.data[push.test].vals[gl_LocalInvocationID.y * GROUP_SIZE_X + gl_LocalInvocationID.x] = data; +} +void main() +{ + vec4 data = vec4(0); + uint id = gl_SubgroupInvocationID; + gsmUint4[id] = id; + SetOutput(data); + + if(IsTest(0)) + { + data.x = id; + } + else if(IsTest(1)) + { + data.x = subgroupAdd(id); + } + else if(IsTest(2)) + { + // Diverged threads which reconverge + if (id < 10) + { + // active threads 0-9 + data.x = subgroupAdd(id); + + if ((id % 2) == 0) + data.y = subgroupAdd(id); + else + data.y = subgroupAdd(id); + + data.x += subgroupAdd(id); + } + else + { + // active threads 10... + data.x = subgroupAdd(id); + } + data.y = subgroupAdd(id); + } + else if(IsTest(3)) + { + // Converged threads calling a function + data = funcTest(id); + data.y = subgroupAdd(id); + } + else if(IsTest(4)) + { + // Converged threads calling a function which has a nested function call in it + data = nestedFunc(id); + data.y = subgroupAdd(id); + } + else if(IsTest(5)) + { + // Diverged threads calling the same function + if (id < 10) + { + data = funcD(id); + } + else + { + data = funcD(id); + } + data.y = subgroupAdd(id); + } + else if(IsTest(6)) + { + // Diverged threads calling the same function which has a nested function call in it + if (id < 10) + { + data = funcA(id); + } + else + { + data = funcB(id); + } + data.y = subgroupAdd(id); + } + else if(IsTest(7)) + { + // Diverged threads which early exit + if (id < 10) + { + data.x = subgroupAdd(id+10); + SetOutput(data); + return; + } + data.x = subgroupAdd(id); + } + else if(IsTest(8)) + { + // Loops with different number of iterations per thread + for (uint i = 0; i < id; i++) + { + data.x += subgroupAdd(id); + } + } + else if(IsTest(9)) + { + // Query functions : unit tests + data.x = float(gl_SubgroupSize); + data.y = float(gl_SubgroupInvocationID); + data.z = float(subgroupElect()); + } + else if(IsTest(10)) + { + // Vote functions : unit tests + data.x = float(subgroupAny(id*2 > id+10)); + data.y = float(subgroupAll(id < gl_SubgroupSize)); + if (id > 10) + { + data.z = float(subgroupAll(id > 10)); + uvec4 ballot = subgroupBallot(id > 20); + data.w = bitCount(ballot.x) + bitCount(ballot.y) + bitCount(ballot.z) + bitCount(ballot.w); + } + else + { + data.z = float(subgroupAll(id > 3)); + uvec4 ballot = subgroupBallot(id > 4); + data.w = bitCount(ballot.x) + bitCount(ballot.y) + bitCount(ballot.z) + bitCount(ballot.w); + } + } + else if(IsTest(11)) + { + // Broadcast functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = subgroupBroadcastFirst(id); + data.y = subgroupBroadcast(id, 5); + data.z = subgroupShuffle(id, id); + data.w = subgroupShuffle(data.x, 2+id%3); + } + } + else if(IsTest(12)) + { + // Scan and Prefix functions : unit tests + if (id >= 2 && id <= 20) + { + uvec4 bits = subgroupBallot(id > 4); + data.x = subgroupBallotExclusiveBitCount(bits); + bits = subgroupBallot(id > 10); + data.y = subgroupBallotExclusiveBitCount(bits); + data.z = subgroupExclusiveAdd(data.x); + data.w = subgroupExclusiveMul(1 + data.y); + } + else + { + uvec4 bits = subgroupBallot(id > 23); + data.x = subgroupBallotExclusiveBitCount(bits); + bits = subgroupBallot(id < 1); + data.y = subgroupBallotExclusiveBitCount(bits); + data.z = subgroupExclusiveAdd(data.x); + data.w = subgroupExclusiveAdd(data.y); + } + } + else if(IsTest(13)) + { + // Reduction functions : unit tests + if (id >= 2 && id <= 20) + { + data.x = float(subgroupMax(id)); + data.y = float(subgroupMin(id)); + data.z = float(subgroupMul(id)); + data.w = float(subgroupAdd(id)); + } + } + else if(IsTest(14)) + { + // Reduction functions : unit tests + if (id >= 2 && id <= 20) + { + uvec4 bits = subgroupBallot(id > 23); + data.x = float(subgroupBallotBitCount(bits)); + data.y = float(subgroupAnd(id)); + data.z = float(subgroupOr(id)); + data.w = float(subgroupXor(id)); + } + } + else if(IsTest(15)) + { + // Reduction functions : unit tests + if (id > 13) + { + data.x = float(subgroupAllEqual(id > 15)); + data.y = float(subgroupAllEqual(id < 23)); + data.z = float(subgroupAllEqual(id >= 25)); + data.w = float(subgroupAllEqual(id >= 28)); + } + } + SetOutput(data); +} + +)EOSHADER"; + + VkSubgroupFeatureFlags ops = 0; + + void Prepare(int argc, char **argv) + { + VulkanGraphicsTest::Prepare(argc, argv); + + if(!Avail.empty()) + return; + + if(devVersion < VK_API_VERSION_1_1) + Avail = "Vulkan device version isn't 1.1"; + + static VkPhysicalDeviceSubgroupProperties subProps = { + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES, + }; + + getPhysProperties2(&subProps); + + if(subProps.subgroupSize < 16) + Avail = "Subgroup size is less than 16"; + + // require at least a few ops so we only have a few conditional compilations + const VkSubgroupFeatureFlags requiredOps = + VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT; + + ops = subProps.supportedOperations; + + if((subProps.supportedOperations & requiredOps) != requiredOps) + Avail = "Missing ops support"; + + if((subProps.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT) == 0) + Avail = "Missing compute subgroup support"; + } + + int main() + { + // initialise, create window, create context, etc + if(!Init()) + return 3; + + VkDescriptorSetLayout setlayout = createDescriptorSetLayout(vkh::DescriptorSetLayoutCreateInfo({ + {0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT}, + })); + + VkPipelineLayout layout = createPipelineLayout(vkh::PipelineLayoutCreateInfo( + {setlayout}, {vkh::PushConstantRange(VK_SHADER_STAGE_ALL, 0, 4)})); + + std::map macros; + + int numCompTests = 0; + + size_t pos = 0; + while(pos != std::string::npos) + { + pos = comp.find("IsTest(", pos); + if(pos == std::string::npos) + break; + pos += sizeof("IsTest(") - 1; + numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1); + } + + if(ops & VK_SUBGROUP_FEATURE_SHUFFLE_BIT) + macros["FEAT_SHUFFLE"] = "1"; + else + macros["FEAT_SHUFFLE"] = "0"; + if(ops & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) + macros["FEAT_SHUFFLE_RELATIVE"] = "1"; + else + macros["FEAT_SHUFFLE_RELATIVE"] = "0"; + if(ops & VK_SUBGROUP_FEATURE_CLUSTERED_BIT) + macros["FEAT_CLUSTERED"] = "1"; + else + macros["FEAT_CLUSTERED"] = "0"; + if(ops & VK_SUBGROUP_FEATURE_QUAD_BIT) + macros["FEAT_QUAD"] = "1"; + else + macros["FEAT_QUAD"] = "0"; + if(ops & VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR) + macros["FEAT_ROTATE"] = "1"; + else + macros["FEAT_ROTATE"] = "0"; + if(ops & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR) + macros["FEAT_ROTATE_CLUSTERED"] = "1"; + else + macros["FEAT_ROTATE_CLUSTERED"] = "0"; + + std::string comppipe_name[1]; + VkPipeline comppipe[1]; + uint32_t countPipes = 0; + + macros["COMP_TESTS"] = fmt::format("{}", numCompTests); + + macros["GROUP_SIZE_X"] = "70"; + macros["GROUP_SIZE_Y"] = "1"; + comppipe_name[countPipes] = "70x1"; + comppipe[countPipes] = createComputePipeline(vkh::ComputePipelineCreateInfo( + layout, CompileShaderModule(comp, ShaderLang::glsl, ShaderStage::comp, "main", macros, + SPIRVTarget::vulkan11))); + ++countPipes; + + AllocatedBuffer bufout( + this, + vkh::BufferCreateInfo(sizeof(Vec4f) * 1024 * numCompTests, + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT), + VmaAllocationCreateInfo({0, VMA_MEMORY_USAGE_CPU_TO_GPU})); + + setName(bufout.buffer, "bufout"); + + VkDescriptorSet set = allocateDescriptorSet(setlayout); + + vkh::updateDescriptorSets( + device, {vkh::WriteDescriptorSet(set, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + {vkh::DescriptorBufferInfo(bufout.buffer)})}); + + while(Running()) + { + VkCommandBuffer cmd = GetCommandBuffer(); + + vkBeginCommandBuffer(cmd, vkh::CommandBufferBeginInfo()); + + VkImage swapimg = + StartUsingBackbuffer(cmd, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL); + + vkh::cmdClearImage(cmd, swapimg, vkh::ClearColorValue(0.2f, 0.2f, 0.2f, 1.0f)); + + pushMarker(cmd, "Compute Tests"); + + for(size_t p = 0; p < countPipes; p++) + { + vkh::cmdPipelineBarrier( + cmd, {}, + {vkh::BufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, + bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests)}); + + vkCmdFillBuffer(cmd, bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests, 0); + + vkh::cmdPipelineBarrier( + cmd, {}, + {vkh::BufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT, + bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests)}); + + pushMarker(cmd, comppipe_name[p]); + + vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, comppipe[p]); + vkh::cmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, {set}, {}); + + for(int i = 0; i < numCompTests; i++) + { + vkh::cmdPushConstants(cmd, layout, i); + vkCmdDispatch(cmd, 1, 1, 1); + } + + popMarker(cmd); + } + + popMarker(cmd); + + FinishUsingBackbuffer(cmd, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL); + + vkEndCommandBuffer(cmd); + + SubmitAndPresent({cmd}); + } + + return 0; + } +}; + +REGISTER_TEST(); diff --git a/util/test/rdtest/__init__.py b/util/test/rdtest/__init__.py index b80a84068..54f816fb3 100644 --- a/util/test/rdtest/__init__.py +++ b/util/test/rdtest/__init__.py @@ -10,3 +10,4 @@ from .shared.Overlay_Test import * from .shared.Buffer_Truncation import * from .shared.Discard_Zoo import * from .shared.Subgroup_Zoo import * +from .shared.Workgroup_Zoo import * diff --git a/util/test/rdtest/shared/Subgroup_Zoo.py b/util/test/rdtest/shared/Subgroup_Zoo.py index 2a6d20c7c..a912fa699 100644 --- a/util/test/rdtest/shared/Subgroup_Zoo.py +++ b/util/test/rdtest/shared/Subgroup_Zoo.py @@ -13,11 +13,116 @@ class Subgroup_Zoo(rdtest.TestCase): return True, '' return False, 'Disabled test' + def check_compute_thread_result(self, test, action, x, y, z, dim, bufdata): + try: + real = struct.unpack_from( + "4f", bufdata, 16*y*dim[0] + 16*x) + + trace = self.controller.DebugThread( + (0, 0, 0), (x, y, z)) + + _, variables = self.process_trace(trace) + + if trace.debugger is None: + raise rdtest.TestFailureException(f"Test {test} at {action.eventId} got no debug result at {x},{y},{z}") + + # Find the source variable 'data' at the highest instruction index + name = 'data' + debugged = None + countInst = len(trace.instInfo) + for inst in range(countInst): + sourceVars = trace.instInfo[countInst-1-inst].sourceVars + try: + dataVars = [v for v in sourceVars if v.name == name] + if len(dataVars) == 0: + continue + debugged = self.evaluate_source_var(dataVars[0], variables) + except KeyError as ex: + continue + except rdtest.TestFailureException as ex: + continue + break + if debugged is None: + raise rdtest.TestFailureException(f"Couldn't find source variable {name} at {x},{y},{z}") + + debuggedValue = list(debugged.value.f32v[0:4]) + + if not rdtest.value_compare(real, debuggedValue, eps=5.0E-06): + raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {real}") + + except rdtest.TestFailureException as ex: + rdtest.log.error(f"Test {test} failed {ex}") + return False + finally: + self.controller.FreeTrace(trace) + + return True + + def check_compute_tests(self, compute_dims, thread_checks): + overallFailed = False + for comp_dim in compute_dims: + rdtest.log.begin_section( + f"Compute tests with {comp_dim.customName} workgroup") + + compute_tests = [ + a for a in comp_dim.children if a.flags & rd.ActionFlags.Dispatch] + + for test, action in enumerate(compute_tests): + failed = False + self.controller.SetFrameEvent(action.eventId, False) + + pipe = self.controller.GetPipelineState() + csrefl = pipe.GetShaderReflection(rd.ShaderStage.Compute) + + dim = csrefl.dispatchThreadsDimension + + rw = pipe.GetReadWriteResources(rd.ShaderStage.Compute) + + if len(rw) != 1: + rdtest.log.error("Unexpected number of RW resources") + continue + + # each test writes up to 16k data, one vec4 per thread * up to 1024 threads + bufdata = self.controller.GetBufferData( + rw[0].descriptor.resource, test*16*1024, 16*1024) + + for t in thread_checks: + xrange = 1 + yrange = dim[1] + xbase = t + ybase = 0 + + # vertical orientation + if dim[1] > dim[0]: + xrange = dim[0] + yrange = 1 + xbase = 0 + ybase = t + + for x in range(xbase, xbase+xrange): + for y in range(ybase, ybase+yrange): + z = 0 + + if x >= dim[0] or y >= dim[1]: + continue + + if not self.check_compute_thread_result(test, action, x, y, z, dim, bufdata): + failed = True + + overallFailed |= failed + if not failed: + rdtest.log.success(f"Test {test} successful") + else: + rdtest.log.error(f"Test {test} failed") + + rdtest.log.end_section( + f"Compute tests with {comp_dim.customName} workgroup") + + return overallFailed + def check_capture(self): graphics_tests = [a for a in self.find_action( "Graphics Tests").children if a.flags & rd.ActionFlags.Drawcall] - compute_dims = [a for a in self.find_action( - "Compute Tests").children if 'x' in a.customName] rdtest.log.begin_section("Graphics tests") @@ -34,19 +139,6 @@ class Subgroup_Zoo(rdtest.TestCase): # middle quad on other triangle (56, 64), (57, 64), (56, 65), (57, 65), ] - # threads to check. largest dimension only (all small dim checked) - thread_checks = [ - # first few - 0, 1, 2, - # near end of 32-subgroup and boundary - 30, 31, 32, - # near end of 64-subgroup and boundary - 62, 63, 64, - # near end of 64-subgroup and boundary - 62, 63, 64, - # large values spaced out with one near the end of our unaligned size - 100, 110, 120, 140, 149, 150, 160, 200, 250, - ] clear_col = (123456.0, 789.0, 101112.0, 0.0) overallFailed = False @@ -163,102 +255,21 @@ class Subgroup_Zoo(rdtest.TestCase): rdtest.log.end_section("Graphics tests") - for comp_dim in compute_dims: - rdtest.log.begin_section( - f"Compute tests with {comp_dim.customName} workgroup") + # threads to check. largest dimension only (all small dim checked) + thread_checks = [ + # first few + 0, 1, 2, + # near end of 32-subgroup and boundary + 30, 31, 32, 33, 34, + # near end of 64-subgroup and boundary + 62, 63, 64, 64, 65, + # large values spaced out with one near the end of our unaligned size + 100, 110, 120, 140, 149, 150, 160, 200, 250, + ] + compute_dims = [a for a in self.find_action( + "Compute Tests").children if 'x' in a.customName] - compute_tests = [ - a for a in comp_dim.children if a.flags & rd.ActionFlags.Dispatch] - - for test, action in enumerate(compute_tests): - failed = False - self.controller.SetFrameEvent(action.eventId, False) - - pipe = self.controller.GetPipelineState() - csrefl = pipe.GetShaderReflection(rd.ShaderStage.Compute) - - dim = csrefl.dispatchThreadsDimension - - rw = pipe.GetReadWriteResources(rd.ShaderStage.Compute) - - if len(rw) != 1: - rdtest.log.error("Unexpected number of RW resources") - continue - - # each test writes up to 16k data, one vec4 per thread * up to 1024 threads - bufdata = self.controller.GetBufferData( - rw[0].descriptor.resource, test*16*1024, 16*1024) - - for t in thread_checks: - xrange = 1 - yrange = dim[1] - xbase = t - ybase = 0 - - # vertical orientation - if dim[1] > dim[0]: - xrange = dim[0] - yrange = 1 - xbase = 0 - ybase = t - - for x in range(xbase, xbase+xrange): - for y in range(ybase, ybase+yrange): - z = 0 - - if x >= dim[0] or y >= dim[1]: - continue - - try: - real = struct.unpack_from( - "4f", bufdata, 16*y*dim[0] + 16*x) - - trace = self.controller.DebugThread( - (0, 0, 0), (x, y, z)) - - _, variables = self.process_trace(trace) - - if trace.debugger is None: - raise rdtest.TestFailureException(f"Test {test} at {action.eventId} got no debug result at {x},{y},{z}") - - # Find the source variable 'data' at the highest instruction index - debugged = None - countInst = len(trace.instInfo) - for inst in range(countInst): - sourceVars = trace.instInfo[countInst-1-inst].sourceVars - try: - dataVars = [v for v in sourceVars if v.name == 'data'] - if len(dataVars) == 0: - continue - debugged = self.evaluate_source_var(dataVars[0], variables) - except KeyError as ex: - continue - except rdtest.TestFailureException as ex: - continue - break - if debugged is None: - raise rdtest.TestFailureException(f"Couldn't find source variable {name}") - - debuggedValue = list(debugged.value.f32v[0:4]) - - if not rdtest.value_compare(real, debuggedValue, eps=5.0E-06): - raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {real}") - - except rdtest.TestFailureException as ex: - rdtest.log.error(f"Test {test} failed {ex}") - failed = True - continue - finally: - self.controller.FreeTrace(trace) - - overallFailed |= failed - if not failed: - rdtest.log.success(f"Test {test} successful") - else: - rdtest.log.error(f"Test {test} failed") - - rdtest.log.end_section( - f"Compute tests with {comp_dim.customName} workgroup") + overallFailed |= self.check_compute_tests(compute_dims, thread_checks) if overallFailed: raise rdtest.TestFailureException("Some tests were not as expected") \ No newline at end of file diff --git a/util/test/rdtest/shared/Workgroup_Zoo.py b/util/test/rdtest/shared/Workgroup_Zoo.py new file mode 100644 index 000000000..c6d33331a --- /dev/null +++ b/util/test/rdtest/shared/Workgroup_Zoo.py @@ -0,0 +1,28 @@ +import rdtest + +# Not a real test, re-used by API-specific tests +class Workgroup_Zoo(rdtest.Subgroup_Zoo): + internal = True + demos_test_name = None + + def check_capture(self): + compute_dims = [a for a in self.find_action("Compute Tests").children if 'x' in a.customName] + + # threads to check. largest dimension only (all small dim checked) + thread_checks = [ + # first few + 0, 1, 2, + # near end of 16-subgroup and boundary + 15, 16, 17, + # near end of 32-subgroup and boundary + 31, 32, 33, + # near end of 64-subgroup and boundary + 63, 64, 65, + # near end of 128-subgroup and boundary + 127, 128, 129, + # large values + 150 + ] + + if self.check_compute_tests(compute_dims, thread_checks): + raise rdtest.TestFailureException("Some tests were not as expected") \ No newline at end of file diff --git a/util/test/tests/D3D12/D3D12_Workgroup_Zoo.py b/util/test/tests/D3D12/D3D12_Workgroup_Zoo.py new file mode 100644 index 000000000..b049cbfb8 --- /dev/null +++ b/util/test/tests/D3D12/D3D12_Workgroup_Zoo.py @@ -0,0 +1,5 @@ +import rdtest + +class D3D12_Workgroup_Zoo(rdtest.Workgroup_Zoo): + demos_test_name = 'D3D12_Workgroup_Zoo' + internal = False diff --git a/util/test/tests/Vulkan/VK_Workgroup_Zoo.py b/util/test/tests/Vulkan/VK_Workgroup_Zoo.py new file mode 100644 index 000000000..7b21da794 --- /dev/null +++ b/util/test/tests/Vulkan/VK_Workgroup_Zoo.py @@ -0,0 +1,5 @@ +import rdtest + +class VK_Workgroup_Zoo(rdtest.Workgroup_Zoo): + demos_test_name = 'VK_Workgroup_Zoo' + internal = False