Added VK_Workgroup_Zoo, D3D12_Workgroup_Zoo tests

Tests specifically aimed at workgroup debugging, i.e. GSM (groupshared memory) and non-aligned subgroups.
They are not focused on unit tests of subgroup/quad instructions; that is handled by *_Subgroup_Zoo.
This commit is contained in:
Jake Turner
2025-04-17 09:35:14 +01:00
parent a8a0e9628d
commit e14bc98e18
12 changed files with 1087 additions and 126 deletions
+2 -1
View File
@@ -165,7 +165,8 @@ set(VULKAN_SRC
vk/vk_validation_use.cpp
vk/vk_vertex_attr_zoo.cpp
vk/vk_video_textures.cpp
vk/vk_vs_max_desc_set.cpp)
vk/vk_vs_max_desc_set.cpp
vk/vk_workgroup_zoo.cpp)
set(OPENGL_SRC
3rdparty/glad/glad.c
+6 -11
View File
@@ -57,7 +57,7 @@ RWStructuredBuffer<float4> outbuf : register(u0);
static uint3 tid;
void SetOuput(float4 data)
void SetOutput(float4 data)
{
outbuf[root_test * 1024 + tid.y * GROUP_SIZE_X + tid.x] = data;
}
@@ -221,7 +221,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
uint id = WaveGetLaneIndex();
SetOuput(id);
SetOutput(id);
if(IsTest(0))
{
@@ -297,7 +297,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
if (id < 10)
{
data.x = WaveActiveSum(id+10);
SetOuput(data);
SetOutput(data);
return;
}
data.x = WaveActiveSum(id);
@@ -402,7 +402,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
data.w = float(WaveActiveAllEqual(test4).w);
}
}
SetOuput(data);
SetOutput(data);
}
)EOSHADER";
@@ -417,7 +417,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
uint id = WaveGetLaneIndex();
SetOuput(id);
SetOutput(id);
if(IsTest(0))
{
@@ -439,7 +439,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
data.z = WaveMultiPrefixBitOr(id, mask);
data.w = WaveMultiPrefixBitXor(id, mask);
}
SetOuput(data);
SetOutput(data);
}
)EOSHADER";
@@ -548,12 +548,7 @@ void main(uint3 inTid : SV_DispatchThreadID)
ID3D12PipelineStatePtr comppipe65[ARRAY_COUNT(compsize)];
std::string defines60;
defines60 += fmt::format("#define COMP_TESTS {}\n", numCompTests60);
defines60 += "\n";
std::string defines65;
defines65 += fmt::format("#define COMP_TESTS {}\n", numCompTests65);
defines65 += "\n";
bool supportSM65 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_5) && m_DXILSupport;
bool supportSM67 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_7) && m_DXILSupport;
@@ -0,0 +1,418 @@
/******************************************************************************
* The MIT License (MIT)
*
* Copyright (c) 2019-2025 Baldur Karlsson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
******************************************************************************/
#include "3rdparty/fmt/core.h"
#include "d3d12_test.h"
RD_TEST(D3D12_Workgroup_Zoo, D3D12GraphicsTest)
{
// One-line description of what this test covers.
static constexpr const char *Description =
"Test of behaviour around workgroup operations in shaders.";
// HLSL preamble: declares the root constant 'root_test' in a cbuffer at b0 and the IsTest(x)
// macro, which compares root_test against a test index.
const std::string common = R"EOSHADER(
cbuffer rootconsts : register(b0)
{
uint root_test;
}
#define IsTest(x) (root_test == x)
)EOSHADER";
// Compute HLSL preamble appended to 'common': the float4 output UAV at u0, a static copy of the
// thread ID, a 1024-entry groupshared uint4 array, and SetOutput(), which stores one float4 per
// thread at outbuf[root_test * 1024 + tid.y * GROUP_SIZE_X + tid.x].
// GROUP_SIZE_X is not defined in this string; main() prepends it as a #define before compiling.
const std::string compCommon = common + R"EOSHADER(
RWStructuredBuffer<float4> outbuf : register(u0);
static uint3 tid;
groupshared uint4 gsmUint4[1024];
void SetOutput(float4 data)
{
outbuf[root_test * 1024 + tid.y * GROUP_SIZE_X + tid.x] = data;
}
)EOSHADER";
// HLSL compute shader for the workgroup tests. The shader's main() takes the wave lane index as
// 'id', writes it into the groupshared array, then runs the single branch selected by IsTest(N)
// (N = 0..15) and stores the resulting float4 through SetOutput().
// The C++ main() scans this string for "IsTest(" to work out how many tests exist, so the highest
// N used here determines the number of dispatches and the size of the output buffer.
const std::string comp = compCommon + R"EOSHADER(
float4 funcD(uint id)
{
return WaveActiveSum(id/2).xxxx;
}
float4 nestedFunc(uint id)
{
float4 ret = funcD(id/3);
ret.w = WaveActiveSum(id);
return ret;
}
float4 funcA(uint id)
{
return nestedFunc(id*2);
}
float4 funcB(uint id)
{
return nestedFunc(id*4);
}
float4 funcTest(uint id)
{
if ((id % 2) == 0)
{
return 0.xxxx;
}
else
{
float value = WaveActiveSum(id);
if (id < 10)
{
return value.xxxx;
}
value += WaveActiveSum(id/2);
return value.xxxx;
}
}
[numthreads(GROUP_SIZE_X, GROUP_SIZE_Y, 1)]
void main(uint3 inTid : SV_DispatchThreadID)
{
tid = inTid;
float4 data = 0.0f.xxxx;
uint id = WaveGetLaneIndex();
gsmUint4[id] = id;
SetOutput(data);
if(IsTest(0))
{
data.x = id;
}
else if(IsTest(1))
{
data.x = WaveActiveSum(id);
}
else if(IsTest(2))
{
// Diverged threads which reconverge
if (id < 10)
{
// active threads 0-9
data.x = WaveActiveSum(id);
if ((id % 2) == 0)
data.y = WaveActiveSum(id);
else
data.y = WaveActiveSum(id);
data.x += WaveActiveSum(id);
}
else
{
// active threads 10...
data.x = WaveActiveSum(id);
}
data.y = WaveActiveSum(id);
}
else if(IsTest(3))
{
// Converged threads calling a function
data = funcTest(id);
data.y = WaveActiveSum(id);
}
else if(IsTest(4))
{
// Converged threads calling a function which has a nested function call in it
data = nestedFunc(id);
data.y = WaveActiveSum(id);
}
else if(IsTest(5))
{
// Diverged threads calling the same function
if (id < 10)
{
data = funcD(id);
}
else
{
data = funcD(id);
}
data.y = WaveActiveSum(id);
}
else if(IsTest(6))
{
// Diverged threads calling the same function which has a nested function call in it
if (id < 10)
{
data = funcA(id);
}
else
{
data = funcB(id);
}
data.y = WaveActiveSum(id);
}
else if(IsTest(7))
{
// Diverged threads which early exit
if (id < 10)
{
data.x = WaveActiveSum(id+10);
SetOutput(data);
return;
}
data.x = WaveActiveSum(id);
}
else if(IsTest(8))
{
// Loops with different number of iterations per thread
for (uint i = 0; i < id; i++)
{
data.x += WaveActiveSum(id);
}
}
else if(IsTest(9))
{
// Query functions : unit tests
data.x = float(WaveGetLaneCount());
data.y = float(WaveGetLaneIndex());
data.z = float(WaveIsFirstLane());
}
else if(IsTest(10))
{
// Vote functions : unit tests
data.x = float(WaveActiveAnyTrue(id*2 > id+10));
data.y = float(WaveActiveAllTrue(id < WaveGetLaneCount()));
if (id > 10)
{
data.z = float(WaveActiveAllTrue(id > 10));
uint4 ballot = WaveActiveBallot(id > 20);
data.w = countbits(ballot.x) + countbits(ballot.y) + countbits(ballot.z) + countbits(ballot.w);
}
else
{
data.z = float(WaveActiveAllTrue(id > 3));
uint4 ballot = WaveActiveBallot(id > 4);
data.w = countbits(ballot.x) + countbits(ballot.y) + countbits(ballot.z) + countbits(ballot.w);
}
}
else if(IsTest(11))
{
// Broadcast functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = WaveReadLaneFirst(id);
data.y = WaveReadLaneAt(id, 5);
data.z = WaveReadLaneAt(id, id);
data.w = WaveReadLaneAt(data.x, 2+id%3);
}
}
else if(IsTest(12))
{
// Scan and Prefix functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = WavePrefixCountBits(id > 4);
data.y = WavePrefixCountBits(id > 10);
data.z = WavePrefixSum(data.x);
data.w = WavePrefixProduct(1 + data.y);
}
else
{
data.x = WavePrefixCountBits(id > 23);
data.y = WavePrefixCountBits(id < 1);
data.z = WavePrefixSum(data.x);
data.w = WavePrefixSum(data.y);
}
}
else if(IsTest(13))
{
// Reduction functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = float(WaveActiveMax(id));
data.y = float(WaveActiveMin(id));
data.z = float(WaveActiveProduct(id));
data.w = float(WaveActiveSum(id));
}
}
else if(IsTest(14))
{
// Reduction functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = float(WaveActiveCountBits(id > 23));
data.y = float(WaveActiveBitAnd(id));
data.z = float(WaveActiveBitOr(id));
data.w = float(WaveActiveBitXor(id));
}
}
else if(IsTest(15))
{
// Reduction functions : unit tests
if (id > 13)
{
bool test1 = (id > 15).x;
bool2 test2 = bool2(test1, (id < 23));
bool3 test3 = bool3(test1, (id < 23), (id >= 25));
bool4 test4 = bool4(test1, (id < 23), (id >= 25), (id >= 28));
data.x = float(WaveActiveAllEqual(test1).x);
data.y = float(WaveActiveAllEqual(test2).y);
data.z = float(WaveActiveAllEqual(test3).z);
data.w = float(WaveActiveAllEqual(test4).w);
}
}
SetOutput(data);
}
)EOSHADER";
// Runs the base-class preparation, then marks the test unavailable (by setting Avail) when the
// device reports a maximum wave lane count below 16 or lacks SM 6.0 / DXIL support.
// If the base class has already recorded a reason in Avail, that reason is kept and the
// device-specific checks are skipped.
void Prepare(int argc, char **argv)
{
  D3D12GraphicsTest::Prepare(argc, argv);

  // don't replace a failure reason reported by the base class with a less fundamental one
  if(!Avail.empty())
    return;

  if(opts1.WaveLaneCountMax < 16)
    Avail = "Subgroup size is less than 16";

  // checked last, so a missing shader model is the reason reported when both checks fail
  bool supportSM60 = (m_HighestShaderModel >= D3D_SHADER_MODEL_6_0) && m_DXILSupport;
  if(!supportSM60)
    Avail = "SM 6.0 not supported";
}
// Test entry point. Builds the root signature, one compute pipeline per workgroup size and the
// output buffer, then every frame clears the buffer and issues one single-group dispatch per
// IsTest() index found in the shader.
// Returns 3 if initialisation fails and 0 once the run loop exits.
int main()
{
  // initialise, create window, create device, etc
  if(!Init())
    return 3;

  // root parameter 0: a single 32-bit constant (root_test), root parameter 1: the output UAV
  ID3D12RootSignaturePtr sig = MakeSig({constParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0, 1),
                                        uavParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0)});

  // the number of tests is one more than the highest index passed to IsTest() in the shader
  int32_t numCompTests = 0;

  size_t pos = 0;
  while(pos != std::string::npos)
  {
    pos = comp.find("IsTest(", pos);
    if(pos == std::string::npos)
      break;
    pos += sizeof("IsTest(") - 1;
    numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1);
  }

  // workgroup sizes to test. 70 threads is not a multiple of 16, 32 or 64
  struct
  {
    int x, y;
  } compsize[] = {
      {70, 1},
  };
  std::string comppipe_name[ARRAY_COUNT(compsize)];
  ID3D12PipelineStatePtr comppipe[ARRAY_COUNT(compsize)];

  for(size_t i = 0; i < ARRAY_COUNT(comppipe); i++)
  {
    // the group size is baked into the shader through #defines placed ahead of the source
    std::string sizedefine;
    sizedefine = fmt::format("#define GROUP_SIZE_X {}\n#define GROUP_SIZE_Y {}\n", compsize[i].x,
                             compsize[i].y);
    comppipe_name[i] = fmt::format("{}x{}", compsize[i].x, compsize[i].y);

    comppipe[i] = MakePSO().RootSig(sig).CS(Compile(sizedefine + comp, "main", "cs_6_0"));
    comppipe[i]->SetName(UTF82Wide(comppipe_name[i]).c_str());
  }

  // one float4 per thread, 1024 slots per test
  ID3D12ResourcePtr bufOut = MakeBuffer().Size(sizeof(Vec4f) * 1024 * numCompTests).UAV();
  // typed R32_UINT view of the same memory, used only for the per-frame clear
  D3D12ViewCreator uavView =
      MakeUAV(bufOut).Format(DXGI_FORMAT_R32_UINT).NumElements(4 * 1024 * numCompTests);
  D3D12_CPU_DESCRIPTOR_HANDLE uavcpu = uavView.CreateClearCPU(10);
  D3D12_GPU_DESCRIPTOR_HANDLE uavgpu = uavView.CreateGPU(10);

  bufOut->SetName(L"bufOut");

  while(Running())
  {
    ID3D12GraphicsCommandListPtr cmd = GetCommandBuffer();

    Reset(cmd);

    cmd->SetDescriptorHeaps(1, &m_CBVUAVSRV.GetInterfacePtr());

    ID3D12ResourcePtr bb = StartUsingBackbuffer(cmd, D3D12_RESOURCE_STATE_RENDER_TARGET);

    ClearRenderTargetView(cmd, BBRTV, {0.2f, 0.2f, 0.2f, 1.0f});

    pushMarker(cmd, "Compute Tests");

    for(size_t p = 0; p < ARRAY_COUNT(comppipe); p++)
    {
      ResourceBarrier(cmd);

      UINT zero[4] = {};
      cmd->ClearUnorderedAccessViewUint(uavgpu, uavcpu, bufOut, zero, 0, NULL);

      ResourceBarrier(cmd);

      // each workgroup size gets its own marker region, named e.g. "70x1"
      pushMarker(cmd, comppipe_name[p]);

      cmd->SetPipelineState(comppipe[p]);
      cmd->SetComputeRootSignature(sig);
      cmd->SetComputeRootUnorderedAccessView(1, bufOut->GetGPUVirtualAddress());

      // one dispatch of a single group per test, selected via the root constant
      for(int i = 0; i < numCompTests; i++)
      {
        cmd->SetComputeRoot32BitConstant(0, i, 0);
        cmd->Dispatch(1, 1, 1);
      }

      popMarker(cmd);
    }

    popMarker(cmd);

    FinishUsingBackbuffer(cmd, D3D12_RESOURCE_STATE_RENDER_TARGET);

    cmd->Close();
    SubmitAndPresent({cmd});
  }

  return 0;
}
};
REGISTER_TEST();
+2
View File
@@ -232,6 +232,7 @@
<ClCompile Include="d3d12\d3d12_vertex_uav.cpp" />
<ClCompile Include="d3d12\d3d12_video_textures.cpp" />
<ClCompile Include="d3d12\d3d12_vrs.cpp" />
<ClCompile Include="d3d12\d3d12_workgroup_zoo.cpp" />
<ClCompile Include="d3d12\d3d12_write_subresource.cpp" />
<ClCompile Include="dx\d3d_helpers.cpp" />
<ClCompile Include="3rdparty\glad\glad.c" />
@@ -374,6 +375,7 @@
<ClCompile Include="vk\vk_simple_triangle.cpp" />
<ClCompile Include="vk\vk_test.cpp" />
<ClCompile Include="3rdparty\volk\volk.c" />
<ClCompile Include="vk\vk_workgroup_zoo.cpp" />
<ClCompile Include="win32\win32_platform.cpp" />
<ClCompile Include="win32\win32_window.cpp" />
</ItemGroup>
+6
View File
@@ -718,6 +718,12 @@
<ClCompile Include="d3d12\d3d12_subgroup_zoo.cpp">
<Filter>D3D12\demos</Filter>
</ClCompile>
<ClCompile Include="vk\vk_workgroup_zoo.cpp">
<Filter>Vulkan\demos</Filter>
</ClCompile>
<ClCompile Include="d3d12\d3d12_workgroup_zoo.cpp">
<Filter>D3D12\demos</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="D3D11">
+4 -4
View File
@@ -191,7 +191,7 @@ vec4 funcTest(uint id)
}
}
void SetOuput(vec4 data)
void SetOutput(vec4 data)
{
outbuf.data[push.test].vals[gl_LocalInvocationID.y * GROUP_SIZE_X + gl_LocalInvocationID.x] = data;
}
@@ -199,7 +199,7 @@ void main()
{
vec4 data = vec4(0);
uint id = gl_SubgroupInvocationID;
SetOuput(data);
SetOutput(data);
if(IsTest(0))
{
@@ -275,7 +275,7 @@ void main()
if (id < 10)
{
data.x = subgroupAdd(id+10);
SetOuput(data);
SetOutput(data);
return;
}
data.x = subgroupAdd(id);
@@ -380,7 +380,7 @@ void main()
data.w = float(subgroupAllEqual(id >= 28));
}
}
SetOuput(data);
SetOutput(data);
}
)EOSHADER";
+489
View File
@@ -0,0 +1,489 @@
/******************************************************************************
* The MIT License (MIT)
*
* Copyright (c) 2019-2025 Baldur Karlsson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
******************************************************************************/
#include "3rdparty/fmt/core.h"
#include "vk_test.h"
RD_TEST(VK_Workgroup_Zoo, VulkanGraphicsTest)
{
// One-line description of what this test covers.
static constexpr const char *Description =
"Test of behaviour around workgroup operations in shaders.";
// GLSL preamble: the #version line, the subgroup extensions (the optional ones are gated on FEAT_*
// macros that main() supplies at compile time), the push constant block holding the test index,
// and the IsTest(x) macro which compares push.test against a test index.
const std::string common = R"EOSHADER(
#version 460 core
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_vote : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#if FEAT_SHUFFLE
#extension GL_KHR_shader_subgroup_shuffle : enable
#endif
#if FEAT_SHUFFLE_RELATIVE
#extension GL_KHR_shader_subgroup_shuffle_relative : enable
#endif
#if FEAT_CLUSTERED
#extension GL_KHR_shader_subgroup_clustered : enable
#endif
#if FEAT_QUAD
#extension GL_KHR_shader_subgroup_quad : enable
#endif
#if FEAT_ROTATE || FEAT_ROTATE_CLUSTERED
#extension GL_KHR_shader_subgroup_rotate : enable
#endif
layout(push_constant) uniform PushData
{
uint test;
} push;
#define IsTest(x) (push.test == x)
)EOSHADER";
// GLSL compute shader for the workgroup tests. The shader's main() takes the subgroup invocation
// index as 'id', writes it into the shared array, then runs the single branch selected by
// IsTest(N) (N = 0..15) and stores the resulting vec4 through SetOutput().
// The C++ main() scans this string for "IsTest(" to derive COMP_TESTS, so the highest N used here
// determines the number of dispatches and the size of the output buffer.
// The shared array is indexed by gl_SubgroupInvocationID, so it is given a fixed 1024 entries
// rather than COMP_TESTS entries: the number of tests says nothing about the subgroup size.
const std::string comp = common + R"EOSHADER(
shared uvec4 gsmUint4[1024];
struct Output
{
vec4 vals[1024];
};
layout(binding = 0, std430) buffer outbuftype {
Output data[COMP_TESTS];
} outbuf;
layout(local_size_x = GROUP_SIZE_X, local_size_y = GROUP_SIZE_Y, local_size_z = 1) in;
vec4 funcD(uint id)
{
return vec4(subgroupAdd(id/2));
}
vec4 nestedFunc(uint id)
{
vec4 ret = funcD(id/3);
ret.w = subgroupAdd(id);
return ret;
}
vec4 funcA(uint id)
{
return nestedFunc(id*2);
}
vec4 funcB(uint id)
{
return nestedFunc(id*4);
}
vec4 funcTest(uint id)
{
if ((id % 2) == 0)
{
return vec4(0);
}
else
{
float value = subgroupAdd(id);
if (id < 10)
{
return vec4(value);
}
value += subgroupAdd(id/2);
return vec4(value);
}
}
void SetOutput(vec4 data)
{
outbuf.data[push.test].vals[gl_LocalInvocationID.y * GROUP_SIZE_X + gl_LocalInvocationID.x] = data;
}
void main()
{
vec4 data = vec4(0);
uint id = gl_SubgroupInvocationID;
gsmUint4[id] = uvec4(id);
SetOutput(data);
if(IsTest(0))
{
data.x = id;
}
else if(IsTest(1))
{
data.x = subgroupAdd(id);
}
else if(IsTest(2))
{
// Diverged threads which reconverge
if (id < 10)
{
// active threads 0-9
data.x = subgroupAdd(id);
if ((id % 2) == 0)
data.y = subgroupAdd(id);
else
data.y = subgroupAdd(id);
data.x += subgroupAdd(id);
}
else
{
// active threads 10...
data.x = subgroupAdd(id);
}
data.y = subgroupAdd(id);
}
else if(IsTest(3))
{
// Converged threads calling a function
data = funcTest(id);
data.y = subgroupAdd(id);
}
else if(IsTest(4))
{
// Converged threads calling a function which has a nested function call in it
data = nestedFunc(id);
data.y = subgroupAdd(id);
}
else if(IsTest(5))
{
// Diverged threads calling the same function
if (id < 10)
{
data = funcD(id);
}
else
{
data = funcD(id);
}
data.y = subgroupAdd(id);
}
else if(IsTest(6))
{
// Diverged threads calling the same function which has a nested function call in it
if (id < 10)
{
data = funcA(id);
}
else
{
data = funcB(id);
}
data.y = subgroupAdd(id);
}
else if(IsTest(7))
{
// Diverged threads which early exit
if (id < 10)
{
data.x = subgroupAdd(id+10);
SetOutput(data);
return;
}
data.x = subgroupAdd(id);
}
else if(IsTest(8))
{
// Loops with different number of iterations per thread
for (uint i = 0; i < id; i++)
{
data.x += subgroupAdd(id);
}
}
else if(IsTest(9))
{
// Query functions : unit tests
data.x = float(gl_SubgroupSize);
data.y = float(gl_SubgroupInvocationID);
data.z = float(subgroupElect());
}
else if(IsTest(10))
{
// Vote functions : unit tests
data.x = float(subgroupAny(id*2 > id+10));
data.y = float(subgroupAll(id < gl_SubgroupSize));
if (id > 10)
{
data.z = float(subgroupAll(id > 10));
uvec4 ballot = subgroupBallot(id > 20);
data.w = bitCount(ballot.x) + bitCount(ballot.y) + bitCount(ballot.z) + bitCount(ballot.w);
}
else
{
data.z = float(subgroupAll(id > 3));
uvec4 ballot = subgroupBallot(id > 4);
data.w = bitCount(ballot.x) + bitCount(ballot.y) + bitCount(ballot.z) + bitCount(ballot.w);
}
}
else if(IsTest(11))
{
// Broadcast functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = subgroupBroadcastFirst(id);
data.y = subgroupBroadcast(id, 5);
data.z = subgroupShuffle(id, id);
data.w = subgroupShuffle(data.x, 2+id%3);
}
}
else if(IsTest(12))
{
// Scan and Prefix functions : unit tests
if (id >= 2 && id <= 20)
{
uvec4 bits = subgroupBallot(id > 4);
data.x = subgroupBallotExclusiveBitCount(bits);
bits = subgroupBallot(id > 10);
data.y = subgroupBallotExclusiveBitCount(bits);
data.z = subgroupExclusiveAdd(data.x);
data.w = subgroupExclusiveMul(1 + data.y);
}
else
{
uvec4 bits = subgroupBallot(id > 23);
data.x = subgroupBallotExclusiveBitCount(bits);
bits = subgroupBallot(id < 1);
data.y = subgroupBallotExclusiveBitCount(bits);
data.z = subgroupExclusiveAdd(data.x);
data.w = subgroupExclusiveAdd(data.y);
}
}
else if(IsTest(13))
{
// Reduction functions : unit tests
if (id >= 2 && id <= 20)
{
data.x = float(subgroupMax(id));
data.y = float(subgroupMin(id));
data.z = float(subgroupMul(id));
data.w = float(subgroupAdd(id));
}
}
else if(IsTest(14))
{
// Reduction functions : unit tests
if (id >= 2 && id <= 20)
{
uvec4 bits = subgroupBallot(id > 23);
data.x = float(subgroupBallotBitCount(bits));
data.y = float(subgroupAnd(id));
data.z = float(subgroupOr(id));
data.w = float(subgroupXor(id));
}
}
else if(IsTest(15))
{
// Reduction functions : unit tests
if (id > 13)
{
data.x = float(subgroupAllEqual(id > 15));
data.y = float(subgroupAllEqual(id < 23));
data.z = float(subgroupAllEqual(id >= 25));
data.w = float(subgroupAllEqual(id >= 28));
}
}
SetOutput(data);
}
)EOSHADER";
// Subgroup operations reported by the physical device. Written by Prepare() and read by main()
// to choose the values of the FEAT_* shader macros.
VkSubgroupFeatureFlags ops = 0;

// Runs the base-class preparation, then sets Avail when the device cannot run this test: device
// version below 1.1, subgroup size below 16, missing basic/vote/arithmetic/ballot operations, or
// no subgroup support in compute shaders.
// Returns immediately if the base class has already set Avail, and also after flagging a pre-1.1
// device so that the subgroup properties are never queried on it.
void Prepare(int argc, char **argv)
{
  VulkanGraphicsTest::Prepare(argc, argv);

  if(!Avail.empty())
    return;

  if(devVersion < VK_API_VERSION_1_1)
  {
    // VkPhysicalDeviceSubgroupProperties is a Vulkan 1.1 structure, don't query it on older devices
    Avail = "Vulkan device version isn't 1.1";
    return;
  }

  static VkPhysicalDeviceSubgroupProperties subProps = {
      VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES,
  };

  getPhysProperties2(&subProps);

  if(subProps.subgroupSize < 16)
    Avail = "Subgroup size is less than 16";

  // require at least a few ops so we only have a few conditional compilations
  const VkSubgroupFeatureFlags requiredOps =
      VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT |
      VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | VK_SUBGROUP_FEATURE_BALLOT_BIT;

  ops = subProps.supportedOperations;

  // of the checks below, the last one that fails is the reason left in Avail
  if((subProps.supportedOperations & requiredOps) != requiredOps)
    Avail = "Missing ops support";

  if((subProps.supportedStages & VK_SHADER_STAGE_COMPUTE_BIT) == 0)
    Avail = "Missing compute subgroup support";
}
int main()
{
// initialise, create window, create context, etc
if(!Init())
return 3;
VkDescriptorSetLayout setlayout = createDescriptorSetLayout(vkh::DescriptorSetLayoutCreateInfo({
{0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT},
}));
VkPipelineLayout layout = createPipelineLayout(vkh::PipelineLayoutCreateInfo(
{setlayout}, {vkh::PushConstantRange(VK_SHADER_STAGE_ALL, 0, 4)}));
std::map<std::string, std::string> macros;
int numCompTests = 0;
size_t pos = 0;
while(pos != std::string::npos)
{
pos = comp.find("IsTest(", pos);
if(pos == std::string::npos)
break;
pos += sizeof("IsTest(") - 1;
numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1);
}
if(ops & VK_SUBGROUP_FEATURE_SHUFFLE_BIT)
macros["FEAT_SHUFFLE"] = "1";
else
macros["FEAT_SHUFFLE"] = "0";
if(ops & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
macros["FEAT_SHUFFLE_RELATIVE"] = "1";
else
macros["FEAT_SHUFFLE_RELATIVE"] = "0";
if(ops & VK_SUBGROUP_FEATURE_CLUSTERED_BIT)
macros["FEAT_CLUSTERED"] = "1";
else
macros["FEAT_CLUSTERED"] = "0";
if(ops & VK_SUBGROUP_FEATURE_QUAD_BIT)
macros["FEAT_QUAD"] = "1";
else
macros["FEAT_QUAD"] = "0";
if(ops & VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR)
macros["FEAT_ROTATE"] = "1";
else
macros["FEAT_ROTATE"] = "0";
if(ops & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR)
macros["FEAT_ROTATE_CLUSTERED"] = "1";
else
macros["FEAT_ROTATE_CLUSTERED"] = "0";
std::string comppipe_name[1];
VkPipeline comppipe[1];
uint32_t countPipes = 0;
macros["COMP_TESTS"] = fmt::format("{}", numCompTests);
macros["GROUP_SIZE_X"] = "70";
macros["GROUP_SIZE_Y"] = "1";
comppipe_name[countPipes] = "70x1";
comppipe[countPipes] = createComputePipeline(vkh::ComputePipelineCreateInfo(
layout, CompileShaderModule(comp, ShaderLang::glsl, ShaderStage::comp, "main", macros,
SPIRVTarget::vulkan11)));
++countPipes;
AllocatedBuffer bufout(
this,
vkh::BufferCreateInfo(sizeof(Vec4f) * 1024 * numCompTests,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT),
VmaAllocationCreateInfo({0, VMA_MEMORY_USAGE_CPU_TO_GPU}));
setName(bufout.buffer, "bufout");
VkDescriptorSet set = allocateDescriptorSet(setlayout);
vkh::updateDescriptorSets(
device, {vkh::WriteDescriptorSet(set, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
{vkh::DescriptorBufferInfo(bufout.buffer)})});
while(Running())
{
VkCommandBuffer cmd = GetCommandBuffer();
vkBeginCommandBuffer(cmd, vkh::CommandBufferBeginInfo());
VkImage swapimg =
StartUsingBackbuffer(cmd, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
vkh::cmdClearImage(cmd, swapimg, vkh::ClearColorValue(0.2f, 0.2f, 0.2f, 1.0f));
pushMarker(cmd, "Compute Tests");
for(size_t p = 0; p < countPipes; p++)
{
vkh::cmdPipelineBarrier(
cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests)});
vkCmdFillBuffer(cmd, bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests, 0);
vkh::cmdPipelineBarrier(
cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
bufout.buffer, 0, sizeof(Vec4f) * 1024 * numCompTests)});
pushMarker(cmd, comppipe_name[p]);
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, comppipe[p]);
vkh::cmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, {set}, {});
for(int i = 0; i < numCompTests; i++)
{
vkh::cmdPushConstants(cmd, layout, i);
vkCmdDispatch(cmd, 1, 1, 1);
}
popMarker(cmd);
}
popMarker(cmd);
FinishUsingBackbuffer(cmd, VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);
vkEndCommandBuffer(cmd);
SubmitAndPresent({cmd});
}
return 0;
}
};
REGISTER_TEST();
+1
View File
@@ -10,3 +10,4 @@ from .shared.Overlay_Test import *
from .shared.Buffer_Truncation import *
from .shared.Discard_Zoo import *
from .shared.Subgroup_Zoo import *
from .shared.Workgroup_Zoo import *
+121 -110
View File
@@ -13,11 +13,116 @@ class Subgroup_Zoo(rdtest.TestCase):
return True, ''
return False, 'Disabled test'
def check_compute_thread_result(self, test, action, x, y, z, dim, bufdata):
    """Debug one compute thread and compare its final 'data' value with the buffer output.

    :param test: Index of the test, used only in log messages.
    :param action: The dispatch action being checked; its eventId is used in messages.
    :param x: Thread ID x within workgroup (0, 0, 0).
    :param y: Thread ID y within workgroup (0, 0, 0).
    :param z: Thread ID z within workgroup (0, 0, 0).
    :param dim: Thread dimensions of the dispatch; dim[0] is the row stride of the output.
    :param bufdata: Bytes of this test's output, 16 bytes (four floats) per thread.
    :return: True if the debugged value matches the output, False if a test failure was logged.
    """
    name = 'data'
    # stays None until DebugThread succeeds, so the cleanup below never touches an unbound name
    trace = None

    try:
        real = struct.unpack_from(
            "4f", bufdata, 16*y*dim[0] + 16*x)

        trace = self.controller.DebugThread(
            (0, 0, 0), (x, y, z))

        # reject a trace without a debugger before trying to step through it
        if trace.debugger is None:
            raise rdtest.TestFailureException(f"Test {test} at {action.eventId} got no debug result at {x},{y},{z}")

        _, variables = self.process_trace(trace)

        # Find the source variable 'data' at the highest instruction index
        debugged = None
        countInst = len(trace.instInfo)
        for inst in range(countInst):
            sourceVars = trace.instInfo[countInst-1-inst].sourceVars
            try:
                dataVars = [v for v in sourceVars if v.name == name]
                if len(dataVars) == 0:
                    continue
                debugged = self.evaluate_source_var(dataVars[0], variables)
            except KeyError:
                # not evaluable at this instruction, try an earlier one
                continue
            except rdtest.TestFailureException:
                continue
            break

        if debugged is None:
            raise rdtest.TestFailureException(f"Couldn't find source variable {name} at {x},{y},{z}")

        debuggedValue = list(debugged.value.f32v[0:4])

        if not rdtest.value_compare(real, debuggedValue, eps=5.0E-06):
            raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {real}")

    except rdtest.TestFailureException as ex:
        rdtest.log.error(f"Test {test} failed {ex}")
        return False
    finally:
        # only free a trace that was actually created; this keeps any other in-flight exception
        # from being replaced by a NameError here
        if trace is not None:
            self.controller.FreeTrace(trace)

    return True
def check_compute_tests(self, compute_dims, thread_checks):
    """Check selected threads of every dispatch under each workgroup-size marker.

    :param compute_dims: Marker actions, one per workgroup size, whose children are the dispatches.
    :param thread_checks: Thread indices to check along the larger workgroup dimension; every
      index along the smaller dimension is checked for each of them.
    :return: True if any test failed, False if all succeeded.
    """
    overallFailed = False

    for comp_dim in compute_dims:
        section = f"Compute tests with {comp_dim.customName} workgroup"
        rdtest.log.begin_section(section)

        compute_tests = [
            a for a in comp_dim.children if a.flags & rd.ActionFlags.Dispatch]

        for test, action in enumerate(compute_tests):
            failed = False
            self.controller.SetFrameEvent(action.eventId, False)
            pipe = self.controller.GetPipelineState()
            csrefl = pipe.GetShaderReflection(rd.ShaderStage.Compute)

            dim = csrefl.dispatchThreadsDimension

            rw = pipe.GetReadWriteResources(rd.ShaderStage.Compute)

            if len(rw) != 1:
                rdtest.log.error("Unexpected number of RW resources")
                # the dispatch could not be checked at all, so the overall run must not pass
                overallFailed = True
                continue

            # each test writes up to 16k data, one vec4 per thread * up to 1024 threads
            bufdata = self.controller.GetBufferData(
                rw[0].descriptor.resource, test*16*1024, 16*1024)

            for t in thread_checks:
                xrange = 1
                yrange = dim[1]
                xbase = t
                ybase = 0

                # vertical orientation
                if dim[1] > dim[0]:
                    xrange = dim[0]
                    yrange = 1
                    xbase = 0
                    ybase = t

                for x in range(xbase, xbase+xrange):
                    for y in range(ybase, ybase+yrange):
                        z = 0

                        # thread indices outside this workgroup size are skipped
                        if x >= dim[0] or y >= dim[1]:
                            continue

                        if not self.check_compute_thread_result(test, action, x, y, z, dim, bufdata):
                            failed = True

            overallFailed |= failed

            if not failed:
                rdtest.log.success(f"Test {test} successful")
            else:
                rdtest.log.error(f"Test {test} failed")

        rdtest.log.end_section(section)

    return overallFailed
def check_capture(self):
graphics_tests = [a for a in self.find_action(
"Graphics Tests").children if a.flags & rd.ActionFlags.Drawcall]
compute_dims = [a for a in self.find_action(
"Compute Tests").children if 'x' in a.customName]
rdtest.log.begin_section("Graphics tests")
@@ -34,19 +139,6 @@ class Subgroup_Zoo(rdtest.TestCase):
# middle quad on other triangle
(56, 64), (57, 64), (56, 65), (57, 65),
]
# threads to check. largest dimension only (all small dim checked)
thread_checks = [
# first few
0, 1, 2,
# near end of 32-subgroup and boundary
30, 31, 32,
# near end of 64-subgroup and boundary
62, 63, 64,
# near end of 64-subgroup and boundary
62, 63, 64,
# large values spaced out with one near the end of our unaligned size
100, 110, 120, 140, 149, 150, 160, 200, 250,
]
clear_col = (123456.0, 789.0, 101112.0, 0.0)
overallFailed = False
@@ -163,102 +255,21 @@ class Subgroup_Zoo(rdtest.TestCase):
rdtest.log.end_section("Graphics tests")
for comp_dim in compute_dims:
rdtest.log.begin_section(
f"Compute tests with {comp_dim.customName} workgroup")
# threads to check. largest dimension only (all small dim checked)
thread_checks = [
# first few
0, 1, 2,
# near end of 32-subgroup and boundary
30, 31, 32, 33, 34,
# near end of 64-subgroup and boundary
62, 63, 64, 64, 65,
# large values spaced out with one near the end of our unaligned size
100, 110, 120, 140, 149, 150, 160, 200, 250,
]
compute_dims = [a for a in self.find_action(
"Compute Tests").children if 'x' in a.customName]
compute_tests = [
a for a in comp_dim.children if a.flags & rd.ActionFlags.Dispatch]
for test, action in enumerate(compute_tests):
failed = False
self.controller.SetFrameEvent(action.eventId, False)
pipe = self.controller.GetPipelineState()
csrefl = pipe.GetShaderReflection(rd.ShaderStage.Compute)
dim = csrefl.dispatchThreadsDimension
rw = pipe.GetReadWriteResources(rd.ShaderStage.Compute)
if len(rw) != 1:
rdtest.log.error("Unexpected number of RW resources")
continue
# each test writes up to 16k data, one vec4 per thread * up to 1024 threads
bufdata = self.controller.GetBufferData(
rw[0].descriptor.resource, test*16*1024, 16*1024)
for t in thread_checks:
xrange = 1
yrange = dim[1]
xbase = t
ybase = 0
# vertical orientation
if dim[1] > dim[0]:
xrange = dim[0]
yrange = 1
xbase = 0
ybase = t
for x in range(xbase, xbase+xrange):
for y in range(ybase, ybase+yrange):
z = 0
if x >= dim[0] or y >= dim[1]:
continue
try:
real = struct.unpack_from(
"4f", bufdata, 16*y*dim[0] + 16*x)
trace = self.controller.DebugThread(
(0, 0, 0), (x, y, z))
_, variables = self.process_trace(trace)
if trace.debugger is None:
raise rdtest.TestFailureException(f"Test {test} at {action.eventId} got no debug result at {x},{y},{z}")
# Find the source variable 'data' at the highest instruction index
debugged = None
countInst = len(trace.instInfo)
for inst in range(countInst):
sourceVars = trace.instInfo[countInst-1-inst].sourceVars
try:
dataVars = [v for v in sourceVars if v.name == 'data']
if len(dataVars) == 0:
continue
debugged = self.evaluate_source_var(dataVars[0], variables)
except KeyError as ex:
continue
except rdtest.TestFailureException as ex:
continue
break
if debugged is None:
raise rdtest.TestFailureException(f"Couldn't find source variable {name}")
debuggedValue = list(debugged.value.f32v[0:4])
if not rdtest.value_compare(real, debuggedValue, eps=5.0E-06):
raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {real}")
except rdtest.TestFailureException as ex:
rdtest.log.error(f"Test {test} failed {ex}")
failed = True
continue
finally:
self.controller.FreeTrace(trace)
overallFailed |= failed
if not failed:
rdtest.log.success(f"Test {test} successful")
else:
rdtest.log.error(f"Test {test} failed")
rdtest.log.end_section(
f"Compute tests with {comp_dim.customName} workgroup")
overallFailed |= self.check_compute_tests(compute_dims, thread_checks)
if overallFailed:
raise rdtest.TestFailureException("Some tests were not as expected")
+28
View File
@@ -0,0 +1,28 @@
import rdtest
# Not a real test, re-used by API-specific tests
class Workgroup_Zoo(rdtest.Subgroup_Zoo):
    # Shared checking logic; the API-specific subclasses set demos_test_name and clear 'internal'.
    internal = True
    demos_test_name = None

    def check_capture(self):
        # Each child of the "Compute Tests" marker whose name contains 'x' is one workgroup size
        compute_dims = [a for a in self.find_action("Compute Tests").children if 'x' in a.customName]

        # threads to check. largest dimension only (all small dim checked)
        # first few
        thread_checks = [0, 1, 2]
        # last thread of a 16/32/64/128-wide subgroup, the boundary itself, and the one after it
        for subgroup_size in (16, 32, 64, 128):
            thread_checks.extend((subgroup_size - 1, subgroup_size, subgroup_size + 1))
        # large values
        thread_checks.append(150)

        any_failed = self.check_compute_tests(compute_dims, thread_checks)
        if any_failed:
            raise rdtest.TestFailureException("Some tests were not as expected")
@@ -0,0 +1,5 @@
import rdtest
class D3D12_Workgroup_Zoo(rdtest.Workgroup_Zoo):
    # D3D12 variant of Workgroup_Zoo: overrides the base's 'internal' flag and names the demo to use
    internal = False
    demos_test_name = 'D3D12_Workgroup_Zoo'
@@ -0,0 +1,5 @@
import rdtest
class VK_Workgroup_Zoo(rdtest.Workgroup_Zoo):
    # Vulkan variant of Workgroup_Zoo: overrides the base's 'internal' flag and names the demo to use
    internal = False
    demos_test_name = 'VK_Workgroup_Zoo'