Improvements to *_Groupshared tests

More checking of GSM local/global cache behaviour when debugging
One test is not GPU-stable, so its results are verified against a hard-coded expectation (this tests the expected behaviour of the local GSM cache on the active thread)
This commit is contained in:
Jake Turner
2025-05-09 15:19:40 +01:00
parent 370334ca6e
commit fc956fee61
5 changed files with 331 additions and 109 deletions
+90 -23
View File
@@ -30,45 +30,93 @@ RD_TEST(D3D11_Groupshared, D3D11GraphicsTest)
std::string comp = R"EOSHADER(
#define MAX_THREADS 64
RWStructuredBuffer<float> indata : register(u0);
RWStructuredBuffer<float4> outdata : register(u1);
groupshared float tmp[64];
groupshared float gsmData[MAX_THREADS];
[numthreads(64,1,1)]
void main(uint3 tid : SV_GroupThreadID)
cbuffer consts : register(b0)
{
if(tid.x == 0)
int inTest;
};
#define IsTest(x) (inTest == x)
float GetGSMValue(uint i)
{
return gsmData[i % MAX_THREADS];
}
[numthreads(MAX_THREADS,1,1)]
void main(uint3 gid : SV_GroupThreadID)
{
if(gid.x == 0)
{
for(int i=0; i < 64; i++) tmp[i] = 1.234f;
for(int i=0; i < MAX_THREADS; i++) gsmData[i] = 1.25f;
}
GroupMemoryBarrierWithGroupSync();
float4 outval;
float4 outval = 0.0f.xxxx;
// first write, should be the init value for all threads
outval.x = tmp[tid.x];
if (IsTest(0))
{
// first write, should be the init value for all threads
outval.x = GetGSMValue(gid.x);
tmp[tid.x] = indata[tid.x];
gsmData[gid.x] = indata[gid.x];
// second write, should be the read value because we're reading our own value
outval.y = tmp[tid.x];
// second write, should be the read value because we're reading our own value
outval.y = GetGSMValue(gid.x);
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// third write, should be our pairwise neighbour's value
outval.z = tmp[tid.x ^ 1];
// third write, should be our pairwise neighbour's value
outval.z = GetGSMValue(gid.x ^ 1);
// do calculation with our neighbour
tmp[tid.x] = (1.0f + tmp[tid.x]) * (1.0f + tmp[tid.x ^ 1]);
// do calculation with our neighbour
gsmData[gid.x] = (1.0f + GetGSMValue(gid.x)) * (1.0f + GetGSMValue(gid.x ^ 1));
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// fourth write, our neighbour should be identical to our value
outval.w = tmp[tid.x] == tmp[tid.x ^ 1] ? 9.99f : -9.99f;
// fourth write, our neighbour should be identical to our value
outval.w = GetGSMValue(gid.x) == GetGSMValue(gid.x ^ 1) ? 9.99f : -9.99f;
}
else if (IsTest(1))
{
gsmData[gid.x] = (float)gid.x;
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outdata[tid.x] = outval;
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.z = GetGSMValue(gid.x + 2);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.w = GetGSMValue(gid.x + 3);
}
else if (IsTest(2))
{
// Deliberately no sync to test debugger behaviour not GPU correctness
// Debugger should see the initial value of 1.25f for all of GSM
gsmData[gid.x] = (float)gid.x;
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
outval.z = GetGSMValue(gid.x + 2);
outval.w = GetGSMValue(gid.x + 3);
}
outdata[gid.x] = outval;
}
)EOSHADER";
@@ -90,18 +138,37 @@ void main(uint3 tid : SV_GroupThreadID)
ID3D11ComputeShaderPtr shad = CreateCS(Compile(comp, "main", "cs_5_0", true));
int cbufferdata[4];
memset(cbufferdata, 0, sizeof(cbufferdata));
ID3D11BufferPtr cb = MakeBuffer().Size(16).Constant().Data(&cbufferdata);
int numCompTests = 0;
size_t pos = 0;
while(pos != std::string::npos)
{
pos = comp.find("IsTest(", pos);
if(pos == std::string::npos)
break;
pos += sizeof("IsTest(") - 1;
numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1);
}
while(Running())
{
ClearRenderTargetView(bbRTV, {0.2f, 0.2f, 0.2f, 1.0f});
ClearUnorderedAccessView(outUAV, Vec4u());
ctx->CSSetShader(shad, NULL, 0);
ctx->CSSetUnorderedAccessViews(0, 1, &inUAV.GetInterfacePtr(), NULL);
ctx->CSSetUnorderedAccessViews(1, 1, &outUAV.GetInterfacePtr(), NULL);
pushMarker("Compute Tests");
ctx->Dispatch(1, 1, 1);
for(int i = 0; i < numCompTests; ++i)
{
ClearUnorderedAccessView(outUAV, Vec4u());
ctx->UpdateSubresource(cb, 0, NULL, &i, 4, 0);
ctx->CSSetConstantBuffers(0, 1, &cb.GetInterfacePtr());
ctx->Dispatch(1, 1, 1);
}
popMarker();
Present();
+108 -36
View File
@@ -30,45 +30,93 @@ RD_TEST(D3D12_Groupshared, D3D12GraphicsTest)
std::string comp = R"EOSHADER(
#define MAX_THREADS 64
RWStructuredBuffer<float> indata : register(u0);
RWStructuredBuffer<float4> outdata : register(u1);
groupshared float tmp[64];
[numthreads(64,1,1)]
void main(uint3 tid : SV_GroupThreadID)
cbuffer rootconsts : register(b0)
{
if(tid.x == 0)
uint root_test;
}
groupshared float gsmData[MAX_THREADS];
#define IsTest(x) (root_test == x)
float GetGSMValue(uint i)
{
return gsmData[i % MAX_THREADS];
}
[numthreads(MAX_THREADS,1,1)]
void main(uint3 gid : SV_GroupThreadID)
{
if(gid.x == 0)
{
for(int i=0; i < 64; i++) tmp[i] = 1.234f;
for(int i=0; i < MAX_THREADS; i++) gsmData[i] = 1.25f;
}
GroupMemoryBarrierWithGroupSync();
float4 outval;
float4 outval = 0.0f.xxxx;
// first write, should be the init value for all threads
outval.x = tmp[tid.x];
if (IsTest(0))
{
// first write, should be the init value for all threads
outval.x = GetGSMValue(gid.x);
tmp[tid.x] = indata[tid.x];
gsmData[gid.x] = indata[gid.x];
// second write, should be the read value because we're reading our own value
outval.y = tmp[tid.x];
// second write, should be the read value because we're reading our own value
outval.y = GetGSMValue(gid.x);
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// third write, should be our pairwise neighbour's value
outval.z = tmp[tid.x ^ 1];
// third write, should be our pairwise neighbour's value
outval.z = GetGSMValue(gid.x ^ 1);
// do calculation with our neighbour
tmp[tid.x] = (1.0f + tmp[tid.x]) * (1.0f + tmp[tid.x ^ 1]);
// do calculation with our neighbour
gsmData[gid.x] = (1.0f + GetGSMValue(gid.x)) * (1.0f + GetGSMValue(gid.x ^ 1));
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// fourth write, our neighbour should be identical to our value
outval.w = tmp[tid.x] == tmp[tid.x ^ 1] ? 9.99f : -9.99f;
// fourth write, our neighbour should be identical to our value
outval.w = GetGSMValue(gid.x) == GetGSMValue(gid.x ^ 1) ? 9.99f : -9.99f;
}
else if (IsTest(1))
{
gsmData[gid.x] = (float)gid.x;
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outdata[tid.x] = outval;
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.z = GetGSMValue(gid.x + 2);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.w = GetGSMValue(gid.x + 3);
}
else if (IsTest(2))
{
// Deliberately no sync to test debugger behaviour not GPU correctness
// Debugger should see the initial value of 1.25f for all of GSM
gsmData[gid.x] = (float)gid.x;
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
outval.z = GetGSMValue(gid.x + 2);
outval.w = GetGSMValue(gid.x + 3);
}
outdata[gid.x] = outval;
}
)EOSHADER";
@@ -80,6 +128,7 @@ void main(uint3 tid : SV_GroupThreadID)
return 3;
ID3D12RootSignaturePtr rs = MakeSig({
constParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0, 1),
uavParam(D3D12_SHADER_VISIBILITY_ALL, 0, 0),
uavParam(D3D12_SHADER_VISIBILITY_ALL, 0, 1),
});
@@ -103,9 +152,20 @@ void main(uint3 tid : SV_GroupThreadID)
ID3D12ResourcePtr outBuf = MakeBuffer().Size(sizeof(Vec4f) * 64 * 2).UAV();
D3D12_GPU_DESCRIPTOR_HANDLE outUAVGPU =
MakeUAV(outBuf).Format(DXGI_FORMAT_R32G32B32A32_FLOAT).CreateGPU(0);
MakeUAV(outBuf).Format(DXGI_FORMAT_R32G32B32A32_FLOAT).CreateGPU(10);
D3D12_CPU_DESCRIPTOR_HANDLE outUAVClearCPU =
MakeUAV(outBuf).Format(DXGI_FORMAT_R32G32B32A32_FLOAT).CreateClearCPU(0);
MakeUAV(outBuf).Format(DXGI_FORMAT_R32G32B32A32_FLOAT).CreateClearCPU(10);
int numCompTests = 0;
size_t pos = 0;
while(pos != std::string::npos)
{
pos = comp.find("IsTest(", pos);
if(pos == std::string::npos)
break;
pos += sizeof("IsTest(") - 1;
numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1);
}
while(Running())
{
@@ -119,30 +179,42 @@ void main(uint3 tid : SV_GroupThreadID)
UINT zero[4] = {};
D3D12_RECT rect = {0, 0, sizeof(Vec4f) * 64, 1};
cmd->ClearUnorderedAccessViewUint(outUAVGPU, outUAVClearCPU, outBuf, zero, 1, &rect);
ResourceBarrier(cmd);
ClearRenderTargetView(cmd, BBRTV, {0.2f, 0.2f, 0.2f, 1.0f});
cmd->SetComputeRootSignature(rs);
cmd->SetComputeRootUnorderedAccessView(0, inBuf->GetGPUVirtualAddress());
cmd->SetComputeRootUnorderedAccessView(1, outBuf->GetGPUVirtualAddress());
cmd->SetComputeRootUnorderedAccessView(1, inBuf->GetGPUVirtualAddress());
cmd->SetComputeRootUnorderedAccessView(2, outBuf->GetGPUVirtualAddress());
pushMarker(cmd, "Compute Tests");
setMarker(cmd, "SM5");
pushMarker(cmd, "SM5");
cmd->SetPipelineState(pso50);
cmd->Dispatch(1, 1, 1);
for(int i = 0; i < numCompTests; ++i)
{
ResourceBarrier(cmd);
cmd->ClearUnorderedAccessViewUint(outUAVGPU, outUAVClearCPU, outBuf, zero, 1, &rect);
ResourceBarrier(cmd);
cmd->SetComputeRoot32BitConstant(0, i, 0);
cmd->Dispatch(1, 1, 1);
}
popMarker(cmd);
if(pso60)
{
setMarker(cmd, "SM6");
cmd->SetComputeRootUnorderedAccessView(1,
outBuf->GetGPUVirtualAddress() + sizeof(Vec4f) * 64);
pushMarker(cmd, "SM6");
cmd->SetPipelineState(pso60);
cmd->Dispatch(1, 1, 1);
for(int i = 0; i < numCompTests; ++i)
{
ResourceBarrier(cmd);
cmd->ClearUnorderedAccessViewUint(outUAVGPU, outUAVClearCPU, outBuf, zero, 1, &rect);
ResourceBarrier(cmd);
cmd->SetComputeRoot32BitConstant(0, i, 0);
cmd->Dispatch(1, 1, 1);
}
popMarker(cmd);
}
popMarker(cmd);
FinishUsingBackbuffer(cmd, D3D12_RESOURCE_STATE_RENDER_TARGET);
+98 -34
View File
@@ -31,57 +31,105 @@ RD_TEST(VK_Groupshared, VulkanGraphicsTest)
std::string comp = R"EOSHADER(
#version 460 core
#define MAX_THREADS 64
layout(push_constant) uniform PushData
{
uint test;
} push;
layout(binding = 0, std430) buffer indataBuf
{
float indata[64];
float indata[MAX_THREADS];
};
layout(binding = 1, std430) buffer outdataBuf
{
vec4 outdata[64];
vec4 outdata[MAX_THREADS];
};
shared float tmp[64];
shared float gsmData[MAX_THREADS];
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
#define IsTest(x) (push.test == x)
float GetGSMValue(uint i)
{
return gsmData[i % MAX_THREADS];
}
layout(local_size_x = MAX_THREADS, local_size_y = 1, local_size_z = 1) in;
#define GroupMemoryBarrierWithGroupSync() memoryBarrierShared();groupMemoryBarrier();barrier();
void main()
{
uvec3 tid = gl_LocalInvocationID;
uvec3 gid = gl_LocalInvocationID;
if(gl_LocalInvocationID.x == 0)
{
for(int i=0; i < 64; i++) tmp[i] = 1.234f;
for(int i=0; i < MAX_THREADS; i++) gsmData[i] = 1.25f;
}
GroupMemoryBarrierWithGroupSync();
vec4 outval;
vec4 outval = vec4(0.0);
// first write, should be the init value for all threads
outval.x = tmp[tid.x];
if (IsTest(0))
{
// first write, should be the init value for all threads
outval.x = GetGSMValue(gid.x);
tmp[tid.x] = indata[tid.x];
gsmData[gid.x] = indata[gid.x];
// second write, should be the read value because we're reading our own value
outval.y = tmp[tid.x];
// second write, should be the read value because we're reading our own value
outval.y = GetGSMValue(gid.x);
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// third write, should be our pairwise neighbour's value
outval.z = tmp[tid.x ^ 1];
// third write, should be our pairwise neighbour's value
outval.z = GetGSMValue(gid.x ^ 1);
// do calculation with our neighbour
tmp[tid.x] = (1.0f + tmp[tid.x]) * (1.0f + tmp[tid.x ^ 1]);
// do calculation with our neighbour
gsmData[gid.x] = (1.0f + GetGSMValue(gid.x)) * (1.0f + GetGSMValue(gid.x ^ 1));
GroupMemoryBarrierWithGroupSync();
GroupMemoryBarrierWithGroupSync();
// fourth write, our neighbour should be identical to our value
outval.w = tmp[tid.x] == tmp[tid.x ^ 1] ? 9.99f : -9.99f;
// fourth write, our neighbour should be identical to our value
outval.w = GetGSMValue(gid.x) == GetGSMValue(gid.x ^ 1) ? 9.99f : -9.99f;
}
else if (IsTest(1))
{
gsmData[gid.x] = float(gid.x);
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outdata[tid.x] = outval;
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.z = GetGSMValue(gid.x + 2);
GroupMemoryBarrierWithGroupSync();
gsmData[gid.x] += 10.0f;
GroupMemoryBarrierWithGroupSync();
outval.w = GetGSMValue(gid.x + 3);
}
else if (IsTest(2))
{
// Deliberately no sync to test debugger behaviour not GPU correctness
// Debugger should see the initial value of 1.25f for all of GSM
gsmData[gid.x] = float(gid.x);
outval.x = GetGSMValue(gid.x);
outval.y = GetGSMValue(gid.x + 1);
outval.z = GetGSMValue(gid.x + 2);
outval.w = GetGSMValue(gid.x + 3);
}
outdata[gid.x] = outval;
}
)EOSHADER";
@@ -96,7 +144,8 @@ void main()
{0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT},
{1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT},
}));
VkPipelineLayout layout = createPipelineLayout(vkh::PipelineLayoutCreateInfo({setLayout}));
VkPipelineLayout layout = createPipelineLayout(vkh::PipelineLayoutCreateInfo(
{setLayout}, {vkh::PushConstantRange(VK_SHADER_STAGE_ALL, 0, 4)}));
VkPipeline pipe = createComputePipeline(vkh::ComputePipelineCreateInfo(
layout, CompileShaderModule(comp, ShaderLang::glsl, ShaderStage::comp)));
@@ -125,6 +174,17 @@ void main()
{vkh::DescriptorBufferInfo(outBuf.buffer)}),
});
int numCompTests = 0;
size_t pos = 0;
while(pos != std::string::npos)
{
pos = comp.find("IsTest(", pos);
if(pos == std::string::npos)
break;
pos += sizeof("IsTest(") - 1;
numCompTests = std::max(numCompTests, atoi(comp.c_str() + pos) + 1);
}
while(Running())
{
VkCommandBuffer cmd = GetCommandBuffer();
@@ -135,22 +195,26 @@ void main()
vkh::cmdClearImage(cmd, swapimg, vkh::ClearColorValue(0.2f, 0.2f, 0.2f, 1.0f));
vkh::cmdPipelineBarrier(
cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
outBuf.buffer)});
vkCmdFillBuffer(cmd, outBuf.buffer, 0, sizeof(Vec4f) * 64, 0);
vkh::cmdPipelineBarrier(cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT,
VK_ACCESS_SHADER_WRITE_BIT, outBuf.buffer)});
vkh::cmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, {descSet}, {});
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipe);
pushMarker(cmd, "Compute Tests");
vkCmdDispatch(cmd, 1, 1, 1);
for(int i = 0; i < numCompTests; ++i)
{
vkh::cmdPipelineBarrier(
cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
outBuf.buffer)});
vkCmdFillBuffer(cmd, outBuf.buffer, 0, sizeof(Vec4f) * 64, 0);
vkh::cmdPipelineBarrier(
cmd, {},
{vkh::BufferMemoryBarrier(VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
outBuf.buffer)});
vkh::cmdPushConstants(cmd, layout, i);
vkCmdDispatch(cmd, 1, 1, 1);
}
popMarker(cmd);
FinishUsingBackbuffer(cmd);
+21 -16
View File
@@ -12,10 +12,8 @@ class Groupshared(rdtest.TestCase):
return True, ''
return False, 'Disabled test'
def check_compute_thread_result(self, test, action, x, y, z, dim, bufdata):
def check_compute_thread_result(self, test, action, x, y, z, expected):
try:
real = struct.unpack_from("4f", bufdata, 16*x)
workgroup = (0, 0, 0)
trace = self.controller.DebugThread(workgroup, (x, y, z))
@@ -45,12 +43,15 @@ class Groupshared(rdtest.TestCase):
debuggedValue = list(debugged.value.f32v[0:4])
if not rdtest.value_compare(real, debuggedValue, eps=5.0E-06):
raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {real}")
if not rdtest.value_compare(expected, debuggedValue, eps=5.0E-06):
raise rdtest.TestFailureException(f"EID:{action.eventId} TID:{x},{y},{z} debugged thread value {debuggedValue} does not match output {expected}")
except rdtest.TestFailureException as ex:
rdtest.log.error(f"Test {test} failed {ex}")
return False
except Exception as ex:
rdtest.log.error(f"Test {test} exception {ex}")
return False
finally:
self.controller.FreeTrace(trace)
@@ -72,7 +73,7 @@ class Groupshared(rdtest.TestCase):
rw = pipe.GetReadWriteResources(rd.ShaderStage.Compute)
if len(rw) != 2:
rdtest.log.error("Unexpected number of RW resources")
rdtest.log.error(f"Unexpected number of RW resources {len(rw)}")
return False
outBuf = rw[1].descriptor.resource
@@ -80,12 +81,17 @@ class Groupshared(rdtest.TestCase):
maxThreads = 64
dataPerThread = 4 * 4
dataPerTest = dataPerThread * maxThreads
bufdata = self.controller.GetBufferData(outBuf, test*dataPerTest, dataPerTest)
bufdata = self.controller.GetBufferData(outBuf, 0, dataPerTest)
for x in range(dim[0]):
y = 0
z = 0
if not self.check_compute_thread_result(test, action, x, y, z, dim, bufdata):
expected = struct.unpack_from("4f", bufdata, 16*x)
# Test 2 is a special case with hard coded results
if test == 2:
expected = [x, 1.25, 1.25, 1.25]
if not self.check_compute_thread_result(test, action, x, y, z, expected):
failed = True
overallFailed |= failed
@@ -97,18 +103,17 @@ class Groupshared(rdtest.TestCase):
return overallFailed
def check_capture(self):
overallFailed = False
action = self.find_action("Compute Tests")
sectionName = action.customName
def check_compute_section_tests(self, sectionAction):
sectionName = sectionAction.customName
rdtest.log.begin_section(sectionName)
overallFailed |= self.check_compute_tests(action)
failed = self.check_compute_tests(sectionAction)
rdtest.log.end_section(sectionName)
if overallFailed:
if failed:
raise rdtest.TestFailureException("Some tests were not as expected")
def check_capture(self):
action = self.find_action("Compute Tests")
self.check_compute_section_tests(action)
self.check_renderdoc_log_asserts()
rdtest.log.success("All tests matched")
@@ -3,3 +3,17 @@ import rdtest
class D3D12_Groupshared(rdtest.Groupshared):
    internal = False
    demos_test_name = 'D3D12_Groupshared'

    def check_capture(self):
        """Check the SM5 and SM6 compute test sections of the D3D12 capture.

        check_compute_section_tests() raises TestFailureException itself when
        any test in a section fails, so no pass/fail flag is tracked here.
        (The previous 'overallFailed' flag was never set to True, making the
        raise it guarded unreachable dead code — removed.)
        """
        for section in ("SM5", "SM6"):
            action = self.find_action(section)
            # The SM6 marker is only emitted when an SM6 pipeline was
            # created, so it may be absent on older hardware/drivers.
            # NOTE(review): assumes find_action returns None when the marker
            # is not present — confirm against the rdtest framework.
            if action is None:
                continue
            self.check_compute_section_tests(action)

        self.check_renderdoc_log_asserts()

        rdtest.log.success("All tests matched")