Prepare SPIR-V debugger for larger workgroup sizes

* The workgroup size is passed in at creation time, and we handle the potential
  for multiple quads by identifying quads via quad ID
This commit is contained in:
baldurk
2025-01-31 14:20:11 +00:00
parent 1cfb684d16
commit 8d3f40b0a9
4 changed files with 159 additions and 65 deletions
+38 -27
View File
@@ -77,13 +77,9 @@ inline uint64_t CountOnes(uint64_t value)
namespace rdcspv
{
ThreadState::ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState)
ThreadState::ThreadState(Debugger &debug, const GlobalState &globalState)
: debugger(debug), global(globalState)
{
workgroupIndex = workgroupIdx;
nextInstruction = 0;
helperInvocation = false;
killed = false;
}
ThreadState::~ThreadState()
@@ -95,7 +91,7 @@ ThreadState::~ThreadState()
bool ThreadState::Finished() const
{
return killed || callstack.empty();
return dead || callstack.empty();
}
void ThreadState::FillCallstack(rdcarray<Id> &funcs)
@@ -415,18 +411,33 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
{
const ThreadState *a = NULL, *b = NULL;
if(quadNeighbours[0] == ~0U || quadNeighbours[1] == ~0U || quadNeighbours[2] == ~0U ||
quadNeighbours[3] == ~0U)
{
debugger.GetAPIWrapper()->AddDebugMessage(
MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
StringFormat::Fmt("Derivative calculation within non-quad on input %s",
debugger.GetHumanName(val).c_str()));
return ShaderVariable("", 0.0f, 0.0f, 0.0f, 0.0f);
}
RDCASSERT(quadNeighbours[0] < workgroup.size(), quadNeighbours[0], workgroup.size());
RDCASSERT(quadNeighbours[1] < workgroup.size(), quadNeighbours[1], workgroup.size());
RDCASSERT(quadNeighbours[2] < workgroup.size(), quadNeighbours[2], workgroup.size());
RDCASSERT(quadNeighbours[3] < workgroup.size(), quadNeighbours[3], workgroup.size());
const bool xdirection = (dir == DDX);
if(type == Coarse)
{
// coarse derivatives are identical across the quad, based on the top-left.
a = &workgroup[0];
b = &workgroup[xdirection ? 1 : 2];
a = &workgroup[quadNeighbours[0]];
b = &workgroup[quadNeighbours[xdirection ? 1 : 2]];
}
else
{
// we need to figure out the exact pair to use
int x = workgroupIndex & 1;
int y = workgroupIndex / 2;
int x = quadLaneIndex & 1;
int y = quadLaneIndex / 2;
if(x == 0)
{
@@ -435,13 +446,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
// top-left
if(xdirection)
{
a = &workgroup[0];
b = &workgroup[1];
a = &workgroup[quadNeighbours[0]];
b = &workgroup[quadNeighbours[1]];
}
else
{
a = &workgroup[0];
b = &workgroup[2];
a = &workgroup[quadNeighbours[0]];
b = &workgroup[quadNeighbours[2]];
}
}
else
@@ -449,13 +460,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
// bottom-left
if(xdirection)
{
a = &workgroup[2];
b = &workgroup[3];
a = &workgroup[quadNeighbours[2]];
b = &workgroup[quadNeighbours[3]];
}
else
{
a = &workgroup[0];
b = &workgroup[2];
a = &workgroup[quadNeighbours[0]];
b = &workgroup[quadNeighbours[2]];
}
}
}
@@ -466,13 +477,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
// top-right
if(xdirection)
{
a = &workgroup[0];
b = &workgroup[1];
a = &workgroup[quadNeighbours[0]];
b = &workgroup[quadNeighbours[1]];
}
else
{
a = &workgroup[1];
b = &workgroup[3];
a = &workgroup[quadNeighbours[1]];
b = &workgroup[quadNeighbours[3]];
}
}
else
@@ -480,13 +491,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
// bottom-right
if(xdirection)
{
a = &workgroup[2];
b = &workgroup[3];
a = &workgroup[quadNeighbours[2]];
b = &workgroup[quadNeighbours[3]];
}
else
{
a = &workgroup[1];
b = &workgroup[3];
a = &workgroup[quadNeighbours[1]];
b = &workgroup[quadNeighbours[3]];
}
}
}
@@ -3068,7 +3079,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
case Op::TerminateInvocation:
case Op::Kill:
{
killed = true;
dead = true;
// destroy all stack frames
for(StackFrame *exitingFrame : callstack)
+13 -6
View File
@@ -175,7 +175,7 @@ class Debugger;
struct ThreadState
{
ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState);
ThreadState(Debugger &debug, const GlobalState &globalState);
~ThreadState();
void EnterEntryPoint(ShaderDebugState *state);
@@ -231,10 +231,16 @@ struct ThreadState
std::map<Id, uint32_t> lastWrite;
// index in the pixel quad
uint32_t workgroupIndex;
bool helperInvocation;
bool killed;
// quad ID (arbitrary, just used to find neighbours for derivatives)
uint32_t quadId = 0;
// index in the pixel quad (relative to the active lane)
uint32_t quadLaneIndex = ~0U;
// the lane indices of our quad neighbours
uint32_t quadNeighbours[4] = {~0U, ~0U, ~0U, ~0U};
// index in the workgroup
uint32_t workgroupIndex = 0;
bool helperInvocation = false;
bool dead = true;
const ShaderVariable &GetSrc(Id id) const;
void WritePointerValue(Id pointer, const ShaderVariable &val);
@@ -368,7 +374,8 @@ public:
ShaderDebugTrace *BeginDebug(DebugAPIWrapper *apiWrapper, const ShaderStage stage,
const rdcstr &entryPoint, const rdcarray<SpecConstant> &specInfo,
const std::map<size_t, uint32_t> &instructionLines,
const SPIRVPatchData &patchData, uint32_t activeIndex);
const SPIRVPatchData &patchData, uint32_t activeIndex,
uint32_t workgroupSize);
rdcarray<ShaderDebugState> ContinueDebug();
@@ -838,7 +838,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
const rdcstr &entryPoint,
const rdcarray<SpecConstant> &specInfo,
const std::map<size_t, uint32_t> &instructionLines,
const SPIRVPatchData &patchData, uint32_t activeIndex)
const SPIRVPatchData &patchData, uint32_t activeIndex,
uint32_t workgroupSize)
{
Id entryId = entryLookup[ShaderEntryPoint(entryPoint, shaderStage)];
@@ -895,9 +896,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
stage = shaderStage;
apiWrapper = api;
uint32_t workgroupSize = shaderStage == ShaderStage::Pixel ? 4 : 1;
for(uint32_t i = 0; i < workgroupSize; i++)
workgroup.push_back(ThreadState(i, *this, global));
workgroup.push_back(ThreadState(*this, global));
ThreadState &active = GetActiveLane();
@@ -1489,6 +1489,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
for(uint32_t i = 0; i < workgroupSize; i++)
{
ThreadState &lane = workgroup[i];
lane.workgroupIndex = i;
if(i != activeLaneIndex)
{
lane.nextInstruction = active.nextInstruction;
@@ -1499,16 +1500,68 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
if(stage == ShaderStage::Pixel)
{
ShaderVariable var(rdcstr(), 0U, 0U, 0U, 0U);
apiWrapper->FillInputValue(var, ShaderBuiltin::IsHelper, i, 0, 0);
lane.helperInvocation = var.value.u32v[0] != 0;
lane.helperInvocation = apiWrapper->GetThreadProperty(i, ThreadProperty::Helper) != 0;
lane.quadLaneIndex = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadLane);
lane.quadId = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadId);
}
lane.dead = apiWrapper->GetThreadProperty(i, ThreadProperty::Active) == 0;
// now that the globals are allocated and their storage won't move, we can take pointers to them
for(const PointerId &p : pointerIDs)
p.Set(*this, global, lane);
}
// find quad neighbours
//
// Lanes reporting the same non-zero quadId are grouped together. Every lane in a group then has
// quadNeighbours[q] set to the workgroup index of the group member whose quadLaneIndex is q.
// Entries for quad positions that no lane occupies are left at their initial value.
{
  // quad IDs that have already been resolved, so each quad is only handled once
  rdcarray<uint32_t> processedQuads;
  for(uint32_t i = 0; i < workgroupSize; i++)
  {
    uint32_t desiredQuad = workgroup[i].quadId;

    // ignore threads not in any quad
    if(desiredQuad == 0)
      continue;

    // quads are almost certainly sorted together, so shortcut by checking the last one
    if((!processedQuads.empty() && processedQuads.back() == desiredQuad) ||
       processedQuads.contains(desiredQuad))
      continue;

    processedQuads.push_back(desiredQuad);

    // find the threads. Lane i is the first member, up to three more are searched for after it.
    // Slots that are not filled stay as ~0U.
    uint32_t threads[4] = {
        i,
        ~0U,
        ~0U,
        ~0U,
    };
    for(uint32_t j = i + 1, t = 1; j < workgroupSize && t < 4; j++)
    {
      if(workgroup[j].quadId == desiredQuad)
        threads[t++] = j;
    }

    // now swizzle the threads to know each other
    for(uint32_t src = 0; src < 4; src++)
    {
      // fewer than four lanes may share this quad ID, in which case the remaining slots are
      // still ~0U and must not be used to index the workgroup
      if(threads[src] == ~0U)
        continue;

      uint32_t lane = workgroup[threads[src]].quadLaneIndex;

      // skip lanes that don't report a valid position within the quad
      if(lane >= 4)
        continue;

      for(uint32_t dst = 0; dst < 4; dst++)
      {
        if(threads[dst] == ~0U)
          continue;

        workgroup[threads[dst]].quadNeighbours[lane] = threads[src];
      }
    }
  }
}
// this contains all the accumulated line number information. Add in our disassembly mapping
ret->instInfo = m_InstInfo;
for(size_t i = 0; i < m_InstInfo.size(); i++)
+49 -26
View File
@@ -2968,8 +2968,8 @@ struct VertexLaneData
struct PixelLaneData
{
Vec4f fragCoord; // per-lane coord
uint32_t helper; // per-lane helper bit
Vec4f fragCoord; // per-lane coord
uint32_t isHelper; // per-lane helper bit
uint32_t padding[3];
};
@@ -3302,7 +3302,7 @@ static void CreateInputFetcher(rdcarray<uint32_t> &spv,
helper.base = helper.loadOps.add(rdcspv::OpSelect(uint32Type, editor.MakeId(), helper.base,
getUIntConst(1), getUIntConst(0)));
fixedValues.push_back(helper);
structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, helper)});
structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, isHelper)});
offset += sizeof(PixelLaneData);
}
@@ -3881,6 +3881,9 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u
apiWrapper->location_inputs.resize(numThreads);
apiWrapper->thread_builtins.resize(numThreads);
apiWrapper->thread_props.resize(numThreads);
apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;
std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
global_builtins[ShaderBuiltin::BaseInstance] =
@@ -4056,7 +4059,7 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u
rdcspv::Debugger *debugger = new rdcspv::Debugger;
debugger->Parse(shader.spirv.GetSPIRV());
ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Vertex, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, 0);
shadRefl.instructionLines, shadRefl.patchData, 0, 1);
apiWrapper->ResetReplay();
return ret;
@@ -4589,27 +4592,40 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
{
byte *value = LaneData + t * structStride;
apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1;
// read PixelLaneData
PixelLaneData *pixelData = (PixelLaneData *)value;
{
PixelLaneData *pixelData = (PixelLaneData *)value;
{
ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position];
var.rows = 1;
var.columns = 4;
var.type = VarType::Float;
memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f));
}
{
ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper];
var.rows = 1;
var.columns = 1;
var.type = VarType::Bool;
memcpy(var.value.u8v.data(), &pixelData->isHelper, sizeof(uint32_t));
}
if(numThreads == 4)
apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1;
apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Helper] =
t != winner->laneIndex ? 1 : 0;
apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadId] = 1000;
apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadLane] = t;
}
value += sizeof(PixelLaneData);
{
ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position];
var.rows = 1;
var.columns = 4;
var.type = VarType::Float;
memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f));
}
{
ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper];
var.rows = 1;
var.columns = 1;
var.type = VarType::Bool;
memcpy(var.value.u8v.data(), &pixelData->helper, sizeof(uint32_t));
}
for(size_t i = 0; i < shadRefl.refl->inputSignature.size(); i++)
{
@@ -4640,7 +4656,8 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
}
ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex);
shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex,
numThreads);
apiWrapper->ResetReplay();
}
else
@@ -4712,6 +4729,9 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId,
static const uint32_t numThreads = 1;
apiWrapper->thread_builtins.resize(numThreads);
apiWrapper->thread_props.resize(numThreads);
apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;
std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
global_builtins[ShaderBuiltin::DispatchSize] =
@@ -4736,7 +4756,7 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId,
rdcspv::Debugger *debugger = new rdcspv::Debugger;
debugger->Parse(shader.spirv.GetSPIRV());
ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Compute, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, 0);
shadRefl.instructionLines, shadRefl.patchData, 0, 1);
apiWrapper->ResetReplay();
return ret;
@@ -4800,6 +4820,9 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t eventId,
static const uint32_t numThreads = 1;
apiWrapper->thread_builtins.resize(numThreads);
apiWrapper->thread_props.resize(numThreads);
apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;
std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
global_builtins[ShaderBuiltin::DispatchSize] =
@@ -4824,7 +4847,7 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t eventId,
rdcspv::Debugger *debugger = new rdcspv::Debugger;
debugger->Parse(shader.spirv.GetSPIRV());
ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Mesh, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, 0);
shadRefl.instructionLines, shadRefl.patchData, 0, 1);
apiWrapper->ResetReplay();
return ret;