Change SPIRV debugger control flow to emulate maximal reconvergence

This commit is contained in:
Jake Turner
2025-02-19 09:42:10 +00:00
parent 7ca6c741ca
commit c1013a1b82
3 changed files with 169 additions and 83 deletions
+21 -1
View File
@@ -173,6 +173,11 @@ ThreadState::~ThreadState()
callstack.clear();
}
// Record `block` as the convergence point activated by the instruction being stepped.
// The block id is translated to the instruction index of its label and stored in
// convergenceInstruction.
void ThreadState::SetConvergencePoint(Id block)
{
  const uint32_t labelInstruction = debugger.GetInstructionForLabel(block);
  convergenceInstruction = labelInstruction;
}
bool ThreadState::Finished() const
{
return dead || callstack.empty();
@@ -618,7 +623,11 @@ void ThreadState::JumpToLabel(Id target)
frame->lastBlock = frame->curBlock;
frame->curBlock = target;
nextInstruction = debugger.GetInstructionForLabel(target) + 1;
diverged = true;
uint32_t labelInstruction = debugger.GetInstructionForLabel(target);
enteredPoints.push_back(labelInstruction);
nextInstruction = labelInstruction + 1;
// if jumping to an empty unconditional loop header, continue to the loop block
Iter it = debugger.GetIterForInstruction(nextInstruction);
@@ -627,6 +636,7 @@ void ThreadState::JumpToLabel(Id target)
OpLoopMerge merge(it);
mergeBlock = merge.mergeBlock;
SetConvergencePoint(merge.mergeBlock);
it++;
if(it.opcode() == Op::Branch)
@@ -699,6 +709,7 @@ void ThreadState::SkipIgnoredInstructions()
OpSelectionMerge merge(it);
mergeBlock = merge.mergeBlock;
SetConvergencePoint(merge.mergeBlock);
nextInstruction++;
continue;
@@ -709,6 +720,7 @@ void ThreadState::SkipIgnoredInstructions()
OpLoopMerge merge(it);
mergeBlock = merge.mergeBlock;
SetConvergencePoint(merge.mergeBlock);
nextInstruction++;
continue;
@@ -734,6 +746,10 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
Iter it = debugger.GetIterForInstruction(nextInstruction);
nextInstruction++;
diverged = false;
enteredPoints.clear();
convergenceInstruction = INVALID_EXECUTION_POINT;
functionReturnPoint = INVALID_EXECUTION_POINT;
OpDecoder opdata(it);
@@ -3879,6 +3895,8 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
// function. The second time we do have a return value so we process it and continue
if(returnValue.name.empty())
{
// The instruction after a function call is defined to be a convergence point
functionReturnPoint = nextInstruction;
uint32_t returnInstruction = nextInstruction - 1;
nextInstruction = debugger.GetInstructionForFunction(call.function);
@@ -3891,6 +3909,8 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
{
SetDst(call.result, returnValue);
returnValue.name.clear();
// The instruction after a function call is defined to be a convergence point, mark that we entered it
enteredPoints.push_back(nextInstruction);
}
break;
}
+11 -1
View File
@@ -27,6 +27,7 @@
#include "api/replay/rdcarray.h"
#include "maths/vec.h"
#include "spirv_common.h"
#include "spirv_controlflow.h"
#include "spirv_processor.h"
struct SPIRVInterfaceAccess;
@@ -225,12 +226,19 @@ struct ThreadState
// the id of the merge block that the last branch targeted
Id mergeBlock;
uint32_t convergenceInstruction;
uint32_t functionReturnPoint;
ShaderVariable returnValue;
rdcarray<StackFrame *> callstack;
// the list of IDs that are currently valid and live
rdcarray<Id> live;
// true if executed an operation which could trigger divergence
bool diverged;
// list of potential convergence points that were entered in a single step (used for tracking thread convergence)
rdcarray<uint32_t> enteredPoints;
std::map<Id, uint32_t> lastWrite;
// quad ID (arbitrary, just used to find neighbours for derivatives)
@@ -259,6 +267,7 @@ private:
bool ReferencePointer(Id id);
void SkipIgnoredInstructions();
void SetConvergencePoint(Id block);
ShaderDebugState *m_State = NULL;
};
@@ -498,7 +507,6 @@ private:
std::set<rdcstr> usedNames;
std::map<Id, rdcstr> dynamicNames;
void CalcActiveMask(rdcarray<bool> &activeMask);
struct
{
@@ -528,6 +536,8 @@ private:
rdcarray<LocalMapping> activeLocalMappings;
} m_DebugInfo;
rdcspv::ControlFlow controlFlow;
const ScopeData *GetScope(size_t offset) const;
};
@@ -1561,6 +1561,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
std::sort(liveGlobals.begin(), liveGlobals.end());
rdcarray<rdcspv::ThreadIndex> threadIds;
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
ThreadState &lane = workgroup[i];
@@ -1589,8 +1590,14 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
// now that the globals are allocated and their storage won't move, we can take pointers to them
for(const PointerId &p : pointerIDs)
p.Set(*this, global, lane);
// Only add active lanes to control flow
if(!lane.dead)
threadIds.push_back(i);
}
controlFlow.Construct(threadIds);
// find quad neighbours
{
rdcarray<uint32_t> processedQuads;
@@ -2462,6 +2469,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
if(steps == 0)
{
ShaderDebugState initial;
uint32_t startBlock = INVALID_EXECUTION_POINT;
// we should be sitting at the entry point function prologue, step forward into the first block
// and past any function-local variable declarations
@@ -2474,6 +2482,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
thread.EnterEntryPoint(&initial);
FillCallstack(thread, initial);
initial.nextInstruction = thread.nextInstruction;
startBlock = thread.callstack.back()->curBlock.value();
}
else
{
@@ -2495,6 +2504,21 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
ret.push_back(std::move(initial));
// Set the initial block for the threads in the root tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = controlFlow.GetTangles();
RDCASSERTEQUAL(tangles.size(), 1);
RDCASSERTNOTEQUAL(startBlock, INVALID_EXECUTION_POINT);
for(Tangle &tangle : tangles)
{
RDCASSERT(tangle.IsAliveActive());
for(uint32_t threadIdx = 0; threadIdx < workgroup.size(); ++threadIdx)
{
if(!workgroup[threadIdx].Finished())
threadExecutionStates[threadIdx].push_back(startBlock);
}
}
controlFlow.UpdateState(threadExecutionStates);
steps++;
}
@@ -2513,21 +2537,60 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
if(active.Finished())
break;
// calculate the current mask of which threads are active
CalcActiveMask(activeMask);
// Execute the threads in each active tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = controlFlow.GetTangles();
// step all active members of the workgroup
for(size_t lane = 0; lane < workgroup.size(); lane++)
bool anyActiveThreads = false;
for(Tangle &tangle : tangles)
{
ThreadState &thread = workgroup[lane];
if(!tangle.IsAliveActive())
continue;
if(activeMask[lane])
rdcarray<rdcspv::ThreadReference> threadRefs = tangle.GetThreadRefs();
// calculate the current active thread mask from the threads in the tangle
{
if(thread.nextInstruction >= instructionOffsets.size())
// one bool per workgroup thread
activeMask.resize(workgroup.size());
// start with all threads as inactive
for(size_t i = 0; i < workgroup.size(); i++)
activeMask[i] = false;
// activate the threads in the tangle
for(const rdcspv::ThreadReference &ref : threadRefs)
{
uint32_t idx = ref.id;
RDCASSERT(idx < workgroup.size(), idx, workgroup.size());
RDCASSERT(!workgroup[idx].Finished());
activeMask[idx] = true;
anyActiveThreads = true;
}
}
ExecutionPoint newConvergeInstruction = INVALID_EXECUTION_POINT;
ExecutionPoint newFunctionReturnPoint = INVALID_EXECUTION_POINT;
uint32_t countActiveThreads = 0;
uint32_t countDivergedThreads = 0;
uint32_t countConvergePointThreads = 0;
uint32_t countFunctionReturnThreads = 0;
// step all active members of the workgroup
for(size_t lane = 0; lane < workgroup.size(); lane++)
{
if(!activeMask[lane])
continue;
++countActiveThreads;
ThreadState &thread = workgroup[lane];
const uint32_t currentPC = thread.nextInstruction;
const uint32_t threadId = lane;
if(currentPC >= instructionOffsets.size())
{
if(lane == activeLaneIndex)
ret.emplace_back();
tangle.SetThreadDead(threadId);
continue;
}
@@ -2535,7 +2598,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
{
ShaderDebugState state;
size_t instOffs = instructionOffsets[thread.nextInstruction];
size_t instOffs = instructionOffsets[currentPC];
// see if we're retiring any IDs at this state
for(size_t l = 0; l < thread.live.size();)
@@ -2574,7 +2637,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
if(m_DebugInfo.valid)
{
size_t endOffs = instructionOffsets[thread.nextInstruction - 1];
size_t endOffs = instructionOffsets[currentPC - 1];
// append any inlined functions to the top of the stack
InlineData *inlined = m_DebugInfo.lineInline[endOffs];
@@ -2622,8 +2685,73 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
{
thread.StepNext(NULL, workgroup, activeMask);
}
threadExecutionStates[threadId] = thread.enteredPoints;
uint32_t threadConvergeInstruction = thread.convergenceInstruction;
// the thread activated a new convergence point
if(threadConvergeInstruction != INVALID_EXECUTION_POINT)
{
if(newConvergeInstruction == INVALID_EXECUTION_POINT)
{
newConvergeInstruction = threadConvergeInstruction;
RDCASSERTNOTEQUAL(newConvergeInstruction, INVALID_EXECUTION_POINT);
}
else
{
// All the threads in the tangle should set the same convergence point
RDCASSERTEQUAL(threadConvergeInstruction, newConvergeInstruction);
}
++countConvergePointThreads;
}
uint32_t threadFunctionReturnPoint = thread.functionReturnPoint;
// the thread activated a new function return point
if(threadFunctionReturnPoint != INVALID_EXECUTION_POINT)
{
if(newFunctionReturnPoint == INVALID_EXECUTION_POINT)
{
newFunctionReturnPoint = threadFunctionReturnPoint;
RDCASSERTNOTEQUAL(newFunctionReturnPoint, INVALID_EXECUTION_POINT);
}
else
{
// All the threads in the tangle should set the same function return point
RDCASSERTEQUAL(threadFunctionReturnPoint, newFunctionReturnPoint);
}
++countFunctionReturnThreads;
}
if(thread.Finished())
tangle.SetThreadDead(threadId);
if(thread.diverged)
++countDivergedThreads;
}
if(countConvergePointThreads)
{
// all the active threads should have a convergence point if any have one
RDCASSERTEQUAL(countConvergePointThreads, countActiveThreads);
tangle.AddMergePoint(newConvergeInstruction);
}
if(countFunctionReturnThreads)
{
// all the active threads should have a function return point if any have one
RDCASSERTEQUAL(countFunctionReturnThreads, countActiveThreads);
tangle.AddFunctionReturnPoint(newFunctionReturnPoint);
}
if(countDivergedThreads)
{
// all the active threads should have diverged if any diverges
RDCASSERTEQUAL(countDivergedThreads, countActiveThreads);
tangle.SetDiverged(true);
}
}
if(!anyActiveThreads)
{
active.dead = true;
controlFlow.UpdateState(threadExecutionStates);
RDCERR("No active threads in any tangle, killing active thread to terminate the debugger");
}
controlFlow.UpdateState(threadExecutionStates);
}
return ret;
@@ -3426,78 +3554,6 @@ rdcstr Debugger::GetHumanName(Id id)
return name;
}
// Fill activeMask with one flag per workgroup thread saying whether that thread should execute
// on the next step.
//
// Threads that converge again must stay in lockstep so that derivatives and similar operations
// remain valid; while diverged no lockstep is required because derivatives are invalid there.
// This relies on SPIR-V structured control flow: divergence only happens at a branch, which is
// preceded by an OpLoopMerge/OpSelectionMerge naming the merge block.
//
//  * While every thread agrees on nextInstruction and no converge block is pending, execution is
//    uniform and every unfinished thread runs.
//  * The first time the threads disagree, the merge block most recently recorded by thread 0
//    becomes the converge block.
//  * While a converge block is pending, threads already sitting in it are paused and the rest
//    keep running, until no unfinished thread remains outside it. At that point the converge
//    block is forgotten and uniform execution resumes.
void Debugger::CalcActiveMask(rdcarray<bool> &activeMask)
{
  const size_t threadCount = workgroup.size();

  // baseline: a thread can run if and only if it has not finished
  activeMask.resize(threadCount);
  for(size_t t = 0; t < threadCount; t++)
    activeMask[t] = !workgroup[t].Finished();

  // a pending converge block means divergence started on an earlier step
  const bool alreadyDiverged = convergeBlock != Id();

  // divergence is detected as soon as any thread disagrees with thread 0 about what runs next
  bool nowDiverged = false;
  for(size_t t = 1; t < threadCount; t++)
  {
    if(workgroup[0].nextInstruction != workgroup[t].nextInstruction)
    {
      nowDiverged = true;
      break;
    }
  }

  // still uniform: the baseline mask is the answer
  if(!alreadyDiverged && !nowDiverged)
    return;

  if(!alreadyDiverged)
  {
    // newly diverged: every active thread is expected to share the same merge block, which is
    // where execution becomes uniform again
    convergeBlock = workgroup[0].mergeBlock;
    for(size_t i = 1; i < threadCount; i++)
      RDCASSERT(!activeMask[i] || convergeBlock == workgroup[i].mergeBlock);
  }

  // work out which threads have already arrived in the converge block, and whether any
  // unfinished thread is still outside it
  rdcarray<bool> arrived;
  arrived.resize(threadCount);
  bool someoneStillRunning = false;
  for(size_t t = 0; t < threadCount; t++)
  {
    arrived[t] =
        (!workgroup[t].callstack.empty() && workgroup[t].callstack.back()->curBlock == convergeBlock);

    if(activeMask[t] && !arrived[t])
      someoneStillRunning = true;
  }

  if(!someoneStillRunning)
  {
    // everyone has caught up: keep the baseline mask and forget the convergence point so that
    // execution proceeds as normal
    convergeBlock = Id();
    return;
  }

  // pause the threads that are waiting in the converge block; only the others run this step
  for(size_t t = 0; t < threadCount; t++)
  {
    if(arrived[t])
      activeMask[t] = false;
  }
}
void Debugger::AllocateVariable(Id id, Id typeId, ShaderVariable &outVar)
{
// allocs should always be pointers