Change DXIL debugger control flow to use rdcshaders::ControlFlow

This emulates maximal re-convergence behaviour and matches how the SPIR-V shader debugger control flow works.

Remove the quick fix for discard to skip over degenerate branches
Remove StepOverDegenerateBranch() method
This commit is contained in:
Jake Turner
2025-04-03 09:48:51 +01:00
parent ee4a2c40a9
commit c83be4088a
2 changed files with 181 additions and 95 deletions
+170 -92
View File
@@ -27,10 +27,13 @@
#include "core/settings.h"
#include "maths/formatpacking.h"
#include "replay/common/var_dispatch_helpers.h"
#include "shaders/controlflow.h"
RDOC_CONFIG(bool, D3D12_DXILShaderDebugger_Logging, false,
"Debug logging for the DXIL shader debugger");
using namespace rdcshaders;
// TODO: Extend support for Compound Constants: arithmetic, logical ops
// TODO: Assert m_Block in ThreadState is correct per instruction
// TODO: Automatically execute phi instructions after a branch
@@ -1653,7 +1656,7 @@ bool IsNopInstruction(const Instruction &inst)
return false;
}
bool ThreadState::JumpToBlock(const Block *target)
bool ThreadState::JumpToBlock(const Block *target, bool divergencePoint)
{
m_PreviousBlock = m_Block;
m_PhiVariables.clear();
@@ -1680,6 +1683,24 @@ bool ThreadState::JumpToBlock(const Block *target)
uint32_t nextInstruction = m_FunctionInfo->globalInstructionOffset + m_FunctionInstructionIdx;
if(m_State && !m_Ended)
m_State->nextInstruction = nextInstruction;
m_EnteredPoints.push_back(m_Block);
RDCASSERTEQUAL(m_FunctionInfo->divergentBlocks.contains(m_PreviousBlock), divergencePoint);
if(divergencePoint)
{
m_Diverged = true;
RDCASSERTEQUAL(m_ConvergencePoint, INVALID_EXECUTION_POINT);
for(const ConvergentBlockData &convergentBlock : m_FunctionInfo->convergentBlocks)
{
if(convergentBlock.first == m_PreviousBlock)
{
m_ConvergencePoint = convergentBlock.second;
break;
}
}
RDCASSERTNOTEQUAL(m_ConvergencePoint, INVALID_EXECUTION_POINT);
}
return true;
}
@@ -2991,9 +3012,6 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
m_Dead = true;
return true;
}
// Quick fix : need maximal reconvergence style control flow to handle discard properly
// * If the next instruction is a de-generate jump then skip over it
StepOverDegenerateBranch();
}
break;
}
@@ -4147,8 +4165,10 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
// Branch <label>
// Branch <label_true> <label_false> <BOOL_VAR>
uint32_t targetArg = 0;
bool divergencePoint = false;
if(inst.args.size() > 1)
{
divergencePoint = cast<Block>(inst.args[0])->id != cast<Block>(inst.args[1])->id;
ShaderVariable cond;
RDCASSERT(GetShaderVariable(inst.args[2], opCode, dxOpCode, cond));
if(!cond.value.u32v[0])
@@ -4156,7 +4176,7 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
}
const Block *target = cast<Block>(inst.args[targetArg]);
if(!JumpToBlock(target))
if(!JumpToBlock(target, divergencePoint))
RDCERR("Unknown branch target %u '%s'", m_Block, GetArgumentName(targetArg).c_str());
break;
}
@@ -5205,6 +5225,17 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
ShaderVariable val;
RDCASSERT(GetShaderVariable(inst.args[0], opCode, dxOpCode, val));
uint32_t targetArg = 1;
bool divergencePoint = false;
const uint32_t defaultBlockId = cast<Block>(inst.args[1])->id;
for(uint32_t a = 2; a < inst.args.size(); a += 2)
{
const uint32_t targetBlockId = cast<Block>(inst.args[a + 1])->id;
if(targetBlockId != defaultBlockId)
{
divergencePoint = true;
break;
}
}
for(uint32_t a = 2; a < inst.args.size(); a += 2)
{
ShaderVariable targetVal;
@@ -5224,7 +5255,7 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
}
const Block *target = cast<Block>(inst.args[targetArg]);
if(!JumpToBlock(target))
if(!JumpToBlock(target, divergencePoint))
RDCERR("Unknown switch target %u '%s'", m_Block, GetArgumentName(targetArg).c_str());
break;
}
@@ -5448,37 +5479,6 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
return true;
}
// Deleted by this commit: the old stopgap used before proper maximal-reconvergence
// control flow existed. After a discard it skipped over a "degenerate" unconditional
// branch, i.e. a jump straight to the block that immediately follows the current one.
void ThreadState::StepOverDegenerateBranch()
{
// Nothing to do if this thread has already finished executing.
if(m_Ended)
return;
uint32_t funcInstrIdx = m_FunctionInstructionIdx;
while(true)
{
RDCASSERT(funcInstrIdx < m_FunctionInfo->function->instructions.size());
const Instruction *inst = m_FunctionInfo->function->instructions[funcInstrIdx];
// Skip past no-op instructions to find the next real instruction.
if(IsNopInstruction(*inst))
{
funcInstrIdx++;
continue;
}
// Only an unconditional/conditional Branch can be degenerate; anything else means
// there is nothing to step over.
if(inst->op != Operation::Branch)
{
return;
}
const Block *target = cast<Block>(inst->args[0]);
RDCASSERT(target);
uint32_t blockId = target->id;
// Degenerate case: the branch target is the block immediately after the current
// block, so take the jump here rather than reporting it as a separate step.
if(blockId == m_Block + 1)
{
// NOTE(review): asserts JumpToBlock returns false for this fall-through jump —
// presumably the pre-change JumpToBlock signalled fall-through this way; confirm
// against the pre-commit sources.
RDCASSERT(!JumpToBlock(target));
return;
}
return;
}
}
void ThreadState::StepOverNopInstructions()
{
if(m_Ended)
@@ -5502,6 +5502,9 @@ void ThreadState::StepNext(ShaderDebugState *state, DebugAPIWrapper *apiWrapper,
const rdcarray<ThreadState> &workgroup, const rdcarray<bool> &activeMask)
{
m_State = state;
m_Diverged = false;
m_EnteredPoints.clear();
m_ConvergencePoint = INVALID_EXECUTION_POINT;
RDCASSERTEQUAL(m_ActiveGlobalInstructionIdx,
m_FunctionInfo->globalInstructionOffset + m_FunctionInstructionIdx);
@@ -6454,43 +6457,6 @@ rdcstr Debugger::GetResourceReferenceName(const DXIL::Program *program,
return "UNKNOWN_RESOURCE_HANDLE";
}
// member functions
// Deleted by this commit: computes one active flag per workgroup thread for the next
// simulation step, replaced by rdcshaders::ControlFlow tangle tracking.
void Debugger::CalcActiveMask(rdcarray<bool> &activeMask)
{
// one bool per workgroup thread
activeMask.resize(m_Workgroup.size());
// mark any threads that have finished as inactive, otherwise they're active
for(size_t i = 0; i < m_Workgroup.size(); i++)
activeMask[i] = !m_Workgroup[i].Finished();
// only pixel shaders automatically converge workgroups, compute shaders need explicit sync
if(m_Stage != ShaderStage::Pixel)
return;
// Not diverged then all active
if(!ThreadState::WorkgroupIsDiverged(m_Workgroup))
return;
bool anyActive = false;
for(size_t i = 0; i < m_Workgroup.size(); i++)
{
if(!activeMask[i])
continue;
// The workgroup has diverged: only threads outside a uniform block keep running;
// threads sitting in a uniform block are paused (presumably until the diverged
// threads re-converge — confirm against WorkgroupIsDiverged/InUniformBlock).
activeMask[i] = !m_Workgroup[i].InUniformBlock();
anyActive |= activeMask[i];
}
// Fail-safe: if the divergence heuristic paused every thread, force all unfinished
// threads to run so the debugger cannot deadlock.
if(!anyActive)
{
RDCERR("No active threads, forcing all unfinished threads to run");
for(size_t i = 0; i < m_Workgroup.size(); i++)
activeMask[i] = !m_Workgroup[i].Finished();
}
return;
}
ScopedDebugData *Debugger::FindScopedDebugData(const DXIL::Metadata *md) const
{
for(ScopedDebugData *s : m_DebugInfo.scopedDebugDatas)
@@ -7955,6 +7921,8 @@ ShaderDebugTrace *Debugger::BeginDebug(uint32_t eventId, const DXBC::DXBCContain
controlFlow.Construct(links);
info.uniformBlocks = controlFlow.GetUniformBlocks();
info.divergentBlocks = controlFlow.GetDivergentBlocks();
info.convergentBlocks = controlFlow.GetConvergentBlocks();
const rdcarray<uint32_t> loopBlocks = controlFlow.GetLoopBlocks();
// Handle de-generate case when a single block
@@ -8499,7 +8467,12 @@ void Debugger::InitialiseWorkgroup(const rdcarray<ThreadProperties> &workgroupPr
const uint32_t threadsInWorkgroup = (uint32_t)m_Workgroup.size();
if(threadsInWorkgroup == 1)
{
rdcarray<ThreadIndex> threadIds;
threadIds.push_back(0);
m_ControlFlow.Construct(threadIds);
return;
}
if(threadsInWorkgroup != workgroupProperties.size())
{
@@ -8508,6 +8481,7 @@ void Debugger::InitialiseWorkgroup(const rdcarray<ThreadProperties> &workgroupPr
return;
}
rdcarray<ThreadIndex> threadIds;
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
ThreadState &lane = m_Workgroup[i];
@@ -8521,8 +8495,14 @@ void Debugger::InitialiseWorkgroup(const rdcarray<ThreadProperties> &workgroupPr
lane.m_Dead = workgroupProperties[i][ThreadProperty::Active] == 0;
lane.m_SubgroupIdx = workgroupProperties[i][ThreadProperty::SubgroupIdx];
// Only add active lanes to control flow
if(!lane.m_Dead)
threadIds.push_back(i);
}
m_ControlFlow.Construct(threadIds);
// find quad neighbours
{
rdcarray<uint32_t> processedQuads;
@@ -8584,6 +8564,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
if(m_Steps == 0)
{
ShaderDebugState initial;
uint32_t startPoint = INVALID_EXECUTION_POINT;
for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
{
@@ -8594,6 +8575,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
thread.EnterEntryPoint(m_EntryPointFunction, &initial);
thread.FillCallstack(initial);
initial.nextInstruction = thread.m_ActiveGlobalInstructionIdx;
startPoint = initial.nextInstruction;
}
else
{
@@ -8611,6 +8593,21 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
ret.push_back(std::move(initial));
// Set the initial execution point for the threads in the root tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = m_ControlFlow.GetTangles();
RDCASSERTEQUAL(tangles.size(), 1);
RDCASSERTNOTEQUAL(startPoint, INVALID_EXECUTION_POINT);
for(Tangle &tangle : tangles)
{
RDCASSERT(tangle.IsAliveActive());
for(uint32_t threadIdx = 0; threadIdx < m_Workgroup.size(); ++threadIdx)
{
if(!m_Workgroup[threadIdx].Finished())
threadExecutionStates[threadIdx].push_back(startPoint);
}
}
m_ControlFlow.UpdateState(threadExecutionStates);
m_Steps++;
}
@@ -8625,21 +8622,59 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
if(active.Finished())
break;
// calculate the current mask of which threads are active
CalcActiveMask(activeMask);
// Execute the threads in each active tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = m_ControlFlow.GetTangles();
// step all active members of the workgroup
ShaderDebugState state;
bool hasDebugState = false;
for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
bool anyActiveThreads = false;
for(Tangle &tangle : tangles)
{
if(activeMask[lane])
if(!tangle.IsAliveActive())
continue;
rdcarray<ThreadReference> threadRefs = tangle.GetThreadRefs();
// calculate the current active thread mask from the threads in the tangle
{
// one bool per workgroup thread
activeMask.resize(m_Workgroup.size());
// start with all threads as inactive
for(size_t i = 0; i < m_Workgroup.size(); i++)
activeMask[i] = false;
// activate the threads in the tangle
for(const ThreadReference &ref : threadRefs)
{
uint32_t idx = ref.id;
RDCASSERT(idx < m_Workgroup.size(), idx, m_Workgroup.size());
RDCASSERT(!m_Workgroup[idx].Finished());
activeMask[idx] = true;
anyActiveThreads = true;
}
}
ExecutionPoint newConvergencePoint = INVALID_EXECUTION_POINT;
uint32_t countActiveThreads = 0;
uint32_t countDivergedThreads = 0;
uint32_t countConvergePointThreads = 0;
// step all active members of the workgroup
ShaderDebugState state;
bool hasDebugState = false;
for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
{
if(!activeMask[lane])
continue;
++countActiveThreads;
ThreadState &thread = m_Workgroup[lane];
const uint32_t threadId = (uint32_t)lane;
if(thread.Finished())
{
if(lane == m_ActiveLaneIndex)
ret.emplace_back();
tangle.SetThreadDead(threadId);
continue;
}
@@ -8654,21 +8689,64 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
{
thread.StepNext(NULL, apiWrapper, m_Workgroup, activeMask);
}
threadExecutionStates[threadId] = thread.m_EnteredPoints;
uint32_t threadConvergencePoint = thread.m_ConvergencePoint;
// the thread activated a new convergence point
if(threadConvergencePoint != INVALID_EXECUTION_POINT)
{
if(newConvergencePoint == INVALID_EXECUTION_POINT)
{
newConvergencePoint = threadConvergencePoint;
RDCASSERTNOTEQUAL(newConvergencePoint, INVALID_EXECUTION_POINT);
}
else
{
// All the threads in the tangle should set the same convergence point
RDCASSERTEQUAL(threadConvergencePoint, newConvergencePoint);
}
++countConvergePointThreads;
}
if(thread.Finished())
tangle.SetThreadDead(threadId);
if(thread.m_Diverged)
++countDivergedThreads;
}
for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
{
if(activeMask[lane])
m_Workgroup[lane].StepOverNopInstructions();
}
// Update UI state after the execute and step over nops to make sure state.nextInstruction is in sync
if(hasDebugState)
{
ThreadState &thread = m_Workgroup[m_ActiveLaneIndex];
state.nextInstruction = thread.m_ActiveGlobalInstructionIdx;
thread.FillCallstack(state);
ret.push_back(std::move(state));
}
if(countConvergePointThreads)
{
// all the active threads should have a convergence point if any have one
RDCASSERTEQUAL(countConvergePointThreads, countActiveThreads);
tangle.AddMergePoint(newConvergencePoint);
}
if(countDivergedThreads)
{
// all the active threads should have diverged if any diverges
RDCASSERTEQUAL(countDivergedThreads, countActiveThreads);
tangle.SetDiverged(true);
}
}
for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
if(!anyActiveThreads)
{
if(activeMask[lane])
m_Workgroup[lane].StepOverNopInstructions();
}
// Update UI state after the execute and step over nops to make sure state.nextInstruction is in sync
if(hasDebugState)
{
ThreadState &thread = m_Workgroup[m_ActiveLaneIndex];
state.nextInstruction = thread.m_ActiveGlobalInstructionIdx;
thread.FillCallstack(state);
ret.push_back(std::move(state));
active.m_Dead = true;
m_ControlFlow.UpdateState(threadExecutionStates);
RDCERR("No active threads in any tangle, killing active thread to terminate the debugger");
}
m_ControlFlow.UpdateState(threadExecutionStates);
}
return ret;
}
+11 -3
View File
@@ -29,6 +29,7 @@
#include "driver/shaders/dxbc/dx_debug.h"
#include "driver/shaders/dxbc/dxbc_bytecode.h"
#include "driver/shaders/dxbc/dxbc_container.h"
#include "shaders/controlflow.h"
#include "dxil_bytecode.h"
#include "dxil_controlflow.h"
#include "dxil_debuginfo.h"
@@ -100,6 +101,8 @@ struct FunctionInfo
PhiReferencedIdsPerBlock phiReferencedIdsPerBlock;
uint32_t globalInstructionOffset = ~0U;
rdcarray<uint32_t> uniformBlocks;
rdcarray<uint32_t> divergentBlocks;
rdcarray<DXIL::ConvergentBlockData> convergentBlocks;
DXIL::ControlFlow controlFlow;
std::map<uint32_t, Callstack> callstacks;
rdcarray<uint32_t> instructionToBlock;
@@ -239,12 +242,11 @@ struct ThreadState
void StepNext(ShaderDebugState *state, DebugAPIWrapper *apiWrapper,
const rdcarray<ThreadState> &workgroup, const rdcarray<bool> &activeMask);
void StepOverNopInstructions();
void StepOverDegenerateBranch();
bool Finished() const;
bool InUniformBlock() const;
bool JumpToBlock(const DXIL::Block *target);
bool JumpToBlock(const DXIL::Block *target, bool divergencePoint);
bool ExecuteInstruction(DebugAPIWrapper *apiWrapper, const rdcarray<ThreadState> &workgroup,
const rdcarray<bool> &activeMask);
@@ -353,6 +355,12 @@ struct ThreadState
// The global PC of the active instruction that was or will be executed on the current simulation step
uint32_t m_ActiveGlobalInstructionIdx = 0;
// true if executed an operation which could trigger divergence
bool m_Diverged;
// list of potential convergence points that were entered in a single step (used for tracking thread convergence)
rdcarray<uint32_t> m_EnteredPoints;
uint32_t m_ConvergencePoint;
// SSA Ids guaranteed to be greater than 0 and less than this value
uint32_t m_MaxSSAId;
@@ -592,7 +600,6 @@ public:
}
private:
void CalcActiveMask(rdcarray<bool> &activeMask);
void ParseDbgOpDeclare(const DXIL::Instruction &inst, uint32_t instructionIndex);
void ParseDbgOpValue(const DXIL::Instruction &inst, uint32_t instructionIndex);
const DXIL::Metadata *GetMDScope(const DXIL::Metadata *scopeMD) const;
@@ -604,6 +611,7 @@ private:
rdcarray<ThreadState> m_Workgroup;
std::map<const DXIL::Function *, FunctionInfo> m_FunctionInfos;
rdcshaders::ControlFlow m_ControlFlow;
// the live mutable global variables, to initialise a stack frame's live list
rdcarray<bool> m_LiveGlobals;