DXIL Debugger can simulate multiple instructions per step

2026-07-28 18:31:40 +00:00 · 2025-10-25 13:36:33 +01:00
parent fe8e63f7ec
commit 0243927d43
2 changed files with 190 additions and 35 deletions
@@ -1939,6 +1939,7 @@ void ThreadState::EnterEntryPoint(const Function *function, bool hasDebugState)
  }

  m_HasDebugState = false;
+  UpdateCurrentInstruction();
 }

 // Must be called from the replay manager thread (the debugger thread)
@@ -4126,7 +4127,8 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            if(lane < workgroup.size())
            {
              ShaderVariable var;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, var));
+              RDCASSERT(
+                  GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, var));
              result.value = var.value;
            }
            else
@@ -4146,7 +4148,8 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            if(lane < workgroup.size())
            {
              ShaderVariable var;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, var));
+              RDCASSERT(
+                  GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, var));
              result.value = var.value;
            }
            else
@@ -4190,7 +4193,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)

              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              for(uint8_t c = 0; c < x.columns; c++)
              {
@@ -4267,7 +4270,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)

              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));
              count += x.value.u32v[0];
            }

@@ -4333,7 +4336,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            {
              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              if(dxOpCode == DXOp::WaveAnyTrue)
              {
@@ -4415,7 +4418,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            {
              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              for(uint8_t c = 0; c < x.columns; c++)
              {
@@ -4546,7 +4549,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            {
              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              for(uint8_t c = 0; c < x.columns; c++)
              {
@@ -4611,7 +4614,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            {
              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              bool matches = true;
              for(uint8_t c = 0; c < x.columns; c++)
@@ -4691,7 +4694,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)

              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));
              for(uint8_t c = 0; c < x.columns; c++)
              {
                switch(waveMultiPrefixOpCode)
@@ -4804,7 +4807,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)

              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));
              count += x.value.u32v[0];
            }

@@ -4893,7 +4896,8 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
            if(lane < workgroup.size())
            {
              ShaderVariable var;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, var));
+              RDCASSERT(
+                  GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, var));
              result.value = var.value;
            }
            else
@@ -4931,7 +4935,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)

              RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
              ShaderVariable x;
-              RDCASSERT(workgroup[lane].GetShaderVariable(inst.args[1], opCode, dxOpCode, x));
+              RDCASSERT(GetShaderVariableFromLane(workgroup[lane], inst.args[1], opCode, dxOpCode, x));

              switch(quadVoteOp)
              {
@@ -6756,9 +6760,11 @@ void ThreadState::StepNext(bool hasDebugState, const rdcarray<ThreadState> &work
  m_HasDebugState = false;
 }

+// When getting live variables: this must be a thread safe operation using only thread safe containers
 bool ThreadState::GetShaderVariableHelper(const DXIL::Value *dxilValue, DXIL::Operation op,
                                          DXIL::DXOp dxOpCode, ShaderVariable &var,
-                                          bool flushDenormInput, bool isLive) const
+                                          bool flushDenormInput, bool isLive,
+                                          bool ignoreLiveCheck) const
 {
  var.name.clear();
  var.members.clear();
@@ -6863,7 +6869,7 @@ bool ThreadState::GetShaderVariableHelper(const DXIL::Value *dxilValue, DXIL::Op
  if(const Instruction *inst = cast<Instruction>(dxilValue))
  {
    if(isLive)
-      return GetLiveVariable(inst->slot, op, dxOpCode, var);
+      return GetLiveVariable(inst->slot, op, dxOpCode, ignoreLiveCheck, var);
    else
      return GetPhiVariable(inst->slot, op, dxOpCode, var);
  }
@@ -6899,11 +6905,13 @@ ShaderVariable ThreadState::GetBuiltin(ShaderBuiltin builtin) const
  return {};
 }

-bool ThreadState::GetLiveVariable(const Id &id, Operation op, DXOp dxOpCode, ShaderVariable &var) const
+// This must be a thread safe operation using only thread safe containers
+bool ThreadState::GetLiveVariable(const Id &id, Operation op, DXOp dxOpCode, bool ignoreLiveCheck,
+                                  ShaderVariable &var) const
 {
  if(id < m_Live.size())
  {
-    RDCASSERT(m_Live[id]);
+    RDCASSERT(ignoreLiveCheck || m_Live[id]);
  }
  else
  {
@@ -7476,8 +7484,10 @@ ShaderValue ThreadState::DDX(bool fine, Operation opCode, DXOp dxOpCode,

  ShaderVariable a;
  ShaderVariable b;
-  RDCASSERT(workgroup[m_QuadNeighbours[index + 1]].GetShaderVariable(dxilValue, opCode, dxOpCode, a));
-  RDCASSERT(workgroup[m_QuadNeighbours[index]].GetShaderVariable(dxilValue, opCode, dxOpCode, b));
+  RDCASSERT(GetShaderVariableFromLane(workgroup[m_QuadNeighbours[index + 1]], dxilValue, opCode,
+                                      dxOpCode, a));
+  RDCASSERT(GetShaderVariableFromLane(workgroup[m_QuadNeighbours[index + 0]], dxilValue, opCode,
+                                      dxOpCode, b));
  Sub(a, b, ret);
  return ret;
 }
@@ -7520,8 +7530,10 @@ ShaderValue ThreadState::DDY(bool fine, Operation opCode, DXOp dxOpCode,

  ShaderVariable a;
  ShaderVariable b;
-  RDCASSERT(workgroup[m_QuadNeighbours[index + 2]].GetShaderVariable(dxilValue, opCode, dxOpCode, a));
-  RDCASSERT(workgroup[m_QuadNeighbours[index]].GetShaderVariable(dxilValue, opCode, dxOpCode, b));
+  RDCASSERT(GetShaderVariableFromLane(workgroup[m_QuadNeighbours[index + 2]], dxilValue, opCode,
+                                      dxOpCode, a));
+  RDCASSERT(GetShaderVariableFromLane(workgroup[m_QuadNeighbours[index + 0]], dxilValue, opCode,
+                                      dxOpCode, b));
  Sub(a, b, ret);
  return ret;
 }
@@ -7594,12 +7606,12 @@ bool ThreadState::WorkgroupIsDiverged(const rdcarray<ThreadState> &workgroup)
      continue;
    if(block0 == ~0U)
    {
-      block0 = workgroup[i].m_Block;
+      block0 = workgroup[i].m_CurrentBlock;
      instr0 = workgroup[i].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
-    if(workgroup[i].m_Block != block0)
+    if(workgroup[i].m_CurrentBlock != block0)
      return true;
    // not executing the same instruction
    if(workgroup[i].m_CurrentGlobalInstructionIdx != instr0)
@@ -7619,12 +7631,12 @@ bool ThreadState::SubgroupIsDiverged(const rdcarray<ThreadState> &workgroup,
      continue;
    if(block0 == ~0U)
    {
-      block0 = workgroup[lane].m_Block;
+      block0 = workgroup[lane].m_CurrentBlock;
      instr0 = workgroup[lane].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
-    if(workgroup[lane].m_Block != block0)
+    if(workgroup[lane].m_CurrentBlock != block0)
      return true;
    // not executing the same instruction
    if(workgroup[lane].m_CurrentGlobalInstructionIdx != instr0)
@@ -7651,12 +7663,12 @@ bool ThreadState::QuadIsDiverged(const rdcarray<ThreadState> &workgroup,
      continue;
    if(block0 == ~0U)
    {
-      block0 = workgroup[i].m_Block;
+      block0 = workgroup[i].m_CurrentBlock;
      instr0 = workgroup[i].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
-    if(workgroup[i].m_Block != block0)
+    if(workgroup[i].m_CurrentBlock != block0)
      return true;
    // not executing the same instruction
    if(workgroup[i].m_CurrentGlobalInstructionIdx != instr0)
@@ -7665,6 +7677,117 @@ bool ThreadState::QuadIsDiverged(const rdcarray<ThreadState> &workgroup,
  return false;
 }

+// Returns false when it is not safe to run another step; the conditions are based on the current
+// simulation state, the instruction in m_CurrentInstruction and the next instruction to simulate
+bool ThreadState::CanRunAnotherStep() const
+{
+  // Thread has finished
+  if(Finished())
+    return false;
+
+  // Current Simulated State that prevents running another step:
+  // Any control flow state changes i.e. branch, convergence point, partial convergence
+  if(m_Diverged)
+    return false;
+  if(!m_EnteredPoints.empty())
+    return false;
+  if(m_ConvergencePoint != INVALID_EXECUTION_POINT)
+    return false;
+  if(!m_PartialConvergencePoints.empty())
+    return false;
+
+  // Current instruction (m_CurrentInstruction): a Barrier DXOp call requires full lockstep
+  const Instruction *inst = m_CurrentInstruction;
+  Operation opCode = inst->op;
+  DXOp dxOpCode = DXOp::NumOpCodes;
+  switch(opCode)
+  {
+    case Operation::Call:
+    {
+      const Function *callFunc = inst->getFuncCall();
+      if(callFunc->family == FunctionFamily::DXOp)
+      {
+        RDCASSERT(getival<DXOp>(inst->args[0], dxOpCode));
+        RDCASSERT(dxOpCode < DXOp::NumOpCodes, dxOpCode, DXOp::NumOpCodes);
+        switch(dxOpCode)
+        {
+          // no thread can continue until all threads execute the barrier
+          case DXOp::Barrier: return false;
+          default: break;
+        }
+      }
+    }    // no break here: control falls through to the default case, which only breaks
+    default: break;
+  }
+
+  // Next instructions that prevent running another step:
+  // any instruction that requires threads in the tangle to be in lockstep
+  inst = m_FunctionInfo->function->instructions[m_FunctionInstructionIdx];
+  opCode = inst->op;
+  dxOpCode = DXOp::NumOpCodes;
+  switch(opCode)
+  {
+    case Operation::Call:
+    {
+      const Function *callFunc = inst->getFuncCall();
+      if(callFunc->family == FunctionFamily::DXOp)
+      {
+        RDCASSERT(getival<DXOp>(inst->args[0], dxOpCode));
+        RDCASSERT(dxOpCode < DXOp::NumOpCodes, dxOpCode, DXOp::NumOpCodes);
+        switch(dxOpCode)
+        {
+          // thread barriers require threads in the tangle to be in lockstep
+          case DXOp::Barrier:
+            return false;
+            // Image operations require threads in the tangle to be in lockstep
+          case DXOp::Sample:
+          case DXOp::SampleBias:
+          case DXOp::SampleLevel:
+          case DXOp::SampleGrad:
+          case DXOp::SampleCmp:
+          case DXOp::SampleCmpBias:
+          case DXOp::SampleCmpLevel:
+          case DXOp::SampleCmpGrad:
+          case DXOp::SampleCmpLevelZero:
+          case DXOp::TextureGather:
+          case DXOp::TextureGatherCmp:
+          case DXOp::CalculateLOD: return false;
+          case DXOp::TextureLoad:
+            // TextureLoad does not require derivatives, does not have to be in lockstep
+            return true;
+            // derivatives require threads in the tangle to be in lockstep
+          case DXOp::DerivCoarseX:
+          case DXOp::DerivCoarseY:
+          case DXOp::DerivFineX:
+          case DXOp::DerivFineY: return false;
+          // wave/subgroup ops require threads in the tangle to be in lockstep
+          case DXOp::WaveIsFirstLane:
+          case DXOp::WaveReadLaneAt:
+          case DXOp::WaveReadLaneFirst:
+          case DXOp::WavePrefixOp:
+          case DXOp::WavePrefixBitCount:
+          case DXOp::WaveAllBitCount:
+          case DXOp::WaveAnyTrue:
+          case DXOp::WaveAllTrue:
+          case DXOp::WaveActiveBallot:
+          case DXOp::WaveActiveAllEqual:
+          case DXOp::WaveActiveOp:
+          case DXOp::WaveActiveBit:
+          case DXOp::WaveMatch:
+          case DXOp::WaveMultiPrefixOp:
+          case DXOp::WaveMultiPrefixBitCount:
+          case DXOp::QuadReadLaneAt:
+          case DXOp::QuadOp:
+          case DXOp::QuadVote: return false;
+          default: break;
+        }
+      }
+    }    // no break here: control falls through to the default case, which only breaks
+    default: break;
+  }
+  return true;    // none of the checks above returned false
+}
+
 Debugger::DebugInfo::~DebugInfo()
 {
  for(const ScopedDebugData *scope : scopedDebugDatas)
@@ -10196,6 +10319,23 @@ void Debugger::StepThread(uint32_t lane, StepThreadMode stepMode)

    if(isActiveThread)
      curActiveSteps++;
+
+    if(stepMode == StepThreadMode::RUN_SINGLE_STEP)
+      break;
+
+    if(stepMode == StepThreadMode::QUEUE_SINGLE_STEP)
+      break;
+
+    simulateStep = thread.CanRunAnotherStep();
+    if(simulateStep)
+    {
+      DXIL_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
+    }
+    if(simulateStep)
+      thread.SetStepQueued();
+
+    if(stepMode == StepThreadMode::QUEUE_MULTIPLE_STEPS)
+      break;
  };
  // Update the number of simulation steps
  if(isActiveThread)
@@ -10227,10 +10367,10 @@ void Debugger::InternalStepThread(uint32_t lane)
      thread.ClearPendingDebugState();
    }
    thread.StepNext(true, m_Workgroup);
+    m_ActiveDebugState.nextInstruction = thread.GetActiveGlobalInstructionIdx();
    thread.FillCallstack(m_ActiveDebugState);

    const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
-    m_ActiveDebugState.nextInstruction = pendingDebugState.nextInstruction;
    m_ActiveDebugState.flags = pendingDebugState.flags;
    m_ActiveDebugState.changes.append(pendingDebugState.changes);
    thread.ClearPendingDebugState();
@@ -10257,6 +10397,6 @@ void Debugger::QueueJob(uint32_t lane)
  CHECK_DEBUGGER_THREAD();
  ThreadState &thread = m_Workgroup[lane];
  thread.SetStepQueued();
-  StepThread(lane, StepThreadMode::RUN_SINGLE_STEP);
+  StepThread(lane, StepThreadMode::RUN_MULTIPLE_STEPS);
 }
 };    // namespace DXILDebug
@@ -370,6 +370,7 @@ struct ThreadState

  bool Finished() const;
  bool IsSimulationStepActive() const { return (AtomicLoad(&atomic_isSimulationStepActive) == 1); }
+  bool CanRunAnotherStep() const;
  const ShaderVariable &GetInput() const { return m_Input; }
  const GlobalVariable &GetOutput() const { return m_Output; }
  bool IsDead() const { return m_Dead; }
@@ -403,7 +404,11 @@ struct ThreadState
    RDCASSERTEQUAL(m_ActiveMask.size(), activeMask.size());
    memcpy(m_ActiveMask.data(), activeMask.data(), activeMask.size() * sizeof(bool));
  }
-  void UpdateCurrentInstruction() { m_CurrentGlobalInstructionIdx = m_ActiveGlobalInstructionIdx; }
+  void UpdateCurrentInstruction()    // snapshot active PC and block into m_Current*
+  {
+    m_CurrentGlobalInstructionIdx = m_ActiveGlobalInstructionIdx;
+    m_CurrentBlock = m_Block;
+  }
  void SetSimulationStepCompleted() { AtomicStore(&atomic_isSimulationStepActive, 0); }
  void SetStepQueued() { AtomicStore(&atomic_isSimulationStepActive, 1); }

@@ -438,20 +443,27 @@ private:
  Id GetArgumentId(uint32_t i) const;
  ResourceReferenceInfo GetResource(Id handleId, bool &annotatedHandle);

+  // This must be a thread safe operation using only thread safe containers
+  bool GetShaderVariableFromLane(const ThreadState &lane, const DXIL::Value *dxilValue,
+                                 DXIL::Operation op, DXIL::DXOp dxOpCode, ShaderVariable &var) const
+  {    // reads from 'lane' with flushDenormInput, isLive and ignoreLiveCheck all true
+    return lane.GetShaderVariableHelper(dxilValue, op, dxOpCode, var, true, true, true);
+  }
  bool GetShaderVariable(const DXIL::Value *dxilValue, DXIL::Operation op, DXIL::DXOp dxOpCode,
                         ShaderVariable &var, bool flushDenormInput = true) const
  {    // live-variable lookup on this thread: isLive is true, ignoreLiveCheck is false
-    return GetShaderVariableHelper(dxilValue, op, dxOpCode, var, flushDenormInput, true);
+    return GetShaderVariableHelper(dxilValue, op, dxOpCode, var, flushDenormInput, true, false);
  }

  bool GetPhiShaderVariable(const DXIL::Value *dxilValue, DXIL::Operation op, DXIL::DXOp dxOpCode,
                            ShaderVariable &var, bool flushDenormInput = true) const
  {    // isLive is false, which selects the phi-variable lookup in the helper
-    return GetShaderVariableHelper(dxilValue, op, dxOpCode, var, flushDenormInput, false);
+    return GetShaderVariableHelper(dxilValue, op, dxOpCode, var, flushDenormInput, false, false);
  }

+  // This must be a thread safe operation using only thread safe containers
  bool GetLiveVariable(const Id &id, DXIL::Operation opCode, DXIL::DXOp dxOpCode,
-                       ShaderVariable &var) const;
+                       bool ignoreLiveCheck, ShaderVariable &var) const;
  bool GetPhiVariable(const Id &id, DXIL::Operation opCode, DXIL::DXOp dxOpCode,
                      ShaderVariable &var) const;
  bool GetVariableHelper(DXIL::Operation op, DXIL::DXOp dxOpCode, ShaderVariable &var) const;
@@ -479,8 +491,10 @@ private:
  static bool SubgroupIsDiverged(const rdcarray<ThreadState> &workgroup,
                                 const rdcarray<uint32_t> &activeLanes);

-  bool GetShaderVariableHelper(const DXIL::Value *dxilValue, DXIL::Operation op, DXIL::DXOp dxOpCode,
-                               ShaderVariable &var, bool flushDenormInput, bool isLive) const;
+  // When getting live variables: this must be a thread safe operation using only thread safe containers
+  bool GetShaderVariableHelper(const DXIL::Value *dxilValue, DXIL::Operation op,
+                               DXIL::DXOp dxOpCode, ShaderVariable &var, bool flushDenormInput,
+                               bool isLive, bool ignoreLiveCheck) const;
  bool IsVariableAssigned(const Id id) const;

  ShaderVariable GetBuiltin(ShaderBuiltin builtin) const;
@@ -542,8 +556,9 @@ private:
  uint32_t m_PreviousBlock = ~0U;
  // The global PC of the active instruction that will be executed on the next simulation step
  uint32_t m_ActiveGlobalInstructionIdx = 0;
-  // The global PC of the active instruction that was last executed
+  // The global PC and block of the instruction that was last executed
  uint32_t m_CurrentGlobalInstructionIdx = 0;
+  uint32_t m_CurrentBlock = ~0U;

  // true if executed an operation which could trigger divergence
  bool m_Diverged;