Reworked DXIL debugger simulation in preparation for multithreading

Rearranged to isolate the simulation of a single lane into a helper function: InternalStepThread(uint32_t lane)
2026-05-28 04:41:07 +00:00 · 2025-10-04 11:43:50 +01:00
parent c4fa11b6e6
commit fe8e63f7ec
3 changed files with 356 additions and 138 deletions
@@ -1802,7 +1802,7 @@ void MemoryTracking::ConvertGlobalAllocToLocal(Id allocId)

 // Must be called from the replay manager thread (the debugger thread)
 ThreadState::ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId,
-                         uint32_t laneIndex)
+                         uint32_t laneIndex, uint32_t numThreads)
    : m_Debugger(debugger),
      m_GlobalState(globalState),
      m_Program(debugger.GetProgram()),
@@ -1814,6 +1814,7 @@ ThreadState::ThreadState(Debugger &debugger, const GlobalState &globalState, uin
  m_Assigned.resize(maxSSAId);
  m_Live.resize(maxSSAId);
  m_Variables.resize(maxSSAId);
+  m_ActiveMask.resize(numThreads);
 }

 // Must be called from the replay manager thread (the debugger thread)
@@ -1842,7 +1843,7 @@ void ThreadState::ProcessScopeChange(const rdcarray<bool> &oldLive, const rdcarr
 {
  THREADSTATE_CHECK_DEBUGGER_THREAD();
  // nothing to do if we aren't tracking into a state
-  if(!m_State)
+  if(!m_HasDebugState)
    return;

  // all oldLive (except globals) are going out of scope. all newLive (except globals) are coming
@@ -1855,7 +1856,7 @@ void ThreadState::ProcessScopeChange(const rdcarray<bool> &oldLive, const rdcarr
    if(liveGlobals[id])
      continue;

-    m_State->changes.push_back({m_Variables[id]});
+    m_PendingDebugState.changes.push_back({m_Variables[id]});
  }

  for(uint32_t id = 0; id < newLive.size(); id++)
@@ -1863,7 +1864,7 @@ void ThreadState::ProcessScopeChange(const rdcarray<bool> &oldLive, const rdcarr
    if(liveGlobals[id])
      continue;

-    m_State->changes.push_back({ShaderVariable(), m_Variables[id]});
+    m_PendingDebugState.changes.push_back({ShaderVariable(), m_Variables[id]});
  }
 }

@@ -1894,16 +1895,14 @@ void ThreadState::EnterFunction(const Function *function, const rdcarray<Value *
  m_ActiveGlobalInstructionIdx = m_FunctionInfo->globalInstructionOffset + m_FunctionInstructionIdx;
  m_Callstack.push_back(frame);

-  ShaderDebugState *state = m_State;
-  m_State = state;
  StepOverNopInstructions();
 }

 // Must be called from the replay manager thread (the debugger thread)
-void ThreadState::EnterEntryPoint(const Function *function, ShaderDebugState *state)
+void ThreadState::EnterEntryPoint(const Function *function, bool hasDebugState)
 {
  THREADSTATE_CHECK_DEBUGGER_THREAD();
-  m_State = state;
+  m_HasDebugState = hasDebugState;

  EnterFunction(function, {});

@@ -1933,13 +1932,13 @@ void ThreadState::EnterEntryPoint(const Function *function, ShaderDebugState *st
  }

  // active lane : needs it own local backing memory for GSM
-  if(m_State)
+  if(m_HasDebugState)
  {
    for(Id id : m_GlobalState.groupSharedMemoryIds)
      m_Memory.ConvertGlobalAllocToLocal(id);
  }

-  m_State = NULL;
+  m_HasDebugState = false;
 }

 // Must be called from the replay manager thread (the debugger thread)
@@ -2017,8 +2016,8 @@ bool ThreadState::JumpToBlock(const Block *target, bool divergencePoint)
  }

  uint32_t nextInstruction = m_FunctionInfo->globalInstructionOffset + m_FunctionInstructionIdx;
-  if(m_State && !m_Ended)
-    m_State->nextInstruction = nextInstruction;
+  if(m_HasDebugState && !m_Ended)
+    m_PendingDebugState.nextInstruction = nextInstruction;

  m_EnteredPoints.push_back(m_Block);
  RDCASSERTEQUAL(m_FunctionInfo->divergentBlocks.contains(m_PreviousBlock), divergencePoint);
@@ -2051,16 +2050,15 @@ bool ThreadState::JumpToBlock(const Block *target, bool divergencePoint)
  return true;
 }

-uint32_t ThreadState::GetSubgroupActiveLanes(const rdcarray<bool> &activeMask,
-                                             const rdcarray<ThreadState> &workgroup,
+uint32_t ThreadState::GetSubgroupActiveLanes(const rdcarray<ThreadState> &workgroup,
                                             rdcarray<uint32_t> &activeLanes) const
 {
  const uint32_t firstLaneInSub = m_WorkgroupIndex - m_SubgroupIdx;
  for(uint32_t lane = firstLaneInSub; lane < firstLaneInSub + m_GlobalState.subgroupSize; lane++)
  {
-    RDCASSERT(lane < activeMask.size(), lane, activeMask.size());
+    RDCASSERT(lane < m_ActiveMask.size(), lane, m_ActiveMask.size());
    // wave operations exclude helpers
-    if(activeMask[lane])
+    if(m_ActiveMask[lane])
    {
      RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
      if(!m_GlobalState.waveOpsIncludeHelpers && workgroup[lane].m_Helper)
@@ -2071,8 +2069,7 @@ uint32_t ThreadState::GetSubgroupActiveLanes(const rdcarray<bool> &activeMask,
  return firstLaneInSub;
 }

-bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
-                                     const rdcarray<bool> &activeMask)
+bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
 {
  m_CurrentInstruction = m_FunctionInfo->function->instructions[m_FunctionInstructionIdx];
  const Instruction &inst = *m_CurrentInstruction;
@@ -2148,7 +2145,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
            RDCASSERT(GetShaderVariable(inst.args[4], opCode, dxOpCode, arg));

            // Only the active lane stores outputs
-            if(m_State)
+            if(m_HasDebugState)
            {
              ShaderVariable &var = m_Output.var.members[outputIdx];
              if(var.rows == 0)
@@ -3461,7 +3458,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
            if(cond.value.u32v[0] != 0)
            {
              // Active lane is demoted to helper invocation which for pixel debug terminates the debug
-              if(m_State)
+              if(m_HasDebugState)
              {
                m_Dead = true;
                return true;
@@ -4113,7 +4110,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
          {
            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));
            result.value.u32v[0] = (m_WorkgroupIndex == activeLanes[0]) ? 1 : 0;
            break;
@@ -4143,7 +4140,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
            // WaveReadLaneFirst(value)
            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));
            uint32_t lane = activeLanes[0];
            if(lane < workgroup.size())
@@ -4183,7 +4180,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));
            for(uint32_t lane : activeLanes)
            {
@@ -4256,7 +4253,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            uint32_t maxLane = (dxOpCode == DXOp::WavePrefixBitCount) ? m_WorkgroupIndex : UINT32_MAX;
@@ -4329,8 +4326,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            const uint32_t firstLaneInSub =
-                GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            const uint32_t firstLaneInSub = GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            for(uint32_t lane : activeLanes)
@@ -4412,7 +4408,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            for(uint32_t lane : activeLanes)
@@ -4543,7 +4539,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            for(uint32_t lane : activeLanes)
@@ -4608,8 +4604,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            const uint32_t firstLaneInSub =
-                GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            const uint32_t firstLaneInSub = GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            for(uint32_t lane : activeLanes)
@@ -4677,8 +4672,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            const uint32_t firstLaneInSub =
-                GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            const uint32_t firstLaneInSub = GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            uint32_t maxLane = m_WorkgroupIndex;
@@ -4790,8 +4784,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

            // determine active lane indices in our subgroup
            rdcarray<uint32_t> activeLanes;
-            const uint32_t firstLaneInSub =
-                GetSubgroupActiveLanes(activeMask, workgroup, activeLanes);
+            const uint32_t firstLaneInSub = GetSubgroupActiveLanes(workgroup, activeLanes);
            RDCASSERT(!SubgroupIsDiverged(workgroup, activeLanes));

            uint32_t maxLane = m_WorkgroupIndex;
@@ -5464,7 +5457,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

      UpdateBackingMemoryFromVariable(memory, allocSize, val);

-      bool recordBaseMemoryChange = m_State && baseMemoryId != ptrId;
+      bool recordBaseMemoryChange = m_HasDebugState && baseMemoryId != ptrId;
      ShaderVariableChange change;
      RDCASSERT(IsVariableAssigned(baseMemoryId));
      if(recordBaseMemoryChange)
@@ -5480,7 +5473,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
      UpdateMemoryVariableFromBackingMemory(baseMemoryId, allocation.backingMemory);

      // active lane : writes to a GSM variable, write to local and global backing memory
-      if(m_State)
+      if(m_HasDebugState)
        UpdateGlobalBackingMemory(ptrId, ptr, allocation, val);

      // record the change to the base memory variable if it is not the ptrId variable
@@ -5492,7 +5485,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
          m_Assigned[baseMemoryId] = true;
        }
        change.after = m_Variables[baseMemoryId];
-        m_State->changes.push_back(change);
+        m_PendingDebugState.changes.push_back(change);
      }

      // Update the ptr variable value and manually record the change to the ptr variable
@@ -5504,11 +5497,11 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
      m_Variables[ptrId] = newValue;
      m_Assigned[ptrId] = true;

-      if(m_State)
+      if(m_HasDebugState)
      {
        change.before = originalValue;
        change.after = newValue;
-        m_State->changes.push_back(change);
+        m_PendingDebugState.changes.push_back(change);
      }

      result.name.clear();
@@ -6600,7 +6593,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
      // Save the result back to the backing memory of the pointer
      UpdateBackingMemoryFromVariable(memory, allocSize, res);

-      bool recordBaseMemoryChange = m_State && baseMemoryId != resultId;
+      bool recordBaseMemoryChange = m_HasDebugState && baseMemoryId != resultId;
      ShaderVariableChange change;
      if(recordBaseMemoryChange)
        change.before = m_Variables[baseMemoryId];
@@ -6608,7 +6601,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
      UpdateMemoryVariableFromBackingMemory(baseMemoryId, allocMemoryBackingPtr);

      // active lane : writes to a GSM variable, write to local and global backing memory
-      if(m_State)
+      if(m_HasDebugState)
        UpdateGlobalBackingMemory(ptrId, ptr, allocation, res);

      // record the change to the base memory variable
@@ -6620,11 +6613,11 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
          m_Assigned[baseMemoryId] = true;
        }
        change.after = m_Variables[baseMemoryId];
-        m_State->changes.push_back(change);
+        m_PendingDebugState.changes.push_back(change);
      }

      // record the change to the ptr variable value
-      bool recordPtrMemoryChange = m_State && ptrId != resultId;
+      bool recordPtrMemoryChange = m_HasDebugState && ptrId != resultId;
      RDCASSERT(IsVariableAssigned(ptrId));
      if(recordPtrMemoryChange)
        change.before = m_Variables[ptrId];
@@ -6634,7 +6627,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,
      if(recordPtrMemoryChange)
      {
        change.after = m_Variables[ptrId];
-        m_State->changes.push_back(change);
+        m_PendingDebugState.changes.push_back(change);
      }

      RDCASSERTNOTEQUAL(resultId, ptrId);
@@ -6673,7 +6666,7 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup,

  if(!result.name.empty() && resultId != DXILDebug::INVALID_ID)
  {
-    if(m_State)
+    if(m_HasDebugState)
      SetResult(resultId, result, opCode, dxOpCode, eventFlags);

    // Fake Output results won't be in the referencedIds
@@ -6711,8 +6704,8 @@ void ThreadState::StepOverNopInstructions()

 void ThreadState::RetireLiveIDs()
 {
-  m_State->flags = ShaderEvents::NoEvent;
-  m_State->changes.clear();
+  m_PendingDebugState.flags = ShaderEvents::NoEvent;
+  m_PendingDebugState.changes.clear();

  // Remove variables which have gone out of scope
  ExecPointReference current(m_Block, m_FunctionInstructionIdx);
@@ -6737,15 +6730,14 @@ void ThreadState::RetireLiveIDs()

      ShaderVariableChange change;
      change.before = m_Variables[id];
-      m_State->changes.push_back(change);
+      m_PendingDebugState.changes.push_back(change);
    }
  }
 }

-void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState> &workgroup,
-                           const rdcarray<bool> &activeMask)
+void ThreadState::StepNext(bool hasDebugState, const rdcarray<ThreadState> &workgroup)
 {
-  m_State = state;
+  m_HasDebugState = hasDebugState;
  m_Diverged = false;
  m_EnteredPoints.clear();
  m_ConvergencePoint = INVALID_EXECUTION_POINT;
@@ -6753,15 +6745,15 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>

  RDCASSERTEQUAL(m_ActiveGlobalInstructionIdx,
                 m_FunctionInfo->globalInstructionOffset + m_FunctionInstructionIdx);
-  if(m_State)
+  if(m_HasDebugState)
  {
-    m_State->flags = ShaderEvents::NoEvent;
-    m_State->changes.clear();
-    RetireLiveIDs();
+    m_PendingDebugState.flags = ShaderEvents::NoEvent;
+    m_PendingDebugState.changes.clear();
  }
-  ExecuteInstruction(workgroup, activeMask);
+  ExecuteInstruction(workgroup);
+  StepOverNopInstructions();

-  m_State = NULL;
+  m_HasDebugState = false;
 }

 bool ThreadState::GetShaderVariableHelper(const DXIL::Value *dxilValue, DXIL::Operation op,
@@ -6963,19 +6955,19 @@ void ThreadState::SetResult(const Id &id, ShaderVariable &result, Operation op,

  flags |= AssignValue(result, result, flushDenorm);

-  if(m_State)
+  if(m_HasDebugState)
  {
    ShaderVariableChange change;
-    m_State->flags |= flags;
+    m_PendingDebugState.flags |= flags;
    change.before = m_Variables[id];
    change.after = result;
-    m_State->changes.push_back(change);
+    m_PendingDebugState.changes.push_back(change);
  }
 }

 void ThreadState::MarkResourceAccess(const ShaderVariable &var)
 {
-  if(m_State == NULL)
+  if(!m_HasDebugState)
    return;

  if(var.type != VarType::ReadOnlyResource && var.type != VarType::ReadWriteResource)
@@ -6985,7 +6977,7 @@ void ThreadState::MarkResourceAccess(const ShaderVariable &var)
  change.before = var;
  change.after = var;

-  m_State->changes.push_back(change);
+  m_PendingDebugState.changes.push_back(change);
 }

 void ThreadState::UpdateBackingMemoryFromVariable(void *ptr, uint64_t &allocSize,
@@ -7537,7 +7529,7 @@ ShaderValue ThreadState::DDY(bool fine, Operation opCode, DXOp dxOpCode,
 void ThreadState::ExecuteMemoryBarrier()
 {
  // ignore if not the active thread
-  if(!m_State)
+  if(!m_HasDebugState)
    return;

  // copy the global GSM memory into the local GSM cache
@@ -7573,7 +7565,7 @@ void ThreadState::ExecuteMemoryBarrier()
    UpdateMemoryVariableFromBackingMemory(id, globalBackingMemory);
    change.after = local;
    if(!(change.after == change.before))
-      m_State->changes.push_back(change);
+      m_PendingDebugState.changes.push_back(change);

    // Update local backing memory from the local variable
    RDCASSERTEQUAL(globalMem->second.size, localMem->second.size);
@@ -7603,14 +7595,14 @@ bool ThreadState::WorkgroupIsDiverged(const rdcarray<ThreadState> &workgroup)
    if(block0 == ~0U)
    {
      block0 = workgroup[i].m_Block;
-      instr0 = workgroup[i].m_ActiveGlobalInstructionIdx;
+      instr0 = workgroup[i].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
    if(workgroup[i].m_Block != block0)
      return true;
    // not executing the same instruction
-    if(workgroup[i].m_ActiveGlobalInstructionIdx != instr0)
+    if(workgroup[i].m_CurrentGlobalInstructionIdx != instr0)
      return true;
  }
  return false;
@@ -7628,14 +7620,14 @@ bool ThreadState::SubgroupIsDiverged(const rdcarray<ThreadState> &workgroup,
    if(block0 == ~0U)
    {
      block0 = workgroup[lane].m_Block;
-      instr0 = workgroup[lane].m_ActiveGlobalInstructionIdx;
+      instr0 = workgroup[lane].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
    if(workgroup[lane].m_Block != block0)
      return true;
    // not executing the same instruction
-    if(workgroup[lane].m_ActiveGlobalInstructionIdx != instr0)
+    if(workgroup[lane].m_CurrentGlobalInstructionIdx != instr0)
      return true;
  }
  return false;
@@ -7660,14 +7652,14 @@ bool ThreadState::QuadIsDiverged(const rdcarray<ThreadState> &workgroup,
    if(block0 == ~0U)
    {
      block0 = workgroup[i].m_Block;
-      instr0 = workgroup[i].m_ActiveGlobalInstructionIdx;
+      instr0 = workgroup[i].m_CurrentGlobalInstructionIdx;
      continue;
    }
    // not in the same basic block
    if(workgroup[i].m_Block != block0)
      return true;
    // not executing the same instruction
-    if(workgroup[i].m_ActiveGlobalInstructionIdx != instr0)
+    if(workgroup[i].m_CurrentGlobalInstructionIdx != instr0)
      return true;
  }
  return false;
@@ -8978,7 +8970,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve
  m_GlobalState.constantBlocksDatas = apiWrapper->GetConstantBlocksDatas();

  for(uint32_t i = 0; i < threadsInWorkgroup; i++)
-    m_Workgroup.push_back(ThreadState(*this, m_GlobalState, maxSSAId, i));
+    m_Workgroup.push_back(ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup));

  // Get the thread state from the API wrapper
  const rdcarray<rdcflatmap<ShaderBuiltin, ShaderVariable>> &threadsBuiltins =
@@ -9824,6 +9816,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
  ThreadState &active = GetActiveLane();

  rdcarray<ShaderDebugState> ret;
+  m_ShaderChangesReturn = NULL;

  // initialise the first ShaderDebugState if we haven't stepped yet
  if(m_Steps == 0)
@@ -9837,14 +9830,17 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()

      if(lane == m_ActiveLaneIndex)
      {
-        thread.EnterEntryPoint(m_EntryPointFunction, &initial);
+        thread.EnterEntryPoint(m_EntryPointFunction, true);
        thread.FillCallstack(initial);
        initial.nextInstruction = thread.GetActiveGlobalInstructionIdx();
+        const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
+        initial.flags = pendingDebugState.flags;
+        initial.changes.append(pendingDebugState.changes);
        startPoint = initial.nextInstruction;
      }
      else
      {
-        thread.EnterEntryPoint(m_EntryPointFunction, NULL);
+        thread.EnterEntryPoint(m_EntryPointFunction, false);
      }
    }

@@ -9880,12 +9876,14 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
  if(active.Finished())
    return ret;

-  rdcarray<bool> activeMask;
+  bool allStepsCompleted = true;
+  m_ShaderChangesReturn = &ret;

  // continue stepping until we have 1000000 target steps completed in a chunk.
  for(int stepEnd = m_Steps + 1000000; m_Steps < stepEnd;)
  {
-    if(active.Finished())
+    allStepsCompleted = true;
+    if(active.Finished() && !active.IsSimulationStepActive())
      break;

    // Execute the threads in each active tangle
@@ -9893,31 +9891,97 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
    TangleGroup &tangles = m_ControlFlow.GetTangles();

    bool anyActiveThreads = false;
+    bool hasDebugState = false;
+
+    for(const Tangle &tangle : tangles)
+    {
+      if(!tangle.IsAliveActive())
+        continue;
+
+      rdcarray<bool> activeMask;
+      // one bool per workgroup thread
+      activeMask.resize(m_Workgroup.size());
+
+      // calculate the current active thread mask from the threads in the tangle
+      for(size_t i = 0; i < m_Workgroup.size(); i++)
+        activeMask[i] = false;
+
+      const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
+      for(const ThreadReference &ref : threadRefs)
+      {
+        uint32_t lane = ref.id;
+        RDCASSERT(lane < m_Workgroup.size(), lane, m_Workgroup.size());
+        ThreadState &thread = m_Workgroup[lane];
+        RDCASSERT(!thread.Finished());
+        activeMask[lane] = true;
+        anyActiveThreads = true;
+      }
+
+      // step all threads in the tangle
+      for(const ThreadReference &ref : threadRefs)
+      {
+        const uint32_t threadId = ref.id;
+        const uint32_t lane = threadId;
+
+        ThreadState &thread = m_Workgroup[lane];
+        if(thread.Finished())
+        {
+          if(lane == m_ActiveLaneIndex)
+            ret.emplace_back();
+          continue;
+        }
+        if(lane == m_ActiveLaneIndex)
+          hasDebugState = true;
+
+        thread.SetActiveMask(activeMask);
+        QueueJob(lane);
+      }
+    }
+
+    do
+    {
+      allStepsCompleted = true;
+      for(const Tangle &tangle : tangles)
+      {
+        if(!tangle.IsAliveActive())
+          continue;
+
+        bool tangleStepsCompleted = true;
+        const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
+        for(const ThreadReference &ref : threadRefs)
+        {
+          const uint32_t threadId = ref.id;
+          const uint32_t lane = threadId;
+          ThreadState &thread = m_Workgroup[lane];
+          if(thread.IsSimulationStepActive())
+          {
+            tangleStepsCompleted = false;
+            break;
+          }
+        }
+        if(!tangleStepsCompleted)
+        {
+          allStepsCompleted = false;
+          break;
+        }
+      }
+    } while(!allStepsCompleted);
+
    for(Tangle &tangle : tangles)
    {
      if(!tangle.IsAliveActive())
        continue;

      const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
-      // calculate the current active thread mask from the threads in the tangle
+#if !defined(RELEASE)
+      for(const ThreadReference &ref : threadRefs)
      {
-        // one bool per workgroup thread
-        activeMask.resize(m_Workgroup.size());
-
-        // start with all threads as inactive
-        for(size_t i = 0; i < m_Workgroup.size(); i++)
-          activeMask[i] = false;
-
-        // activate the threads in the tangle
-        for(const ThreadReference &ref : threadRefs)
-        {
-          uint32_t idx = ref.id;
-          RDCASSERT(idx < m_Workgroup.size(), idx, m_Workgroup.size());
-          RDCASSERT(!m_Workgroup[idx].Finished());
-          activeMask[idx] = true;
-          anyActiveThreads = true;
-        }
+        const uint32_t threadId = ref.id;
+        const uint32_t lane = threadId;
+        ThreadState &thread = m_Workgroup[lane];
+        RDCASSERT(!thread.IsSimulationStepActive());
      }
+#endif    // #if !defined(RELEASE)

      const DXIL::BlockArray *newPartialConvergentPoints = NULL;
      ExecutionPoint newConvergencePoint = INVALID_EXECUTION_POINT;
@@ -9926,38 +9990,20 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
      uint32_t countConvergePointThreads = 0;
      uint32_t countPartialConvergePointThreads = 0;

-      // step all active members of the workgroup
-      ShaderDebugState state;
-      bool hasDebugState = false;
-      for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
+      // Update the control flow state
+      for(const ThreadReference &ref : threadRefs)
      {
-        if(!activeMask[lane])
-          continue;
+        const uint32_t threadId = ref.id;
+        const uint32_t lane = threadId;
+        ThreadState &thread = m_Workgroup[lane];
        ++countActiveThreads;

-        ThreadState &thread = m_Workgroup[lane];
-        const uint32_t threadId = (uint32_t)lane;
        if(thread.Finished())
        {
-          if(lane == m_ActiveLaneIndex)
-            ret.emplace_back();
-
          tangle.SetThreadDead(threadId);
          continue;
        }

-        if(lane == m_ActiveLaneIndex)
-        {
-          hasDebugState = true;
-          state.stepIndex = m_Steps;
-          thread.StepNext(&state, m_Workgroup, activeMask);
-          m_Steps++;
-        }
-        else
-        {
-          thread.StepNext(NULL, m_Workgroup, activeMask);
-        }
-
        threadExecutionStates[threadId] = thread.GetEnteredPoints();

        const uint32_t threadConvergencePoint = thread.GetConvergencePoint();
@@ -9994,25 +10040,17 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
          ++countPartialConvergePointThreads;
        }

-        if(thread.Finished())
-          tangle.SetThreadDead(threadId);
-
        if(thread.GetDiverged())
          ++countDivergedThreads;
      }
-      for(size_t lane = 0; lane < m_Workgroup.size(); lane++)
+
+      for(const ThreadReference &ref : threadRefs)
      {
-        if(activeMask[lane])
-          m_Workgroup[lane].StepOverNopInstructions();
-      }
-      // Update UI state after the execute and step over nops to make sure state.nextInstruction is in sync
-      if(hasDebugState)
-      {
-        ThreadState &thread = m_Workgroup[m_ActiveLaneIndex];
-        state.nextInstruction = thread.GetActiveGlobalInstructionIdx();
-        thread.FillCallstack(state);
-        ret.push_back(std::move(state));
+        const uint32_t threadId = ref.id;
+        const uint32_t lane = threadId;
+        m_Workgroup[lane].UpdateCurrentInstruction();
      }
+
      if(countConvergePointThreads)
      {
        // all the active threads should have a convergence point if any have one
@@ -10050,6 +10088,9 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
    }
    m_ControlFlow.UpdateState(threadExecutionStates);
  }
+
+  RDCASSERT(allStepsCompleted);
+  m_ShaderChangesReturn = NULL;
  return ret;
 }

@@ -10132,4 +10173,90 @@ ShaderDirectAccess Debugger::GetShaderDirectAccess(DescriptorType type,
  CHECK_DEBUGGER_THREAD();
  return m_ApiWrapper->GetShaderDirectAccess(type, slot);
 }
+
+// Called from any thread
+//
+// Runs the simulation for a single lane and then marks that lane's simulation step as
+// completed. When the lane is the debugged (active) lane the global step counter m_Steps is
+// advanced and the step index is written into m_ActiveDebugState before the step executes.
+void Debugger::StepThread(uint32_t lane, StepThreadMode stepMode)
+{
+  ThreadState &laneState = m_Workgroup[lane];
+  const bool debuggedLane = (lane == m_ActiveLaneIndex);
+
+  // a simulation step must already be flagged as in flight for this lane
+  DXIL_DEBUG_RDCASSERT(laneState.IsSimulationStepActive());
+
+  // only the debugged lane contributes to the step counter
+  int stepCounter = 0;
+  if(debuggedLane)
+    stepCounter = m_Steps;
+
+  // NOTE(review): nothing sets wantAnotherStep back to true, so this body executes exactly
+  // once and the requeue path at the bottom is never taken in this revision
+  bool wantAnotherStep;
+  do
+  {
+    wantAnotherStep = false;
+
+    laneState.ClearPendingDebugState();
+    if(debuggedLane)
+      m_ActiveDebugState.stepIndex = stepCounter;
+    InternalStepThread(lane);
+    laneState.ClearPendingDebugState();
+
+    if(debuggedLane)
+      ++stepCounter;
+  } while(wantAnotherStep);
+
+  // Update the number of simulation steps
+  if(debuggedLane)
+    m_Steps = stepCounter;
+
+  DXIL_DEBUG_RDCASSERT(laneState.IsSimulationStepActive());
+
+  if(!wantAnotherStep)
+  {
+    laneState.SetSimulationStepCompleted();
+    return;
+  }
+
+  // more work was requested: hand the lane back to the job queue
+  DXIL_DEBUG_RDCASSERTEQUAL(stepMode, StepThreadMode::QUEUE_MULTIPLE_STEPS);
+  QueueJob(lane);
+}
+
+// Called from any thread
+//
+// Simulates one step of a single lane.
+//
+// For every lane other than the debugged (active) lane this only advances execution via
+// StepNext(false, ...). For the debugged lane it additionally builds the ShaderDebugState for
+// the step in m_ActiveDebugState and appends it to *m_ShaderChangesReturn, which
+// ContinueDebug() points at its return array before it starts queueing steps.
+void Debugger::InternalStepThread(uint32_t lane)
+{
+  ThreadState &thread = m_Workgroup[lane];
+  if(lane == m_ActiveLaneIndex)
+  {
+    if(m_RetireIDs)
+    {
+      // StepNext() clears the pending changes before executing, so the changes produced by
+      // retiring IDs have to be collected here first
+      thread.RetireLiveIDs();
+      m_RetireIDs = false;
+      const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
+      m_ActiveDebugState.changes.append(pendingDebugState.changes);
+      thread.ClearPendingDebugState();
+    }
+    thread.StepNext(true, m_Workgroup);
+    thread.FillCallstack(m_ActiveDebugState);
+
+    // fold the results of the executed instruction into the state for this step
+    const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
+    m_ActiveDebugState.nextInstruction = pendingDebugState.nextInstruction;
+    m_ActiveDebugState.flags = pendingDebugState.flags;
+    m_ActiveDebugState.changes.append(pendingDebugState.changes);
+    thread.ClearPendingDebugState();
+
+    // hand the finished state over instead of deep-copying it: every field is reset
+    // immediately below, so the contents are not needed here afterwards
+    m_ShaderChangesReturn->push_back(std::move(m_ActiveDebugState));
+    {
+      // explicit reset so the members hold known values whatever the move left behind
+      m_ActiveDebugState.callstack.clear();
+      m_ActiveDebugState.changes.clear();
+      m_ActiveDebugState.flags = ShaderEvents::NoEvent;
+      m_ActiveDebugState.stepIndex = 0;
+      m_ActiveDebugState.nextInstruction = 0;
+      // the next step of the debugged lane starts by retiring IDs
+      m_RetireIDs = true;
+    }
+  }
+  else
+  {
+    thread.StepNext(false, m_Workgroup);
+  }
+}
+
+// Must be called from the replay manager thread (the debugger thread)
+//
+// Flags a simulation step as queued for `lane` and then runs that step. In this revision the
+// step is executed synchronously by calling StepThread() directly.
+void Debugger::QueueJob(uint32_t lane)
+{
+  CHECK_DEBUGGER_THREAD();
+
+  // presumably this is what makes IsSimulationStepActive() report true, which StepThread()
+  // asserts on entry - TODO confirm against ThreadState::SetStepQueued
+  m_Workgroup[lane].SetStepQueued();
+
+  StepThread(lane, StepThreadMode::RUN_SINGLE_STEP);
+}
 };    // namespace DXILDebug
@@ -34,6 +34,22 @@
 #include "dxil_controlflow.h"
 #include "dxil_debuginfo.h"

+// Assertion helpers for the DXIL debugger simulation.
+//
+// Non-RELEASE builds forward to the regular RDCASSERT macros. RELEASE builds perform no check
+// but still evaluate their arguments, so side effects and "variable is used" status are the
+// same in both configurations.
+//
+// DXIL_DEBUG_RDCASSERTEQUAL takes exactly two arguments in both configurations so that a
+// wrong argument count fails to compile everywhere, and each operand is discarded on its own
+// to avoid comma-operator "unused value" warnings.
+#if defined(RELEASE)
+#define DXIL_DEBUG_RDCASSERT(...) \
+  do                              \
+  {                               \
+    (void)(__VA_ARGS__);          \
+  } while((void)0, 0)
+#define DXIL_DEBUG_RDCASSERTEQUAL(a, b) \
+  do                                    \
+  {                                     \
+    (void)(a);                          \
+    (void)(b);                          \
+  } while((void)0, 0)
+#else
+#define DXIL_DEBUG_RDCASSERT(...) RDCASSERTMSG("", __VA_ARGS__)
+#define DXIL_DEBUG_RDCASSERTEQUAL(a, b) RDCASSERTEQUAL(a, b)
+#endif
+
 namespace DXILDebug
 {
 using namespace DXDebug;
@@ -51,6 +67,25 @@ struct GlobalState;
 // D3D12 descriptors are equal sized and treated as effectively one byte in size
 const uint32_t D3D12_DESCRIPTOR_BYTESIZE = 1;

+inline void AtomicStore(int32_t *var, int32_t newVal)    // emulates an atomic store of newVal to *var
+{
+  // CmpExch32(var, 0, 0) can never change *var, so it gives the first guess without a plain read
+  int32_t expected = Atomic::CmpExch32(var, 0, 0);
+  // a failed exchange returns the value it found: retry with that until the swap succeeds
+  for(int32_t seen = expected; (seen = Atomic::CmpExch32(var, expected, newVal)) != expected;)
+    expected = seen;
+}
+
+// Reads *var through a compare-exchange of 0 with 0, which can never change the stored value.
+inline int32_t AtomicLoad(const int32_t *var)
+{
+  return Atomic::CmpExch32(const_cast<int32_t *>(var), 0, 0);
+}
+inline int32_t AtomicLoad(int32_t *var)
+{
+  return AtomicLoad(static_cast<const int32_t *>(var));
+}
+
 struct ExecPointReference
 {
  ExecPointReference() : block(~0U), instruction(~0U) {}
@@ -324,17 +359,17 @@ struct MemoryTracking
 struct ThreadState
 {
  ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId,
-              uint32_t laneIndex);
+              uint32_t laneIndex, uint32_t numThreads);
  ~ThreadState();

-  void EnterEntryPoint(const DXIL::Function *function, ShaderDebugState *state);
-  void StepNext(ShaderDebugState *state, const rdcarray<ThreadState> &workgroup,
-                const rdcarray<bool> &activeMask);
+  void EnterEntryPoint(const DXIL::Function *function, bool hasDebugState);
+  void StepNext(bool hasDebugState, const rdcarray<ThreadState> &workgroup);
  void StepOverNopInstructions();
  void FillCallstack(ShaderDebugState &state);
  void RetireLiveIDs();

  bool Finished() const;
+  bool IsSimulationStepActive() const { return (AtomicLoad(&atomic_isSimulationStepActive) == 1); }    // true only while the flag holds 1
  const ShaderVariable &GetInput() const { return m_Input; }
  const GlobalVariable &GetOutput() const { return m_Output; }
  bool IsDead() const { return m_Dead; }
@@ -348,6 +383,7 @@ struct ThreadState
  {
    return &m_PartialConvergencePoints;
  }
+  const ShaderDebugState &GetPendingDebugState() const { return m_PendingDebugState; }    // read-only; reset by ClearPendingDebugState()

  void SetBuiltins(const BuiltinInputs &builtins) { m_Builtins = builtins; }
  void SetInput(const ShaderVariable &input) { m_Input = input; }
@@ -362,6 +398,14 @@ struct ThreadState
  void SetQuadId(uint32_t quadId) { m_QuadId = quadId; }
  void SetSubgroupIdx(uint32_t subgroupIdx) { m_SubgroupIdx = subgroupIdx; }
  void SetQuadNeighbours(uint32_t lane, uint32_t index) { m_QuadNeighbours[lane] = index; }
+  void SetActiveMask(const rdcarray<bool> &activeMask)    // overwrites this thread's own copy of the mask
+  {
+    RDCASSERTEQUAL(m_ActiveMask.size(), activeMask.size());    // NOTE(review): only size guard; copy length below comes from activeMask
+    memcpy(m_ActiveMask.data(), activeMask.data(), activeMask.size() * sizeof(bool));
+  }
+  void UpdateCurrentInstruction() { m_CurrentGlobalInstructionIdx = m_ActiveGlobalInstructionIdx; }    // copies the next-step PC into the last-executed PC
+  void SetSimulationStepCompleted() { AtomicStore(&atomic_isSimulationStepActive, 0); }    // stores 0: IsSimulationStepActive() then returns false
+  void SetStepQueued() { AtomicStore(&atomic_isSimulationStepActive, 1); }    // stores 1: IsSimulationStepActive() then returns true

  void InitialiseFromActive(const ThreadState &active)
  {
@@ -373,13 +417,19 @@ struct ThreadState

  void UpdateBackingMemoryFromVariable(void *ptr, uint64_t &allocSize, const ShaderVariable &var);

+  void ClearPendingDebugState()    // resets only the three fields below; nothing else is touched
+  {
+    m_PendingDebugState.nextInstruction = 0;
+    m_PendingDebugState.flags = ShaderEvents::NoEvent;
+    m_PendingDebugState.changes.clear();
+  }
 private:
  void EnterFunction(const DXIL::Function *function, const rdcarray<DXIL::Value *> &args);

  bool InUniformBlock() const;

  bool JumpToBlock(const DXIL::Block *target, bool divergencePoint);
-  bool ExecuteInstruction(const rdcarray<ThreadState> &workgroup, const rdcarray<bool> &activeMask);
+  bool ExecuteInstruction(const rdcarray<ThreadState> &workgroup);

  void MarkResourceAccess(const ShaderVariable &var);
  void SetResult(const Id &id, ShaderVariable &result, DXIL::Operation op, DXIL::DXOp dxOpCode,
@@ -434,8 +484,7 @@ private:
  bool IsVariableAssigned(const Id id) const;

  ShaderVariable GetBuiltin(ShaderBuiltin builtin) const;
-  uint32_t GetSubgroupActiveLanes(const rdcarray<bool> &activeMask,
-                                  const rdcarray<ThreadState> &workgroup,
+  uint32_t GetSubgroupActiveLanes(const rdcarray<ThreadState> &workgroup,
                                  rdcarray<uint32_t> &activeLanes) const;

  struct AnnotationProperties
@@ -452,7 +501,7 @@ private:
  const GlobalState &m_GlobalState;

  rdcarray<StackFrame *> m_Callstack;
-  ShaderDebugState *m_State = NULL;
+  bool m_HasDebugState = false;

  ShaderVariable m_Input;
  GlobalVariable m_Output;
@@ -477,6 +526,10 @@ private:
  const FunctionInfo *m_FunctionInfo = NULL;
  DXBC::ShaderType m_ShaderType;

+  rdcarray<bool> m_ActiveMask;
+
+  ShaderDebugState m_PendingDebugState;
+
  // Track memory allocations
  // For stack allocations do not bother freeing when leaving functions
  MemoryTracking m_Memory;
@@ -487,8 +540,10 @@ private:
  // The current and previous function basic block index
  uint32_t m_Block = ~0U;
  uint32_t m_PreviousBlock = ~0U;
-  // The global PC of the active instruction that was or will be executed on the current simulation step
+  // The global PC of the active instruction that will be executed on the next simulation step
  uint32_t m_ActiveGlobalInstructionIdx = 0;
+  // The global PC of the active instruction that was last executed
+  uint32_t m_CurrentGlobalInstructionIdx = 0;

  // true if executed an operation which could trigger divergence
  bool m_Diverged;
@@ -513,6 +568,9 @@ private:
  bool m_Dead = false;
  bool m_Ended = false;
  bool m_Helper = false;
+
+  // Only read or write these through AtomicLoad()/AtomicStore(), never directly
+  int32_t atomic_isSimulationStepActive = 0;
 };

 struct GlobalState
@@ -628,6 +686,14 @@ struct TypeData
  bool colMajorMat = false;
 };

+enum class StepThreadMode    // stepMode argument of Debugger::StepThread; per-mode behaviour is defined there
+{
+  RUN_SINGLE_STEP,    // the mode Debugger::QueueJob passes
+  RUN_MULTIPLE_STEPS,
+  QUEUE_SINGLE_STEP,
+  QUEUE_MULTIPLE_STEPS
+};
+
 class Debugger : public DXBCContainerDebugger
 {
 public:
@@ -686,12 +752,20 @@ private:
  void AddLocalVariable(const DXIL::SourceMappingInfo &srcMapping, uint32_t instructionIndex);
  void ParseDebugData();

+  void QueueJob(uint32_t lane);
+  void StepThread(uint32_t lane, StepThreadMode stepMode);
+  void InternalStepThread(uint32_t lane);
+  void SimulationJobHelper();
+
  DebugAPIWrapper *m_ApiWrapper = NULL;

  rdcarray<ThreadState> m_Workgroup;
  std::map<const DXIL::Function *, FunctionInfo> m_FunctionInfos;
  rdcshaders::ControlFlow m_ControlFlow;

+  rdcarray<ShaderDebugState> *m_ShaderChangesReturn = NULL;
+  ShaderDebugState m_ActiveDebugState;
+
  // the live mutable global variables, to initialise a stack frame's live list
  rdcarray<bool> m_LiveGlobals;

@@ -713,6 +787,9 @@ private:
  const uint64_t m_DeviceThreadID;
  uint32_t m_ActiveLaneIndex = 0;
  int m_Steps = 0;
+  bool m_RetireIDs = true;
 };

 };    // namespace DXILDebug
+
+DECLARE_REFLECTION_ENUM(DXILDebug::StepThreadMode);
@@ -24,6 +24,7 @@

 #include "dxil_bytecode.h"
 #include "dxil_common.h"
+#include "dxil_debug.h"
 #include "dxil_debuginfo.h"

 template <>
@@ -889,3 +890,16 @@ rdcstr DoStringise(const DXIL::WaveMultiPrefixOpCode &el)
  }
  END_ENUM_STRINGISE();
 }
+
+template <>
+rdcstr DoStringise(const DXILDebug::StepThreadMode &el)    // lists every StepThreadMode enumerator
+{
+  BEGIN_ENUM_STRINGISE(DXILDebug::StepThreadMode)
+  {
+    STRINGISE_ENUM_CLASS(RUN_SINGLE_STEP)
+    STRINGISE_ENUM_CLASS(RUN_MULTIPLE_STEPS)
+    STRINGISE_ENUM_CLASS(QUEUE_SINGLE_STEP)
+    STRINGISE_ENUM_CLASS(QUEUE_MULTIPLE_STEPS)
+  }
+  END_ENUM_STRINGISE();
+}