From c1013a1b822dd29e351f536be78de8652ba5f57e Mon Sep 17 00:00:00 2001
From: Jake Turner <jake@evansturner.co.uk>
Date: Wed, 19 Feb 2025 09:42:10 +0000
Subject: [PATCH] Change SPIRV debugger control flow to emulate maximal
 reconvergence

---
 .../driver/shaders/spirv/spirv_debug.cpp      |  22 +-
 renderdoc/driver/shaders/spirv/spirv_debug.h  |  12 +-
 .../shaders/spirv/spirv_debug_setup.cpp       | 218 +++++++++++-------
 3 files changed, 169 insertions(+), 83 deletions(-)
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.cpp b/renderdoc/driver/shaders/spirv/spirv_debug.cpp
index 4b704527b..a700aff06 100644
--- a/renderdoc/driver/shaders/spirv/spirv_debug.cpp
+++ b/renderdoc/driver/shaders/spirv/spirv_debug.cpp
@@ -173,6 +173,11 @@ ThreadState::~ThreadState()
   callstack.clear();
 }
 
+void ThreadState::SetConvergencePoint(Id block)
+{
+  convergenceInstruction = debugger.GetInstructionForLabel(block);    // index of block's label
+}
+
 bool ThreadState::Finished() const
 {
   return dead || callstack.empty();
@@ -618,7 +623,11 @@ void ThreadState::JumpToLabel(Id target)
   frame->lastBlock = frame->curBlock;
   frame->curBlock = target;
 
-  nextInstruction = debugger.GetInstructionForLabel(target) + 1;
+  diverged = true;
+
+  uint32_t labelInstruction = debugger.GetInstructionForLabel(target);
+  enteredPoints.push_back(labelInstruction);
+  nextInstruction = labelInstruction + 1;
 
   // if jumping to an empty unconditional loop header, continue to the loop block
   Iter it = debugger.GetIterForInstruction(nextInstruction);
@@ -627,6 +636,7 @@ void ThreadState::JumpToLabel(Id target)
     OpLoopMerge merge(it);
 
     mergeBlock = merge.mergeBlock;
+    SetConvergencePoint(merge.mergeBlock);
 
     it++;
     if(it.opcode() == Op::Branch)
@@ -699,6 +709,7 @@ void ThreadState::SkipIgnoredInstructions()
       OpSelectionMerge merge(it);
 
       mergeBlock = merge.mergeBlock;
+      SetConvergencePoint(merge.mergeBlock);
 
       nextInstruction++;
       continue;
@@ -709,6 +720,7 @@ void ThreadState::SkipIgnoredInstructions()
       OpLoopMerge merge(it);
 
       mergeBlock = merge.mergeBlock;
+      SetConvergencePoint(merge.mergeBlock);
 
       nextInstruction++;
       continue;
@@ -734,6 +746,10 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
 
   Iter it = debugger.GetIterForInstruction(nextInstruction);
   nextInstruction++;
+  diverged = false;
+  enteredPoints.clear();
+  convergenceInstruction = INVALID_EXECUTION_POINT;
+  functionReturnPoint = INVALID_EXECUTION_POINT;
 
   OpDecoder opdata(it);
 
@@ -3879,6 +3895,8 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
       // function. The second time we do have a return value so we process it and continue
       if(returnValue.name.empty())
       {
+        // The instruction after a function call is defined to be a convergence point
+        functionReturnPoint = nextInstruction;
         uint32_t returnInstruction = nextInstruction - 1;
         nextInstruction = debugger.GetInstructionForFunction(call.function);
 
@@ -3891,6 +3909,8 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
       {
         SetDst(call.result, returnValue);
         returnValue.name.clear();
+        // The instruction after a function call is defined to be a convergence point; mark that we entered it
+        enteredPoints.push_back(nextInstruction);
       }
       break;
     }
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.h b/renderdoc/driver/shaders/spirv/spirv_debug.h
index 8f16c95d3..5b12a8a44 100644
--- a/renderdoc/driver/shaders/spirv/spirv_debug.h
+++ b/renderdoc/driver/shaders/spirv/spirv_debug.h
@@ -27,6 +27,7 @@
 #include "api/replay/rdcarray.h"
 #include "maths/vec.h"
 #include "spirv_common.h"
+#include "spirv_controlflow.h"
 #include "spirv_processor.h"
 
 struct SPIRVInterfaceAccess;
@@ -225,12 +226,19 @@ struct ThreadState
 
   // the id of the merge block that the last branch targetted
   Id mergeBlock;
+  uint32_t convergenceInstruction;    // merge label instruction set while stepping, if any
+  uint32_t functionReturnPoint;    // instruction after a call entered while stepping, if any
   ShaderVariable returnValue;
   rdcarray<StackFrame *> callstack;
 
   // the list of IDs that are currently valid and live
   rdcarray<Id> live;
 
+  // true if the thread executed an operation which could trigger divergence
+  bool diverged;
+  // list of potential convergence points that were entered in a single step (used for tracking thread convergence)
+  rdcarray<uint32_t> enteredPoints;
+
   std::map<Id, uint32_t> lastWrite;
 
   // quad ID (arbitrary, just used to find neighbours for derivatives)
@@ -259,6 +267,7 @@ private:
   bool ReferencePointer(Id id);
 
   void SkipIgnoredInstructions();
+  void SetConvergencePoint(Id block);
 
   ShaderDebugState *m_State = NULL;
 };
@@ -498,7 +507,6 @@ private:
 
   std::set<rdcstr> usedNames;
   std::map<Id, rdcstr> dynamicNames;
-  void CalcActiveMask(rdcarray<bool> &activeMask);
 
   struct
   {
@@ -528,6 +536,8 @@ private:
     rdcarray<LocalMapping> activeLocalMappings;
   } m_DebugInfo;
 
+  rdcspv::ControlFlow controlFlow;
+
   const ScopeData *GetScope(size_t offset) const;
 };
 
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
index 901123432..134596449 100644
--- a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
+++ b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
@@ -1561,6 +1561,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
 
   std::sort(liveGlobals.begin(), liveGlobals.end());
 
+  rdcarray<rdcspv::ThreadIndex> threadIds;
   for(uint32_t i = 0; i < threadsInWorkgroup; i++)
   {
     ThreadState &lane = workgroup[i];
@@ -1589,8 +1590,14 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
     // now that the globals are allocated and their storage won't move, we can take pointers to them
     for(const PointerId &p : pointerIDs)
       p.Set(*this, global, lane);
+
+    // Only add active lanes to control flow
+    if(!lane.dead)
+      threadIds.push_back(i);
   }
 
+  controlFlow.Construct(threadIds);
+
   // find quad neighbours
   {
     rdcarray<uint32_t> processedQuads;
@@ -2462,6 +2469,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
   if(steps == 0)
   {
     ShaderDebugState initial;
+    uint32_t startBlock = INVALID_EXECUTION_POINT;
 
     // we should be sitting at the entry point function prologue, step forward into the first block
     // and past any function-local variable declarations
@@ -2474,6 +2482,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
         thread.EnterEntryPoint(&initial);
         FillCallstack(thread, initial);
         initial.nextInstruction = thread.nextInstruction;
+        startBlock = thread.callstack.back()->curBlock.value();
       }
       else
       {
@@ -2495,6 +2504,21 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
 
     ret.push_back(std::move(initial));
 
+    // Set the initial block for the threads in the root tangle
+    ThreadExecutionStates threadExecutionStates;
+    TangleGroup &tangles = controlFlow.GetTangles();
+    RDCASSERTEQUAL(tangles.size(), 1);
+    RDCASSERTNOTEQUAL(startBlock, INVALID_EXECUTION_POINT);
+    for(Tangle &tangle : tangles)
+    {
+      RDCASSERT(tangle.IsAliveActive());
+      for(uint32_t threadIdx = 0; threadIdx < workgroup.size(); ++threadIdx)
+      {
+        if(!workgroup[threadIdx].Finished())
+          threadExecutionStates[threadIdx].push_back(startBlock);
+      }
+    }
+    controlFlow.UpdateState(threadExecutionStates);
     steps++;
   }
 
@@ -2513,21 +2537,60 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
     if(active.Finished())
       break;
 
-    // calculate the current mask of which threads are active
-    CalcActiveMask(activeMask);
+    // Execute the threads in each active tangle
+    ThreadExecutionStates threadExecutionStates;
+    TangleGroup &tangles = controlFlow.GetTangles();
 
-    // step all active members of the workgroup
-    for(size_t lane = 0; lane < workgroup.size(); lane++)
+    bool anyActiveThreads = false;
+    for(Tangle &tangle : tangles)
     {
-      ThreadState &thread = workgroup[lane];
+      if(!tangle.IsAliveActive())
+        continue;
 
-      if(activeMask[lane])
+      rdcarray<rdcspv::ThreadReference> threadRefs = tangle.GetThreadRefs();
+      // calculate the current active thread mask from the threads in the tangle
       {
-        if(thread.nextInstruction >= instructionOffsets.size())
+        // one bool per workgroup thread
+        activeMask.resize(workgroup.size());
+
+        // start with all threads as inactive
+        for(size_t i = 0; i < workgroup.size(); i++)
+          activeMask[i] = false;
+
+        // activate the threads in the tangle
+        for(const rdcspv::ThreadReference &ref : threadRefs)
+        {
+          uint32_t idx = ref.id;
+          RDCASSERT(idx < workgroup.size(), idx, workgroup.size());
+          RDCASSERT(!workgroup[idx].Finished());
+          activeMask[idx] = true;
+          anyActiveThreads = true;
+        }
+      }
+
+      ExecutionPoint newConvergeInstruction = INVALID_EXECUTION_POINT;
+      ExecutionPoint newFunctionReturnPoint = INVALID_EXECUTION_POINT;
+      uint32_t countActiveThreads = 0;
+      uint32_t countDivergedThreads = 0;
+      uint32_t countConvergePointThreads = 0;
+      uint32_t countFunctionReturnThreads = 0;
+
+      // step all active members of the workgroup
+      for(size_t lane = 0; lane < workgroup.size(); lane++)
+      {
+        if(!activeMask[lane])
+          continue;
+        ++countActiveThreads;
+
+        ThreadState &thread = workgroup[lane];
+        const uint32_t currentPC = thread.nextInstruction;
+        const uint32_t threadId = lane;
+        if(currentPC >= instructionOffsets.size())
         {
           if(lane == activeLaneIndex)
             ret.emplace_back();
 
+          tangle.SetThreadDead(threadId);
           continue;
         }
 
@@ -2535,7 +2598,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
         {
           ShaderDebugState state;
 
-          size_t instOffs = instructionOffsets[thread.nextInstruction];
+          size_t instOffs = instructionOffsets[currentPC];
 
           // see if we're retiring any IDs at this state
           for(size_t l = 0; l < thread.live.size();)
@@ -2574,7 +2637,7 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
 
           if(m_DebugInfo.valid)
           {
-            size_t endOffs = instructionOffsets[thread.nextInstruction - 1];
+            size_t endOffs = instructionOffsets[currentPC - 1];
 
             // append any inlined functions to the top of the stack
             InlineData *inlined = m_DebugInfo.lineInline[endOffs];
@@ -2622,8 +2685,73 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
         {
           thread.StepNext(NULL, workgroup, activeMask);
         }
+        threadExecutionStates[threadId] = thread.enteredPoints;
+
+        uint32_t threadConvergeInstruction = thread.convergenceInstruction;
+        // the thread activated a new convergence point
+        if(threadConvergeInstruction != INVALID_EXECUTION_POINT)
+        {
+          if(newConvergeInstruction == INVALID_EXECUTION_POINT)
+          {
+            newConvergeInstruction = threadConvergeInstruction;
+            RDCASSERTNOTEQUAL(newConvergeInstruction, INVALID_EXECUTION_POINT);
+          }
+          else
+          {
+            // All the threads in the tangle should set the same convergence point
+            RDCASSERTEQUAL(threadConvergeInstruction, newConvergeInstruction);
+          }
+          ++countConvergePointThreads;
+        }
+        uint32_t threadFunctionReturnPoint = thread.functionReturnPoint;
+        // the thread activated a new function return point
+        if(threadFunctionReturnPoint != INVALID_EXECUTION_POINT)
+        {
+          if(newFunctionReturnPoint == INVALID_EXECUTION_POINT)
+          {
+            newFunctionReturnPoint = threadFunctionReturnPoint;
+            RDCASSERTNOTEQUAL(newFunctionReturnPoint, INVALID_EXECUTION_POINT);
+          }
+          else
+          {
+            // All the threads in the tangle should set the same function return point
+            RDCASSERTEQUAL(threadFunctionReturnPoint, newFunctionReturnPoint);
+          }
+          ++countFunctionReturnThreads;
+        }
+
+        if(thread.Finished())
+          tangle.SetThreadDead(threadId);
+
+        if(thread.diverged)
+          ++countDivergedThreads;
+      }
+      if(countConvergePointThreads)
+      {
+        // all the active threads should have a convergence point if any have one
+        RDCASSERTEQUAL(countConvergePointThreads, countActiveThreads);
+        tangle.AddMergePoint(newConvergeInstruction);
+      }
+      if(countFunctionReturnThreads)
+      {
+        // all the active threads should have a function return point if any have one
+        RDCASSERTEQUAL(countFunctionReturnThreads, countActiveThreads);
+        tangle.AddFunctionReturnPoint(newFunctionReturnPoint);
+      }
+      if(countDivergedThreads)
+      {
+        // all the active threads should have diverged if any diverges
+        RDCASSERTEQUAL(countDivergedThreads, countActiveThreads);
+        tangle.SetDiverged(true);
       }
     }
+    if(!anyActiveThreads)
+    {
+      active.dead = true;
+      controlFlow.UpdateState(threadExecutionStates);
+      RDCERR("No active threads in any tangle, killing active thread to terminate the debugger");
+    }
+    controlFlow.UpdateState(threadExecutionStates);
   }
 
   return ret;
@@ -3426,78 +3554,6 @@ rdcstr Debugger::GetHumanName(Id id)
   return name;
 }
 
-void Debugger::CalcActiveMask(rdcarray<bool> &activeMask)
-{
-  // one bool per workgroup thread
-  activeMask.resize(workgroup.size());
-
-  // mark any threads that have finished as inactive, otherwise they're active
-  for(size_t i = 0; i < workgroup.size(); i++)
-    activeMask[i] = !workgroup[i].Finished();
-
-  // otherwise we need to make sure that control flow which converges stays in lockstep so that
-  // derivatives etc are still valid. While diverged, we don't have to keep threads in lockstep
-  // since using derivatives is invalid.
-  //
-  // We take advantage of SPIR-V's structured control flow. We only ever diverge at a branch
-  // instruction, and the preceeding OpLoopMerge/OpSelectionMerge.
-  //
-  // So the scheme is as follows:
-  // * If we haven't diverged and all threads have the same nextInstruction, we're still uniform so
-  //   continue in lockstep.
-  // * As soon as they differ, we've diverged. Check the last mergeBlock that was specified - we
-  //   won't be uniform again until all threads reach that block.
-  // * Once we've diverged, any threads which are NOT in the merge block are active, and any threads
-  //   which are in it are inactive. This causes them to pause and wait for others to catch up
-  //   until the point where all threads are in the merge block at which point we've converged and
-  //   can go back to uniformity.
-
-  // if we're waiting on a converge block to be reached, we've diverged previously.
-  bool wasDiverged = convergeBlock != Id();
-
-  // see if we've diverged by starting procesing different next instructions
-  bool diverged = false;
-  for(size_t i = 1; !diverged && i < workgroup.size(); i++)
-    diverged |= (workgroup[0].nextInstruction != workgroup[i].nextInstruction);
-
-  if(!wasDiverged && diverged)
-  {
-    // if we've newly diverged, all workgroups should have the same merge block - the point where we
-    // become uniform again.
-    convergeBlock = workgroup[0].mergeBlock;
-    for(size_t i = 1; i < workgroup.size(); i++)
-      RDCASSERT(!activeMask[i] || convergeBlock == workgroup[i].mergeBlock);
-  }
-
-  if(wasDiverged || diverged)
-  {
-    // for every thread, turn it off if it's in the converge block
-    rdcarray<bool> inConverge;
-    inConverge.resize(activeMask.size());
-    for(size_t i = 0; i < workgroup.size(); i++)
-      inConverge[i] = (!workgroup[i].callstack.empty() &&
-                       workgroup[i].callstack.back()->curBlock == convergeBlock);
-
-    // is any thread active, but not converged?
-    bool anyActiveNotConverged = false;
-    for(size_t i = 0; i < workgroup.size(); i++)
-      anyActiveNotConverged |= activeMask[i] && !inConverge[i];
-
-    if(anyActiveNotConverged)
-    {
-      // if so, then only non-converged threads are active right now
-      for(size_t i = 0; i < workgroup.size(); i++)
-        activeMask[i] &= !inConverge[i];
-    }
-    else
-    {
-      // otherwise we can leave the active mask as is, forget the convergence point, and allow
-      // everything to run as normal
-      convergeBlock = Id();
-    }
-  }
-}
-
 void Debugger::AllocateVariable(Id id, Id typeId, ShaderVariable &outVar)
 {
   // allocs should always be pointers