Prepare SPIR-V debugger for larger workgroup sizes

* The workgroup size is passed in at creation time, and we handle the potential for multiple quads by identifying quads via quad ID
2026-05-04 17:10:47 +00:00 · 2025-01-31 14:20:11 +00:00
parent 1cfb684d16
commit 8d3f40b0a9
4 changed files with 159 additions and 65 deletions
@@ -77,13 +77,9 @@ inline uint64_t CountOnes(uint64_t value)

 namespace rdcspv
 {
-ThreadState::ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState)
+ThreadState::ThreadState(Debugger &debug, const GlobalState &globalState)
    : debugger(debug), global(globalState)
 {
-  workgroupIndex = workgroupIdx;
-  nextInstruction = 0;
-  helperInvocation = false;
-  killed = false;
 }

 ThreadState::~ThreadState()
@@ -95,7 +91,7 @@ ThreadState::~ThreadState()

 bool ThreadState::Finished() const
 {
-  return killed || callstack.empty();
+  return dead || callstack.empty();
 }

 void ThreadState::FillCallstack(rdcarray<Id> &funcs)
@@ -415,18 +411,33 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
 {
  const ThreadState *a = NULL, *b = NULL;

+  if(quadNeighbours[0] == ~0U || quadNeighbours[1] == ~0U || quadNeighbours[2] == ~0U ||
+     quadNeighbours[3] == ~0U)
+  {
+    debugger.GetAPIWrapper()->AddDebugMessage(
+        MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
+        StringFormat::Fmt("Derivative calculation within non-quad on input %s",
+                          debugger.GetHumanName(val).c_str()));
+    return ShaderVariable("", 0.0f, 0.0f, 0.0f, 0.0f);
+  }
+
+  RDCASSERT(quadNeighbours[0] < workgroup.size(), quadNeighbours[0], workgroup.size());
+  RDCASSERT(quadNeighbours[1] < workgroup.size(), quadNeighbours[1], workgroup.size());
+  RDCASSERT(quadNeighbours[2] < workgroup.size(), quadNeighbours[2], workgroup.size());
+  RDCASSERT(quadNeighbours[3] < workgroup.size(), quadNeighbours[3], workgroup.size());
+
  const bool xdirection = (dir == DDX);
  if(type == Coarse)
  {
    // coarse derivatives are identical across the quad, based on the top-left.
-    a = &workgroup[0];
-    b = &workgroup[xdirection ? 1 : 2];
+    a = &workgroup[quadNeighbours[0]];
+    b = &workgroup[quadNeighbours[xdirection ? 1 : 2]];
  }
  else
  {
    // we need to figure out the exact pair to use
-    int x = workgroupIndex & 1;
-    int y = workgroupIndex / 2;
+    int x = quadLaneIndex & 1;
+    int y = quadLaneIndex / 2;

    if(x == 0)
    {
@@ -435,13 +446,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
        // top-left
        if(xdirection)
        {
-          a = &workgroup[0];
-          b = &workgroup[1];
+          a = &workgroup[quadNeighbours[0]];
+          b = &workgroup[quadNeighbours[1]];
        }
        else
        {
-          a = &workgroup[0];
-          b = &workgroup[2];
+          a = &workgroup[quadNeighbours[0]];
+          b = &workgroup[quadNeighbours[2]];
        }
      }
      else
@@ -449,13 +460,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
        // bottom-left
        if(xdirection)
        {
-          a = &workgroup[2];
-          b = &workgroup[3];
+          a = &workgroup[quadNeighbours[2]];
+          b = &workgroup[quadNeighbours[3]];
        }
        else
        {
-          a = &workgroup[0];
-          b = &workgroup[2];
+          a = &workgroup[quadNeighbours[0]];
+          b = &workgroup[quadNeighbours[2]];
        }
      }
    }
@@ -466,13 +477,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
        // top-right
        if(xdirection)
        {
-          a = &workgroup[0];
-          b = &workgroup[1];
+          a = &workgroup[quadNeighbours[0]];
+          b = &workgroup[quadNeighbours[1]];
        }
        else
        {
-          a = &workgroup[1];
-          b = &workgroup[3];
+          a = &workgroup[quadNeighbours[1]];
+          b = &workgroup[quadNeighbours[3]];
        }
      }
      else
@@ -480,13 +491,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De
        // bottom-right
        if(xdirection)
        {
-          a = &workgroup[2];
-          b = &workgroup[3];
+          a = &workgroup[quadNeighbours[2]];
+          b = &workgroup[quadNeighbours[3]];
        }
        else
        {
-          a = &workgroup[1];
-          b = &workgroup[3];
+          a = &workgroup[quadNeighbours[1]];
+          b = &workgroup[quadNeighbours[3]];
        }
      }
    }
@@ -3068,7 +3079,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray<ThreadState>
    case Op::TerminateInvocation:
    case Op::Kill:
    {
-      killed = true;
+      dead = true;

      // destroy all stack frames
      for(StackFrame *exitingFrame : callstack)
@@ -175,7 +175,7 @@ class Debugger;

 struct ThreadState
 {
-  ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState);
+  ThreadState(Debugger &debug, const GlobalState &globalState);
  ~ThreadState();

  void EnterEntryPoint(ShaderDebugState *state);
@@ -231,10 +231,16 @@ struct ThreadState

  std::map<Id, uint32_t> lastWrite;

-  // index in the pixel quad
-  uint32_t workgroupIndex;
-  bool helperInvocation;
-  bool killed;
+  // quad ID (arbitrary, just used to find neighbours for derivatives)
+  uint32_t quadId = 0;
+  // index in the pixel quad (relative to the active lane)
+  uint32_t quadLaneIndex = ~0U;
+  // the lane indices of our quad neighbours
+  uint32_t quadNeighbours[4] = {~0U, ~0U, ~0U, ~0U};
+  // index in the workgroup
+  uint32_t workgroupIndex = 0;
+  bool helperInvocation = false;
+  bool dead = true;

  const ShaderVariable &GetSrc(Id id) const;
  void WritePointerValue(Id pointer, const ShaderVariable &val);
@@ -368,7 +374,8 @@ public:
  ShaderDebugTrace *BeginDebug(DebugAPIWrapper *apiWrapper, const ShaderStage stage,
                               const rdcstr &entryPoint, const rdcarray<SpecConstant> &specInfo,
                               const std::map<size_t, uint32_t> &instructionLines,
-                               const SPIRVPatchData &patchData, uint32_t activeIndex);
+                               const SPIRVPatchData &patchData, uint32_t activeIndex,
+                               uint32_t workgroupSize);

  rdcarray<ShaderDebugState> ContinueDebug();

@@ -838,7 +838,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
                                       const rdcstr &entryPoint,
                                       const rdcarray<SpecConstant> &specInfo,
                                       const std::map<size_t, uint32_t> &instructionLines,
-                                       const SPIRVPatchData &patchData, uint32_t activeIndex)
+                                       const SPIRVPatchData &patchData, uint32_t activeIndex,
+                                       uint32_t workgroupSize)
 {
  Id entryId = entryLookup[ShaderEntryPoint(entryPoint, shaderStage)];

@@ -895,9 +896,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
  stage = shaderStage;
  apiWrapper = api;

-  uint32_t workgroupSize = shaderStage == ShaderStage::Pixel ? 4 : 1;
  for(uint32_t i = 0; i < workgroupSize; i++)
-    workgroup.push_back(ThreadState(i, *this, global));
+    workgroup.push_back(ThreadState(*this, global));

  ThreadState &active = GetActiveLane();

@@ -1489,6 +1489,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s
  for(uint32_t i = 0; i < workgroupSize; i++)
  {
    ThreadState &lane = workgroup[i];
+    lane.workgroupIndex = i;
    if(i != activeLaneIndex)
    {
      lane.nextInstruction = active.nextInstruction;
@@ -1499,16 +1500,68 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s

    if(stage == ShaderStage::Pixel)
    {
-      ShaderVariable var(rdcstr(), 0U, 0U, 0U, 0U);
-      apiWrapper->FillInputValue(var, ShaderBuiltin::IsHelper, i, 0, 0);
-      lane.helperInvocation = var.value.u32v[0] != 0;
+      lane.helperInvocation = apiWrapper->GetThreadProperty(i, ThreadProperty::Helper) != 0;
+      lane.quadLaneIndex = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadLane);
+      lane.quadId = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadId);
    }

+    lane.dead = apiWrapper->GetThreadProperty(i, ThreadProperty::Active) == 0;
+
    // now that the globals are allocated and their storage won't move, we can take pointers to them
    for(const PointerId &p : pointerIDs)
      p.Set(*this, global, lane);
  }

+  // find quad neighbours
+  {
+    rdcarray<uint32_t> processedQuads;
+    for(uint32_t i = 0; i < workgroupSize; i++)
+    {
+      uint32_t desiredQuad = workgroup[i].quadId;
+
+      // ignore threads not in any quad
+      if(desiredQuad == 0)
+        continue;
+
+      // quads are almost certainly sorted together, so shortcut by checking the last one
+      if((!processedQuads.empty() && processedQuads.back() == desiredQuad) ||
+         processedQuads.contains(desiredQuad))
+        continue;
+
+      processedQuads.push_back(desiredQuad);
+
+      // find the threads
+      uint32_t threads[4] = {
+          i,
+          ~0U,
+          ~0U,
+          ~0U,
+      };
+      for(uint32_t j = i + 1, t = 1; j < workgroupSize && t < 4; j++)
+      {
+        if(workgroup[j].quadId == desiredQuad)
+          threads[t++] = j;
+      }
+
+      // now swizzle the threads to know each other
+      for(uint32_t src = 0; src < 4; src++)
+      {
+        // an incomplete quad leaves ~0U in threads[], which must never index the workgroup
+        uint32_t lane = threads[src] == ~0U ? ~0U : workgroup[threads[src]].quadLaneIndex;
+        if(lane >= 4)
+          continue;
+
+        for(uint32_t dst = 0; dst < 4; dst++)
+        {
+          if(threads[dst] == ~0U)
+            continue;
+
+          workgroup[threads[dst]].quadNeighbours[lane] = threads[src];
+        }
+      }
+    }
+  }
+
  // this contains all the accumulated line number information. Add in our disassembly mapping
  ret->instInfo = m_InstInfo;
  for(size_t i = 0; i < m_InstInfo.size(); i++)
@@ -2968,8 +2968,8 @@ struct VertexLaneData

 struct PixelLaneData
 {
-  Vec4f fragCoord;    // per-lane coord
-  uint32_t helper;    // per-lane helper bit
+  Vec4f fragCoord;      // per-lane coord
+  uint32_t isHelper;    // per-lane helper bit
  uint32_t padding[3];
 };

@@ -3302,7 +3302,7 @@ static void CreateInputFetcher(rdcarray<uint32_t> &spv,
      helper.base = helper.loadOps.add(rdcspv::OpSelect(uint32Type, editor.MakeId(), helper.base,
                                                        getUIntConst(1), getUIntConst(0)));
      fixedValues.push_back(helper);
-      structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, helper)});
+      structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, isHelper)});

      offset += sizeof(PixelLaneData);
    }
@@ -3881,6 +3881,9 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u

  apiWrapper->location_inputs.resize(numThreads);
  apiWrapper->thread_builtins.resize(numThreads);
+  apiWrapper->thread_props.resize(numThreads);
+
+  apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;

  std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
  global_builtins[ShaderBuiltin::BaseInstance] =
@@ -4056,7 +4059,7 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u
  rdcspv::Debugger *debugger = new rdcspv::Debugger;
  debugger->Parse(shader.spirv.GetSPIRV());
  ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Vertex, entryPoint, spec,
-                                               shadRefl.instructionLines, shadRefl.patchData, 0);
+                                               shadRefl.instructionLines, shadRefl.patchData, 0, 1);
  apiWrapper->ResetReplay();

  return ret;
@@ -4589,27 +4592,40 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
    {
      byte *value = LaneData + t * structStride;

+      apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1;
+
      // read PixelLaneData
-      PixelLaneData *pixelData = (PixelLaneData *)value;
+      {
+        PixelLaneData *pixelData = (PixelLaneData *)value;
+
+        {
+          ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position];
+
+          var.rows = 1;
+          var.columns = 4;
+          var.type = VarType::Float;
+
+          memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f));
+        }
+
+        {
+          ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper];
+
+          var.rows = 1;
+          var.columns = 1;
+          var.type = VarType::Bool;
+
+          memcpy(var.value.u8v.data(), &pixelData->isHelper, sizeof(uint32_t));
+        }
+
+        if(numThreads == 4)
+          apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1;
+        apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Helper] =
+            t != winner->laneIndex ? 1 : 0;
+        apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadId] = 1000;
+        apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadLane] = t;
+      }
      value += sizeof(PixelLaneData);
-      {
-        ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position];
-
-        var.rows = 1;
-        var.columns = 4;
-        var.type = VarType::Float;
-
-        memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f));
-      }
-      {
-        ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper];
-
-        var.rows = 1;
-        var.columns = 1;
-        var.type = VarType::Bool;
-
-        memcpy(var.value.u8v.data(), &pixelData->helper, sizeof(uint32_t));
-      }

      for(size_t i = 0; i < shadRefl.refl->inputSignature.size(); i++)
      {
@@ -4640,7 +4656,8 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
    }

    ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
-                               shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex);
+                               shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex,
+                               numThreads);
    apiWrapper->ResetReplay();
  }
  else
@@ -4712,6 +4729,9 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId,
  static const uint32_t numThreads = 1;

  apiWrapper->thread_builtins.resize(numThreads);
+  apiWrapper->thread_props.resize(numThreads);
+
+  apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;

  std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
  global_builtins[ShaderBuiltin::DispatchSize] =
@@ -4736,7 +4756,7 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId,
  rdcspv::Debugger *debugger = new rdcspv::Debugger;
  debugger->Parse(shader.spirv.GetSPIRV());
  ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Compute, entryPoint, spec,
-                                               shadRefl.instructionLines, shadRefl.patchData, 0);
+                                               shadRefl.instructionLines, shadRefl.patchData, 0, 1);
  apiWrapper->ResetReplay();

  return ret;
@@ -4800,6 +4820,9 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t eventId,
  static const uint32_t numThreads = 1;

  apiWrapper->thread_builtins.resize(numThreads);
+  apiWrapper->thread_props.resize(numThreads);
+
+  apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1;

  std::unordered_map<ShaderBuiltin, ShaderVariable> &global_builtins = apiWrapper->global_builtins;
  global_builtins[ShaderBuiltin::DispatchSize] =
@@ -4824,7 +4847,7 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t eventId,
  rdcspv::Debugger *debugger = new rdcspv::Debugger;
  debugger->Parse(shader.spirv.GetSPIRV());
  ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Mesh, entryPoint, spec,
-                                               shadRefl.instructionLines, shadRefl.patchData, 0);
+                                               shadRefl.instructionLines, shadRefl.patchData, 0, 1);
  apiWrapper->ResetReplay();

  return ret;