Fetch and apply pixel inputs and derivatives to quad

2026-05-05 17:40:39 +00:00 · 2020-04-03 17:17:46 +01:00
parent 21f869192c
commit fc2181d625
4 changed files with 477 additions and 24 deletions
@@ -25,6 +25,7 @@
 #pragma once

 #include "api/replay/rdcarray.h"
+#include "maths/vec.h"
 #include "spirv_common.h"
 #include "spirv_processor.h"

@@ -43,7 +44,17 @@ public:
  virtual void ReadConstantBufferValue(uint32_t set, uint32_t bind, uint32_t offset,
                                       uint32_t byteSize, void *dst) = 0;
  virtual void FillInputValue(ShaderVariable &var, ShaderBuiltin builtin, uint32_t location,
-                              uint32_t offset) = 0;
+                              uint32_t component) = 0;
+
+  // Bundle of derivative deltas for a single input, as returned by GetDerivative(). Holds the
+  // coarse and fine variants of the X and Y derivatives, one Vec4f each.
+  struct DerivativeDeltas
+  {
+    // coarse derivatives in X and Y
+    Vec4f ddxcoarse;
+    Vec4f ddycoarse;
+    // fine derivatives in X and Y
+    Vec4f ddxfine;
+    Vec4f ddyfine;
+  };
+
+  virtual DerivativeDeltas GetDerivative(uint32_t location, uint32_t component) = 0;
 };

 struct GlobalState
@@ -172,6 +183,9 @@ private:
  uint32_t AllocateVariable(const Decorations &varDecorations, const Decorations &curDecorations,
                            DebugVariableType sourceVarType, const rdcstr &sourceName,
                            uint32_t offset, const DataType &inType, ShaderVariable &outVar);
+  uint32_t ApplyDerivatives(uint32_t quadIndex, const Decorations &curDecorations,
+                            uint32_t location, const DataType &inType, ShaderVariable &outVar);
+
  void AddSourceVars(rdcarray<SourceVariableMapping> &sourceVars, const DataType &inType,
                     const rdcstr &sourceName, const rdcstr &varName, uint32_t &offset);
  void MakeSignatureNames(const rdcarray<SPIRVInterfaceAccess> &sigList, rdcarray<rdcstr> &sigNames);
@@ -264,7 +264,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const Shader
      // fill the interface variable
      AllocateVariable(decorations[v.id], decorations[v.id],
                       isInput ? DebugVariableType::Input : DebugVariableType::Variable, sourceName,
-                       0, dataTypes[type.InnerType()], var);
+                       decorations[v.id].location, dataTypes[type.InnerType()], var);

      for(size_t i = oldSize; i < globalSourceVars.size(); i++)
        globalSourceVars[i].signatureIndex =
@@ -385,6 +385,33 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const Shader
    workgroup[i].inputs = active.inputs;
    workgroup[i].outputs = active.outputs;
    workgroup[i].ids = active.ids;
+    // mark as inactive/helper lane
+    workgroup[i].done = true;
+  }
+
+  if(stage == ShaderStage::Pixel)
+  {
+    // apply derivatives to generate the correct inputs for the quad neighbours
+    for(uint32_t q = 0; q < workgroupSize; q++)
+    {
+      if(q == activeLaneIndex)
+        continue;
+
+      for(size_t i = 0; i < inputIDs.size(); i++)
+      {
+        Id id = inputIDs[i];
+
+        const DataType &type = dataTypes[idTypes[id]];
+
+        // global variables should all be pointers into opaque storage
+        RDCASSERT(type.type == DataType::PointerType);
+
+        const DataType &innertype = dataTypes[type.InnerType()];
+
+        ApplyDerivatives(q, decorations[id], decorations[id].location, innertype,
+                         workgroup[q].inputs[i]);
+      }
+    }
  }

  return ret;
@@ -980,10 +1007,21 @@ uint32_t Debugger::AllocateVariable(const Decorations &varDecorations,
  if(sourceVarType == DebugVariableType::Input)
  {
    uint32_t location = genLocations ? offset : 0;
+
+    uint32_t component = 0;
+    for(const DecorationAndParamData &dec : curDecorations.others)
+    {
+      if(dec.value == Decoration::Component)
+      {
+        component = dec.component;
+        break;
+      }
+    }
+
    apiWrapper->FillInputValue(
        outVar, builtin,
        (curDecorations.flags & Decorations::HasLocation) ? curDecorations.location : location,
-        (curDecorations.flags & Decorations::HasOffset) ? curDecorations.offset : 0);
+        component);
  }
  else if(sourceVarType == DebugVariableType::Constant)
  {
@@ -1044,6 +1082,226 @@ uint32_t Debugger::AllocateVariable(const Decorations &varDecorations,
  return outVar.rows;
 }

+// Adjusts the input value held in outVar so that it becomes the value for lane quadIndex of the
+// 2x2 quad, by adding/subtracting the deltas returned from apiWrapper->GetDerivative() relative
+// to the active lane (activeLaneIndex).
+//
+// Structs and arrays are walked recursively member by member, advancing the location by however
+// many locations each child reports. Only variables of type VarType::Float are modified.
+//
+// quadIndex      - index (0-3) of the quad lane whose inputs are being generated
+// curDecorations - decorations of this variable/member; supplies the Component decoration and,
+//                  when HasLocation is set, an explicit location that overrides 'location'
+// location       - location to use when curDecorations carries no explicit location
+// inType         - type of the variable, used to decide how to recurse
+// outVar         - variable updated in place
+//
+// Returns the number of locations consumed: outVar.rows for scalars/vectors/matrices, the sum of
+// the children for structs/arrays, and 0 for unsupported types.
+uint32_t Debugger::ApplyDerivatives(uint32_t quadIndex, const Decorations &curDecorations,
+                                    uint32_t location, const DataType &inType, ShaderVariable &outVar)
+{
+  switch(inType.type)
+  {
+    case DataType::PointerType:
+    {
+      RDCERR("Pointers not supported in interface variables");
+      return 0;
+    }
+    case DataType::ScalarType:
+    case DataType::VectorType:
+    case DataType::MatrixType: break;
+    case DataType::StructType:
+    {
+      uint32_t childLocation = 0;
+      for(int32_t i = 0; i < inType.children.count(); i++)
+      {
+        const Decorations &childDecorations = inType.children[i].decorations;
+
+        uint32_t locations = ApplyDerivatives(quadIndex, childDecorations, location + childLocation,
+                                              dataTypes[inType.children[i].type], outVar.members[i]);
+
+        childLocation += locations;
+      }
+      return childLocation;
+    }
+    case DataType::ArrayType:
+    {
+      uint32_t childLocation = 0;
+
+      ShaderVariable len = GetActiveLane().ids[inType.length];
+      for(uint32_t i = 0; i < len.value.u.x; i++)
+      {
+        uint32_t locations = ApplyDerivatives(quadIndex, curDecorations, location + childLocation,
+                                              dataTypes[inType.InnerType()], outVar.members[i]);
+
+        childLocation += locations;
+      }
+      return childLocation;
+    }
+    case DataType::ImageType:
+    case DataType::SamplerType:
+    case DataType::SampledImageType:
+    case DataType::UnknownType:
+    {
+      RDCERR("Unexpected variable type %d", inType.type);
+      return 0;
+    }
+  }
+
+  // only floats have derivatives
+  if(outVar.type == VarType::Float)
+  {
+    uint32_t component = 0;
+    for(const DecorationAndParamData &dec : curDecorations.others)
+    {
+      if(dec.value == Decoration::Component)
+      {
+        component = dec.component;
+        break;
+      }
+    }
+
+    // We make the assumption that the coarse derivatives are generated from (0,0) in the quad, and
+    // fine derivatives are generated from the destination index and its neighbours in X and Y.
+    // This isn't spec'd but we must assume something and this will hopefully get us closest to
+    // reproducing actual results.
+    //
+    // For debugging, we need members of the quad to be able to generate coarse and fine
+    // derivatives.
+    //
+    // For (0,0) we only need the coarse derivatives to get our neighbours (1,0) and (0,1) which
+    // will give us coarse and fine derivatives being identical.
+    //
+    // For the others we will need to use a combination of coarse and fine derivatives to get the
+    // diagonal element in the quad. In the examples below, remember that the quad indices are:
+    //
+    // +---+---+
+    // | 0 | 1 |
+    // +---+---+
+    // | 2 | 3 |
+    // +---+---+
+    //
+    // And that we have definitions of the derivatives:
+    //
+    // ddx_coarse = (1,0) - (0,0)
+    // ddy_coarse = (0,1) - (0,0)
+    //
+    // i.e. the same for all members of the quad
+    //
+    // ddx_fine   = (x,y) - (1-x,y)
+    // ddy_fine   = (x,y) - (x,1-y)
+    //
+    // i.e. the difference to the neighbour of our desired invocation (the one we have the actual
+    // inputs for, from gathering above).
+    //
+    // So e.g. if our thread is at (1,1) destIdx = 3
+    //
+    // (1,0) = (1,1) - ddx_fine
+    // (0,1) = (1,1) - ddy_fine
+    // (0,0) = (1,1) - ddy_fine - ddx_coarse
+    //
+    // and ddy_coarse is unused. For (1,0) destIdx = 1:
+    //
+    // (1,1) = (1,0) + ddy_fine
+    // (0,1) = (1,0) - ddx_coarse + ddy_coarse
+    // (0,0) = (1,0) - ddx_coarse
+    //
+    // and ddx_fine is unused (it's identical to ddx_coarse anyway)
+
+    // an explicit location decoration on this variable/member wins over the computed location
+    if(curDecorations.flags & Decorations::HasLocation)
+      location = curDecorations.location;
+
+    DebugAPIWrapper::DerivativeDeltas derivs = apiWrapper->GetDerivative(location, component);
+
+    // NOTE(review): only the first Vec4f of the value is adjusted here even though outVar.rows
+    // locations are reported as consumed below - confirm multi-row (matrix) inputs are handled
+    // as intended.
+    Vec4f &dst = *(Vec4f *)outVar.value.fv;
+
+    // in the diagrams below * marks the active lane index.
+    //
+    //   V and ^ == coarse ddy
+    //   , and ` == fine ddy
+    //   < and > == coarse ddx
+    //   { and } == fine ddx
+    //
+    // We are basically making one or two cardinal direction moves from the starting point
+    // (activeLaneIndex) to the end point (quadIndex).
+    RDCASSERTNOTEQUAL(activeLaneIndex, quadIndex);
+
+    switch(activeLaneIndex)
+    {
+      case 0:
+      {
+        // +---+---+
+        // |*0 > 1 |
+        // +-V-+-V-+
+        // | 2 | 3 |
+        // +---+---+
+        switch(quadIndex)
+        {
+          case 0: break;
+          case 1: dst += derivs.ddxcoarse; break;
+          case 2: dst += derivs.ddycoarse; break;
+          case 3:
+            dst += derivs.ddxcoarse;
+            dst += derivs.ddycoarse;
+            break;
+          default: break;
+        }
+        break;
+      }
+      case 1:
+      {
+        // we need to use fine to get from 1 to 3 as coarse only ever involves 0->1 and 0->2
+        // +---+---+
+        // | 0 < 1*|
+        // +-V-+-,-+
+        // | 2 | 3 |
+        // +---+---+
+        switch(quadIndex)
+        {
+          case 0: dst -= derivs.ddxcoarse; break;
+          case 1: break;
+          case 2:
+            dst -= derivs.ddxcoarse;
+            dst += derivs.ddycoarse;
+            break;
+          case 3: dst += derivs.ddyfine; break;
+          default: break;
+        }
+        break;
+      }
+      case 2:
+      {
+        // +---+---+
+        // | 0 > 1 |
+        // +-^-+---+
+        // |*2 } 3 |
+        // +---+---+
+        switch(quadIndex)
+        {
+          case 0: dst -= derivs.ddycoarse; break;
+          case 1:
+            dst -= derivs.ddycoarse;
+            dst += derivs.ddxcoarse;
+            break;
+          case 2: break;
+          case 3: dst += derivs.ddxfine; break;
+          default: break;
+        }
+        break;
+      }
+      case 3:
+      {
+        // +---+---+
+        // | 0 < 1 |
+        // +---+-`-+
+        // | 2 { 3*|
+        // +---+---+
+        switch(quadIndex)
+        {
+          case 0:
+            dst -= derivs.ddyfine;
+            dst -= derivs.ddxcoarse;
+            break;
+          case 1: dst -= derivs.ddyfine; break;
+          case 2: dst -= derivs.ddxfine; break;
+          case 3: break;
+          default: break;
+        }
+        break;
+      }
+      default: break;
+    }
+  }
+
+  // each row consumes a new location
+  return outVar.rows;
+}
+
 void Debugger::PreParse(uint32_t maxId)
 {
  Processor::PreParse(maxId);
@@ -61,7 +61,7 @@ public:
  }

  virtual void FillInputValue(ShaderVariable &var, ShaderBuiltin builtin, uint32_t location,
-                              uint32_t offset) override
+                              uint32_t component) override
  {
    if(builtin != ShaderBuiltin::Undefined)
    {
@@ -76,7 +76,8 @@ public:
      return;
    }

-    RDCASSERT(offset == 0);
+    // TODO handle components
+    RDCASSERT(component == 0);

    if(location < location_inputs.size())
    {
@@ -84,13 +85,28 @@ public:
      return;
    }

-    RDCERR("Couldn't get input for location=%u, offset=%u", location, offset);
+    RDCERR("Couldn't get input for %s at location=%u, component=%u", var.name.c_str(), location,
+           component);
+  }
+
+  // Looks up the derivative deltas stored for an input location. Locations outside the
+  // 'derivatives' array are reported as an error and yield default-constructed deltas.
+  virtual DerivativeDeltas GetDerivative(uint32_t location, uint32_t component) override
+  {
+    // TODO handle components
+    RDCASSERT(component == 0);
+
+    // bail out first if nothing was stored for this location
+    if(location >= derivatives.size())
+    {
+      RDCERR("Couldn't get derivative for location=%u, component=%u", location, component);
+      return DerivativeDeltas();
+    }
+
+    return derivatives[location];
  }

  std::map<rdcpair<uint32_t, uint32_t>, bytebuf> cbuffers;
  std::map<ShaderBuiltin, ShaderVariable> builtin_inputs;
  rdcarray<ShaderVariable> location_inputs;

+  rdcarray<DerivativeDeltas> derivatives;
+
 private:
  WrappedVulkan *m_pDriver = NULL;
 };
@@ -113,6 +129,16 @@ enum class InputSpecConstant

 static const uint32_t validMagicNumber = 12345;

+// Header of one fragment hit record as read back from the feedback buffer by DebugPixel. The
+// fetched pixel shader input data immediately follows this header in memory.
+struct PSHit
+{
+  // position of the hit; pos.z is what the depth comparison uses when choosing between hits
+  Vec4f pos;
+  // primitive this hit belongs to, compared against the requested primitive
+  uint32_t prim;
+  // sample this hit belongs to, compared against the requested sample
+  uint32_t sample;
+  // equals validMagicNumber when the record was written; other values are skipped by the reader
+  uint32_t valid;
+  uint32_t padding;
+  // PSInput base, ddx, ....
+};
+
 static void CreatePSInputFetcher(rdcarray<uint32_t> &fragspv, uint32_t &structStride,
                                 VulkanCreationInfo::ShaderModuleReflection &shadRefl,
                                 StorageMode storageMode, bool usePrimitiveID, bool useSampleID)
@@ -487,9 +513,6 @@ static void CreatePSInputFetcher(rdcarray<uint32_t> &fragspv, uint32_t &structSt
    member++;
  }

-  // we have 5 input structs, and two vectors for our data
-  structStride = sizeof(Vec4f) + sizeof(Vec4f) + structStride * 5;
-
  rdcspv::Id PSHitRTArray = editor.AddType(rdcspv::OpTypeRuntimeArray(editor.MakeId(), PSHit));

  editor.AddDecoration(rdcspv::OpDecorate(
@@ -1145,7 +1168,10 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_

  m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(graphicsInfo, state.graphics.pipeline);

-  VkDeviceSize feedbackStorageSize = overdrawLevels * structStride + sizeof(Vec4f) + 1024;
+  // struct size is PSHit header plus 5x structStride = base, ddxcoarse, ddycoarse, ddxfine, ddyfine
+  uint32_t structSize = sizeof(PSHit) + structStride * 5;
+
+  VkDeviceSize feedbackStorageSize = overdrawLevels * structSize + sizeof(Vec4f) + 1024;

  if(feedbackStorageSize > m_BindlessFeedback.FeedbackBuffer.sz)
  {
@@ -1387,27 +1413,150 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_

  base += sizeof(Vec4f);

-  struct PSHit
-  {
-    Vec4f pos;
-    uint32_t prim;
-    uint32_t sample;
-    uint32_t valid;
-    uint32_t padding;
-    // PSInput base, ddx, ....
-  };
+  PSHit *winner = NULL;
+
+  RDCLOG("Got %u hits", numHits);
+
+  // if we encounter multiple hits at our destination pixel co-ord (or any other) we
+  // check to see if a specific primitive was requested (via primitive parameter not
+  // being set to ~0U). If it was, debug that pixel, otherwise do a best-estimate
+  // of which fragment was the last to successfully depth test and debug that, just by
+  // checking if the depth test is ordered and picking the final fragment in the series
+
+  // figure out the TL pixel's coords. Assume even top left (towards 0,0)
+  // this isn't spec'd but is a reasonable assumption.
+  int xTL = x & (~1);
+  int yTL = y & (~1);
+
+  // get the index of our desired pixel
+  int destIdx = (x - xTL) + 2 * (y - yTL);
+
+  VkCompareOp depthOp = pipe.depthCompareOp;
+
+  // depth tests disabled acts the same as always compare mode
+  if(!pipe.depthTestEnable)
+    depthOp = VK_COMPARE_OP_ALWAYS;

  for(uint32_t i = 0; i < numHits; i++)
  {
+    // Pick the hit to debug out of all recorded hits. Each record is a PSHit header followed by
+    // the 5 input structs (base + 4 derivatives), so consecutive records are structSize bytes
+    // apart - structStride only covers a single input struct.
-    PSHit *hit = (PSHit *)(base + structStride * i);
+    PSHit *hit = (PSHit *)(base + structSize * i);

-    RDCLOG("Hit %u at %f, %f, %f, %f", i, hit->pos.x, hit->pos.y, hit->pos.z, hit->pos.w);
+    if(hit->valid != validMagicNumber)
+    {
+      RDCWARN("Hit %u doesn't have valid magic number", i);
+      continue;
+    }
+
+    // see if this hit is a closer match than the previous winner.
+
+    // if there's no previous winner it's clearly better
+    if(winner == NULL)
+    {
+      winner = hit;
+      continue;
+    }
+
+    // if we're looking for a specific primitive
+    if(primitive != ~0U)
+    {
+      // and this hit is a match and the winner isn't, it's better
+      if(winner->prim != primitive && hit->prim == primitive)
+      {
+        winner = hit;
+        continue;
+      }
+
+      // if the winner is a match and we're not, we can't be better so stop now
+      if(winner->prim == primitive && hit->prim != primitive)
+      {
+        continue;
+      }
+    }
+
+    // if we're looking for a particular sample, check that
+    if(sample != ~0U)
+    {
+      if(winner->sample != sample && hit->sample == sample)
+      {
+        winner = hit;
+        continue;
+      }
+
+      if(winner->sample == sample && hit->sample != sample)
+      {
+        continue;
+      }
+    }
+
+    // otherwise apply depth test
+    switch(depthOp)
+    {
+      case VK_COMPARE_OP_NEVER:
+      case VK_COMPARE_OP_EQUAL:
+      case VK_COMPARE_OP_NOT_EQUAL:
+      case VK_COMPARE_OP_ALWAYS:
+      default:
+        // don't emulate equal or not equal since we don't know the reference value. Take any hit
+        // (thus meaning the last hit)
+        winner = hit;
+        break;
+      case VK_COMPARE_OP_LESS:
+        if(hit->pos.z < winner->pos.z)
+          winner = hit;
+        break;
+      case VK_COMPARE_OP_LESS_OR_EQUAL:
+        if(hit->pos.z <= winner->pos.z)
+          winner = hit;
+        break;
+      case VK_COMPARE_OP_GREATER:
+        if(hit->pos.z > winner->pos.z)
+          winner = hit;
+        break;
+      case VK_COMPARE_OP_GREATER_OR_EQUAL:
+        if(hit->pos.z >= winner->pos.z)
+          winner = hit;
+        break;
+    }
  }

-  rdcspv::Debugger *debugger = new rdcspv::Debugger;
-  debugger->Parse(shader.spirv.GetSPIRV());
-  ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
-                                               shadRefl.instructionLines, shadRefl.patchData, 0);
+  ShaderDebugTrace *ret = NULL;
+
+  if(winner)
+  {
+    rdcspv::Debugger *debugger = new rdcspv::Debugger;
+    debugger->Parse(shader.spirv.GetSPIRV());
+
+    // the data immediately follows the PSHit header. Every piece of data is vec4 aligned, and the
+    // output is in input signature order.
+    byte *PSInputs = (byte *)(winner + 1);
+    Vec4f *value = (Vec4f *)(PSInputs + 0 * structStride);
+    Vec4f *ddxcoarse = (Vec4f *)(PSInputs + 1 * structStride);
+    Vec4f *ddycoarse = (Vec4f *)(PSInputs + 2 * structStride);
+    Vec4f *ddxfine = (Vec4f *)(PSInputs + 3 * structStride);
+    Vec4f *ddyfine = (Vec4f *)(PSInputs + 4 * structStride);
+
+    rdcarray<ShaderVariable> &locations = apiWrapper->location_inputs;
+    for(size_t i = 0; i < shadRefl.refl.inputSignature.size(); i++)
+    {
+      const SigParameter &param = shadRefl.refl.inputSignature[i];
+      locations.resize(RDCMAX((uint32_t)locations.size(), param.regIndex + 1));
+      apiWrapper->derivatives.resize(RDCMAX((uint32_t)locations.size(), param.regIndex + 1));
+
+      memcpy(&locations[param.regIndex].value.uv, &value[i], sizeof(Vec4f));
+      memcpy(&apiWrapper->derivatives[param.regIndex].ddxcoarse, &ddxcoarse[i], sizeof(Vec4f));
+      memcpy(&apiWrapper->derivatives[param.regIndex].ddycoarse, &ddycoarse[i], sizeof(Vec4f));
+      memcpy(&apiWrapper->derivatives[param.regIndex].ddxfine, &ddxfine[i], sizeof(Vec4f));
+      memcpy(&apiWrapper->derivatives[param.regIndex].ddyfine, &ddyfine[i], sizeof(Vec4f));
+    }
+
+    ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
+                               shadRefl.instructionLines, shadRefl.patchData, destIdx);
+  }
+  else
+  {
+    RDCLOG("Didn't get any valid hit to debug");
+    delete apiWrapper;
+  }

  if(descpool != VK_NULL_HANDLE)
  {
@@ -109,6 +109,38 @@ inline Vec3f operator+=(Vec3f &a, const Vec3f &b)
  return a;
 }

+// Scales every component of a, including w, by the scalar b.
+inline Vec4f operator*(const Vec4f &a, const float b)
+{
+  // w must be passed explicitly, otherwise it falls back to the constructor default
+  return Vec4f(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+
+// Component-wise sum of a and b across all four components.
+inline Vec4f operator+(const Vec4f &a, const Vec4f &b)
+{
+  // w must be passed explicitly, otherwise it falls back to the constructor default and the
+  // w component would be lost by every operator built on top of this one
+  return Vec4f(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+// Negates every component of a, including w.
+inline Vec4f operator-(const Vec4f &a)
+{
+  // w must be passed explicitly, otherwise it falls back to the constructor default
+  return Vec4f(-a.x, -a.y, -a.z, -a.w);
+}
+
+// Difference of a and b, computed as a plus the negation of b.
+inline Vec4f operator-(const Vec4f &a, const Vec4f &b)
+{
+  const Vec4f negated = -b;
+  return a + negated;
+}
+
+// In-place subtraction: stores a - b into a and returns a copy of the updated value.
+inline Vec4f operator-=(Vec4f &a, const Vec4f &b)
+{
+  return a = a - b;
+}
+
+// In-place addition: stores a + b into a and returns a copy of the updated value.
+inline Vec4f operator+=(Vec4f &a, const Vec4f &b)
+{
+  return a = a + b;
+}
+
 struct Vec4u
 {
  Vec4u(uint32_t X = 0, uint32_t Y = 0, uint32_t Z = 0, uint32_t W = 0)