renderdoc/renderdoc/driver/shaders/dxbc/dx_debug.cpp

/******************************************************************************
 * The MIT License (MIT)
 *
 * Copyright (c) 2024-2025 Baldur Karlsson
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 ******************************************************************************/

#include "dx_debug.h"
#include "common/formatting.h"
#include "driver/shaders/dxil/dxil_debug.h"
#include "dxbc_bytecode.h"
#include "dxbc_common.h"
#include "dxbc_container.h"
#include "dxbc_debug.h"

namespace DXDebug
{
void GatherInputDataForInitialValues(const DXBC::DXBCContainer *dxbc, InputFetcher &fetcher,
                                     const DXBC::DXBCContainer *prevdxbc,
                                     rdcarray<rdcstr> &floatInputs, rdcarray<rdcstr> &nonfloatInputs,
                                     rdcarray<rdcstr> &inputVarNames)
{
  rdcarray<DXBC::InterpolationMode> interpModes;

  const rdcarray<SigParameter> &stageInputSig = dxbc->GetReflection()->InputSig;
  rdcarray<SigParameter> emptySig;
  const rdcarray<SigParameter> &prevStageOutputSig =
      prevdxbc ? prevdxbc->GetReflection()->OutputSig : emptySig;

  if(dxbc->GetDXBCByteCode())
    DXBCDebug::GetInterpolationModeForInputParams(stageInputSig, dxbc->GetDXBCByteCode(),
                                                  interpModes);
  else
    DXILDebug::GetInterpolationModeForInputParams(stageInputSig, dxbc->GetDXILByteCode(),
                                                  interpModes);

  // When debugging a pixel shader, we need to get the initial values of each pixel shader
  // input for the pixel that we are debugging, from whichever the previous shader stage was
  // configured in the pipeline. This function returns the input element definitions, other
  // associated data, the HLSL definition to use when gathering pixel shader initial values,
  // and the laneDataBufferStride of that HLSL structure.

  // This function does not provide any HLSL definitions for additional metadata that may be
  // needed for gathering initial values, such as primitive ID, and also does not provide the
  // shader function body.

  fetcher.inputs.clear();
  floatInputs.clear();
  nonfloatInputs.clear();
  inputVarNames.clear();
  fetcher.hlsl += "struct Inputs\n{\n";
  rdcstr defines, copyFunc;
  fetcher.laneDataBufferStride = 0;

  copyFunc =
      "void CopyInputs(out Inputs OUT, in Inputs IN) {\n"
      "  OUT = (Inputs)0;\n";

  if(stageInputSig.empty() && dxbc->m_Type == DXBC::ShaderType::Pixel)
  {
    fetcher.hlsl += "float4 input_dummy : SV_Position;\n";
    fetcher.hlsl += "#define POSITION_VAR input_dummy\n";

    fetcher.inputs.push_back(InputElement(-1, 0, 4, ShaderBuiltin::Undefined, true));

    fetcher.laneDataBufferStride += 4;
  }

  // name, pair<start semantic index, end semantic index>
  rdcarray<rdcpair<rdcstr, rdcpair<uint32_t, uint32_t>>> arrays;

  uint32_t nextreg = 0;

  size_t numInputs = stageInputSig.size();
  inputVarNames.resize(numInputs);

  for(size_t i = 0; i < numInputs; i++)
  {
    const SigParameter &sig = stageInputSig[i];

    fetcher.hlsl += "  ";

    bool included = true;

    // handled specially to account for SV_ ordering
    if(sig.systemValue == ShaderBuiltin::MSAACoverage ||
       sig.systemValue == ShaderBuiltin::IsFrontFace ||
       sig.systemValue == ShaderBuiltin::MSAASampleIndex)
    {
      fetcher.hlsl += "//";
      included = false;
    }

    // it seems sometimes primitive ID can be included within inputs and isn't subject to the SV_
    // ordering restrictions - possibly to allow for geometry shaders to output the primitive ID as
    // an interpolant. Only comment it out if it's the last input.
    if(i + 1 == numInputs && sig.systemValue == ShaderBuiltin::PrimitiveIndex)
    {
      fetcher.hlsl += "//";
      included = false;
    }

    int arrayIndex = -1;

    for(size_t a = 0; a < arrays.size(); a++)
    {
      if(sig.semanticName == arrays[a].first && arrays[a].second.first <= sig.semanticIndex &&
         arrays[a].second.second >= sig.semanticIndex)
      {
        fetcher.hlsl += "//";
        included = false;
        arrayIndex = sig.semanticIndex - arrays[a].second.first;
      }
    }

    int missingreg = int(sig.regIndex) - int(nextreg);

    // fill in holes from output sig of previous shader if possible, to try and
    // ensure the same register order
    for(int dummy = 0; dummy < missingreg; dummy++)
    {
      bool filled = false;

      size_t numPrevOutputs = prevStageOutputSig.size();
      for(size_t os = 0; os < numPrevOutputs; os++)
      {
        if(prevStageOutputSig[os].regIndex == nextreg + dummy)
        {
          filled = true;
          VarType varType = prevStageOutputSig[os].varType;
          uint32_t bytesPerColumn = (varType != VarType::Half) ? 4 : 2;

          if(varType == VarType::Float)
            fetcher.hlsl += "float";
          else if(varType == VarType::Half)
            fetcher.hlsl += "half";
          else if(varType == VarType::SInt)
            fetcher.hlsl += "int";
          else if(varType == VarType::UInt)
            fetcher.hlsl += "uint";
          else
            RDCERR("Unexpected input signature type: %s",
                   ToStr(prevStageOutputSig[os].varType).c_str());

          int numCols = (prevStageOutputSig[os].regChannelMask & 0x1 ? 1 : 0) +
                        (prevStageOutputSig[os].regChannelMask & 0x2 ? 1 : 0) +
                        (prevStageOutputSig[os].regChannelMask & 0x4 ? 1 : 0) +
                        (prevStageOutputSig[os].regChannelMask & 0x8 ? 1 : 0);

          rdcstr name = prevStageOutputSig[os].semanticIdxName;
          fetcher.hlsl += ToStr((uint32_t)numCols) + " input_" + name + " : " + name + ";\n";

          uint32_t byteSize = AlignUp4(numCols * bytesPerColumn);
          fetcher.laneDataBufferStride += byteSize;

          fetcher.inputs.push_back(InputElement(-1, 0, byteSize / 4, ShaderBuiltin::Undefined, true));
        }
      }

      if(!filled)
      {
        rdcstr dummy_reg = "dummy_register";
        dummy_reg += ToStr((uint32_t)nextreg + dummy);
        fetcher.hlsl += "float4 var_" + dummy_reg + " : semantic_" + dummy_reg + ";\n";

        fetcher.inputs.push_back(InputElement(-1, 0, 4, ShaderBuiltin::Undefined, true));

        fetcher.laneDataBufferStride += 4 * sizeof(float);
      }
    }

    nextreg = sig.regIndex + 1;

    DXBC::InterpolationMode interpolation = interpModes[i];
    if(interpolation != DXBC::InterpolationMode::INTERPOLATION_UNDEFINED)
      fetcher.hlsl += ToStr(interpolation) + " ";
    fetcher.hlsl += ToStr(sig.varType);

    int numCols = (sig.regChannelMask & 0x1 ? 1 : 0) + (sig.regChannelMask & 0x2 ? 1 : 0) +
                  (sig.regChannelMask & 0x4 ? 1 : 0) + (sig.regChannelMask & 0x8 ? 1 : 0);

    rdcstr name = sig.semanticIdxName;

    // arrays of interpolators are handled really weirdly. They use cbuffer
    // packing rules where each new value is in a new register (rather than
    // e.g. 2 x float2 in a single register), but that's pointless because
    // you can't dynamically index into input registers.
    // If we declare those elements as a non-array, the float2s or floats
    // will be packed into registers and won't match up to the previous
    // shader.
    // HOWEVER to add an extra bit of fun, fxc will happily pack other
    // parameters not in the array into spare parts of the registers.
    //
    // So I think the upshot is that we can detect arrays reliably by
    // whenever we encounter a float or float2 at the start of a register,
    // search forward to see if the next register has an element that is the
    // same semantic name and one higher semantic index. If so, there's an
    // array, so keep searching to enumerate its length.
    // I think this should be safe if the packing just happens to place those
    // registers together.

    int arrayLength = 0;

    if(included && numCols <= 3 && (sig.regChannelMask & 0x1))
    {
      uint32_t nextIdx = sig.semanticIndex + 1;

      for(size_t j = i + 1; j < numInputs; j++)
      {
        const SigParameter &jSig = stageInputSig[j];

        // if we've found the 'next' semantic
        if(sig.semanticName == jSig.semanticName && nextIdx == jSig.semanticIndex)
        {
          int jNumCols = (jSig.regChannelMask & 0x1 ? 1 : 0) + (jSig.regChannelMask & 0x2 ? 1 : 0) +
                         (jSig.regChannelMask & 0x4 ? 1 : 0) + (jSig.regChannelMask & 0x8 ? 1 : 0);

          DXBC::InterpolationMode jInterp = interpModes[j];

          // if it's the same size, type, and interpolation mode, then it could potentially be
          // packed into an array. Check if it's using the first channel component to tell whether
          // it's tightly packed with another semantic.
          if(jNumCols == numCols && interpolation == jInterp && sig.varType == jSig.varType &&
             jSig.regChannelMask & 0x1)
          {
            if(arrayLength == 0)
              arrayLength = 2;
            else
              arrayLength++;

            // continue searching now
            nextIdx++;
            j = i + 1;
            continue;
          }
        }
      }

      if(arrayLength > 0)
        arrays.push_back(
            make_rdcpair(sig.semanticName, make_rdcpair((uint32_t)sig.semanticIndex, nextIdx - 1)));
    }

    if(included)
    {
      // in UAV structs, arrays are packed tightly, so just multiply by arrayLength
      fetcher.laneDataBufferStride += 4 * numCols * RDCMAX(1, arrayLength);
    }

    // as another side effect of the above, an element declared as a 1-length array won't be
    // detected but it WILL be put in its own register (not packed together), so detect this
    // case too.
    // Note we have to search *backwards* because we need to know if this register should have
    // been packed into the previous register, but wasn't. float/float2/float3 can be packed after
    // an array just fine, so long as the sum of their components doesn't exceed a register width
    if(included && i > 0 && arrayLength == 0)
    {
      const SigParameter &prev = stageInputSig[i - 1];

      if(prev.regIndex != sig.regIndex && prev.compCount + sig.compCount <= 4)
        arrayLength = 1;
    }

    // The compiler is also really annoying and will go to great lengths to rearrange elements
    // and screw up our declaration, to pack things together. E.g.:
    // float2 a : TEXCOORD1;
    // float4 b : TEXCOORD2;
    // float4 c : TEXCOORD3;
    // float2 d : TEXCOORD4;
    // the compiler will move d up and pack it into the last two components of a.
    // To prevent this, we look forward and backward to check that we aren't expecting to pack
    // with anything, and if not then we just make it a 1-length array to ensure no packing.
    // Note the regChannelMask & 0x1 means it is using .x, so it's not the tail-end of a pack
    if(included && arrayLength == 0 && numCols <= 3 && (sig.regChannelMask & 0x1))
    {
      if(i == numInputs - 1)
      {
        // the last element is never packed
        arrayLength = 1;
      }
      else
      {
        // if the next reg is using .x, it wasn't packed with us
        if(stageInputSig[i + 1].regChannelMask & 0x1)
          arrayLength = 1;
      }
    }

    rdcstr inputName = "input_" + name;
    fetcher.hlsl += ToStr((uint32_t)numCols) + " " + inputName;
    if(arrayLength > 0)
      fetcher.hlsl += "[" + ToStr(arrayLength) + "]";
    fetcher.hlsl += " : " + name;
    // DXIL does not allow redeclaring SV_ variables, any that we might need which could already be
    // in Inputs must be obtained from there and not redeclared in our entry point
    if(included)
    {
      if(sig.systemValue == ShaderBuiltin::Position)
        defines += "#define POSITION_VAR " + inputName + rdcstr(arrayLength > 0 ? "[0]" : "") + "\n";
      else if(sig.systemValue == ShaderBuiltin::PrimitiveIndex)
        defines += "#define PRIM_VAR " + inputName + rdcstr(arrayLength > 0 ? "[0]" : "") + "\n";
      else if(sig.systemValue == ShaderBuiltin::VertexIndex)
        defines += "#define VERT_VAR " + inputName + rdcstr(arrayLength > 0 ? "[0]" : "") + "\n";
      else if(sig.systemValue == ShaderBuiltin::InstanceIndex)
        defines += "#define INST_VAR " + inputName + rdcstr(arrayLength > 0 ? "[0]" : "") + "\n";
    }

    inputVarNames[i] = inputName;
    if(arrayLength > 0)
      inputVarNames[i] += StringFormat::Fmt("[%d]", RDCMAX(0, arrayIndex));

    if(included && sig.channelUsedMask != 0)
    {
      rdcarray<rdcstr> &outArray = sig.varType == VarType::Float ? floatInputs : nonfloatInputs;
      if(arrayLength == 0)
      {
        outArray.push_back(inputName);
      }
      else
      {
        for(int a = 0; a < arrayLength; a++)
          outArray.push_back(inputName + "[" + ToStr(a) + "]");
      }

      copyFunc += StringFormat::Fmt("  OUT.%s = IN.%s;\n", inputName.c_str(), inputName.c_str());
    }

    fetcher.hlsl += ";\n";

    int firstElem = sig.regChannelMask & 0x1   ? 0
                    : sig.regChannelMask & 0x2 ? 1
                    : sig.regChannelMask & 0x4 ? 2
                    : sig.regChannelMask & 0x8 ? 3
                                               : -1;
    uint32_t bytesPerColumn = (sig.varType != VarType::Half) ? 4 : 2;
    uint32_t byteSize = AlignUp4(numCols * bytesPerColumn);

    // arrays get added all at once (because in the struct data, they are contiguous even if
    // in the input signature they're not).
    if(arrayIndex < 0)
    {
      if(arrayLength == 0)
      {
        fetcher.inputs.push_back(
            InputElement(sig.regIndex, firstElem, byteSize / 4, sig.systemValue, included));
      }
      else
      {
        for(int a = 0; a < arrayLength; a++)
        {
          fetcher.inputs.push_back(
              InputElement(sig.regIndex + a, firstElem, byteSize / 4, sig.systemValue, included));
        }
      }
    }
  }

  copyFunc += "\n};\n\n";

  fetcher.hlsl += "};\n\n" + defines + "\n\n" + copyFunc;
}

void CreateLegacyInputFetcher(const DXBC::DXBCContainer *dxbc, const InputFetcherConfig &cfg,
                              const rdcarray<rdcstr> &floatInputs,
                              const rdcarray<rdcstr> &inputVarNames, InputFetcher &fetcher)
{
  fetcher.hitBufferStride =
      sizeof(DXDebug::DebugHit) +
      (sizeof(DXDebug::PSLaneData) + fetcher.laneDataBufferStride) * cfg.maxWaveSize;
  fetcher.laneDataBufferStride = 0;

  bool dxil = dxbc->GetDXILByteCode() != NULL;

  // work around NV driver bug - it miscompiles the quad swizzle helper sometimes, so use the wave op instead
  if(!dxil || !cfg.waveOps)
    fetcher.hlsl += GetEmbeddedResource(hlsl_quadswizzle_hlsl);
  else
    fetcher.hlsl +=
        "#define quadSwizzleHelper(value, quadLaneIndex, readIndex) "
        "QuadReadLaneAt(value, readIndex)\n";

  fetcher.hlsl += R"(
struct LaneData
{
  uint laneIndex;
  uint active;
  uint2 pad;

  float4 pixelPos;

  uint isHelper;
  uint quadId;
  uint quadLane;
  uint coverage;

  uint sample;
  uint primitive;
  uint isFrontFace;
  uint pad2;

  Inputs IN;
};

struct DebugHit
{
  // only used in the first instance
  uint numHits;
  float3 pos_depth; // xy position and depth

  float derivValid;
  uint quadLaneIndex;
  uint laneIndex;
  uint subgroupSize;

  uint sample;
  uint primitive;
  uint2 pad;

  uint4 globalBallot;
  uint4 helperBallot;

  LaneData lanes[4];
};

RWStructuredBuffer<DebugHit> HitBuffer : register(HITBUFFER);

// float4 is wasteful in some cases but it's easier than using ByteAddressBuffer and manual
// packing
RWBuffer<float4> EvalCacheBuffer : register(EVALCACHEBUFFER);

void ExtractInputs(Inputs IN
#ifndef POSITION_VAR
                     , float4 debug_pixelPos : SV_Position
#endif
#if USEPRIM && !defined(PRIM_VAR)
                     , uint primitive : SV_PrimitiveID
#endif
                     // sample, coverage and isFrontFace are deliberately omittted from the
                     // IN struct for SV_ ordering reasons
                     , uint sample : SV_SampleIndex
                     , uint coverage : SV_Coverage
                     , bool isFrontFace : SV_IsFrontFace)
{
#ifdef POSITION_VAR
  float4 debug_pixelPos = IN.POSITION_VAR;
#endif

#if USEPRIM && defined(PRIM_VAR)
  uint primitive = IN.PRIM_VAR;
#elif !USEPRIM
  uint primitive = 0;
#endif

  const uint quadLaneIndex = (2u * (uint(debug_pixelPos.y) & 1u)) + (uint(debug_pixelPos.x) & 1u);

  // grab our output slot
  uint idx = MAXHIT;
  if(abs(debug_pixelPos.x - DESTX) < 0.5f && abs(debug_pixelPos.y - DESTY) < 0.5f)
    InterlockedAdd(HitBuffer[0].numHits, 1, idx);
  idx = min(idx, MAXHIT);

  HitBuffer[idx].pos_depth = debug_pixelPos.xyz;

  HitBuffer[idx].derivValid = ddx(debug_pixelPos.x);

  HitBuffer[idx].primitive = primitive;

  HitBuffer[idx].sample = sample;

  HitBuffer[idx].quadLaneIndex = quadLaneIndex;
  HitBuffer[idx].laneIndex = quadLaneIndex;
  HitBuffer[idx].subgroupSize = 4;

  HitBuffer[idx].globalBallot = 0;
  HitBuffer[idx].helperBallot = 0;

  // replicate these across the quad, we assume they do not vary
  HitBuffer[idx].lanes[0].primitive = primitive;
  HitBuffer[idx].lanes[1].primitive = primitive;
  HitBuffer[idx].lanes[2].primitive = primitive;
  HitBuffer[idx].lanes[3].primitive = primitive;

  HitBuffer[idx].lanes[0].isFrontFace = isFrontFace;
  HitBuffer[idx].lanes[1].isFrontFace = isFrontFace;
  HitBuffer[idx].lanes[2].isFrontFace = isFrontFace;
  HitBuffer[idx].lanes[3].isFrontFace = isFrontFace;

  HitBuffer[idx].lanes[0].sample = sample;
  HitBuffer[idx].lanes[1].sample = sample;
  HitBuffer[idx].lanes[2].sample = sample;
  HitBuffer[idx].lanes[3].sample = sample;

  // quad pixelPos will be set with other derivatives for float inputs

  // for the simple quad case, only the desired thread is considered non-helper
  HitBuffer[idx].lanes[0].isHelper = 1u;
  HitBuffer[idx].lanes[1].isHelper = 1u;
  HitBuffer[idx].lanes[2].isHelper = 1u;
  HitBuffer[idx].lanes[3].isHelper = 1u;
  HitBuffer[idx].lanes[quadLaneIndex].isHelper = 0u;

  // and all threads are active
  HitBuffer[idx].lanes[0].active = 1u;
  HitBuffer[idx].lanes[1].active = 1u;
  HitBuffer[idx].lanes[2].active = 1u;
  HitBuffer[idx].lanes[3].active = 1u;

  // quadId is a single value that's unique for this quad and uniform across the quad. Degenerate
  // for the simple quad case
  uint quadId = 1000+quadSwizzleHelper(quadLaneIndex, quadLaneIndex, 0u);
  HitBuffer[idx].lanes[0].quadId = quadId;
  HitBuffer[idx].lanes[1].quadId = quadId;
  HitBuffer[idx].lanes[2].quadId = quadId;
  HitBuffer[idx].lanes[3].quadId = quadId;

  // per-quad lane identifier, degenerate for the simple quad case
  HitBuffer[idx].lanes[0].quadLane = 0;
  HitBuffer[idx].lanes[1].quadLane = 1;
  HitBuffer[idx].lanes[2].quadLane = 2;
  HitBuffer[idx].lanes[3].quadLane = 3;

  // coverage is handled with pixelPos as it can vary per-thread

  // start off with just copying all the inputs to all the quad. For float inputs or uints that may
  // vary across the quad we will quadSwizzle them
  CopyInputs(HitBuffer[idx].lanes[0].IN, IN);
  CopyInputs(HitBuffer[idx].lanes[1].IN, IN);
  CopyInputs(HitBuffer[idx].lanes[2].IN, IN);
  CopyInputs(HitBuffer[idx].lanes[3].IN, IN);
)";

  for(int q = 0; q < 4; q++)
  {
    fetcher.hlsl += StringFormat::Fmt(
        "  HitBuffer[idx].lanes[%i].pixelPos = "
        "quadSwizzleHelper(debug_pixelPos, quadLaneIndex, %i);\n",
        q, q);
    fetcher.hlsl += StringFormat::Fmt(
        "  HitBuffer[idx].lanes[%i].coverage = "
        "quadSwizzleHelper(coverage, quadLaneIndex, %i);\n",
        q, q);
  }

  for(size_t i = 0; i < floatInputs.size(); i++)
  {
    const rdcstr &name = floatInputs[i];
    for(int q = 0; q < 4; q++)
    {
      fetcher.hlsl += StringFormat::Fmt(
          "  HitBuffer[idx].lanes[%i].IN.%s = quadSwizzleHelper(IN.%s, quadLaneIndex, %i);\n", q,
          name.c_str(), name.c_str(), q);
    }
  }

  // if we're not rendering at MSAA, no need to fill the cache because evaluates will all return
  // the plain input anyway.
  if(cfg.outputSampleCount > 1)
  {
    if(dxbc->GetDXBCByteCode())
    {
      dxbc->GetDXBCByteCode()->CalculateEvalSampleCache(cfg, fetcher);
    }
    else
    {
      RDCWARN("TODO DXIL Pixel Shader Debugging support for MSAA Evaluate");
    }
  }

  if(!fetcher.evalSampleCacheData.empty())
  {
    fetcher.hlsl += StringFormat::Fmt("  uint stride = %zu;\n", fetcher.evalSampleCacheData.size());
    fetcher.hlsl += StringFormat::Fmt("  uint evalIdx = idx * stride * 4;\n");
    fetcher.hlsl += StringFormat::Fmt("  float4 evalCacheVal;\n");

    uint32_t evalIdx = 0;
    for(const SampleEvalCacheKey &key : fetcher.evalSampleCacheData)
    {
      uint32_t keyMask = 0;

      for(int32_t i = 0; i < key.numComponents; i++)
        keyMask |= (1 << (key.firstComponent + i));

      // find the name of the variable matching the operand, in the case of merged input variables.
      rdcstr name, swizzle = "xyzw";
      for(size_t i = 0; i < dxbc->GetReflection()->InputSig.size(); i++)
      {
        if(dxbc->GetReflection()->InputSig[i].regIndex == (uint32_t)key.inputRegisterIndex &&
           dxbc->GetReflection()->InputSig[i].systemValue == ShaderBuiltin::Undefined &&
           (dxbc->GetReflection()->InputSig[i].regChannelMask & keyMask) == keyMask)
        {
          name = inputVarNames[i];

          if(!name.empty())
            break;
        }
      }

      swizzle.resize(key.numComponents);

      if(name.empty())
      {
        RDCERR("Couldn't find matching input variable for v%d [%d:%d]", key.inputRegisterIndex,
               key.firstComponent, key.numComponents);
        fetcher.hlsl += StringFormat::Fmt("  EvalCacheBuffer[evalIdx+stride*0+%u] = 0;\n", evalIdx);
        fetcher.hlsl += StringFormat::Fmt("  EvalCacheBuffer[evalIdx+stride*1+%u] = 0;\n", evalIdx);
        fetcher.hlsl += StringFormat::Fmt("  EvalCacheBuffer[evalIdx+stride*2+%u] = 0;\n", evalIdx);
        fetcher.hlsl += StringFormat::Fmt("  EvalCacheBuffer[evalIdx+stride*3+%u] = 0;\n", evalIdx);
        evalIdx++;
        continue;
      }

      name = StringFormat::Fmt("IN.%s.%s", name.c_str(), swizzle.c_str());

      // we must write all components, so just swizzle the values - they'll be ignored later.
      rdcstr expandSwizzle = swizzle;
      while(expandSwizzle.size() < 4)
        expandSwizzle.push_back('x');

      if(key.sample >= 0)
      {
        fetcher.hlsl +=
            StringFormat::Fmt("  evalCacheVal = EvaluateAttributeAtSample(%s, %d).%s;\n",
                              name.c_str(), key.sample, expandSwizzle.c_str());
      }
      else
      {
        // we don't need to special-case EvaluateAttributeAtCentroid, since it's just a case with
        // 0,0
        fetcher.hlsl +=
            StringFormat::Fmt("  evalCacheVal = EvaluateAttributeSnapped(%s, int2(%d, %d)).%s;\n",
                              name.c_str(), key.offsetx, key.offsety, expandSwizzle.c_str());
      }

      fetcher.hlsl += StringFormat::Fmt(
          "  EvalCacheBuffer[evalIdx+stride*0+%u] = "
          "quadSwizzleHelper(evalCacheVal, quadLaneIndex, 0);\n",
          evalIdx);
      fetcher.hlsl += StringFormat::Fmt(
          "  EvalCacheBuffer[evalIdx+stride*1+%u] = "
          "quadSwizzleHelper(evalCacheVal, quadLaneIndex, 1);\n",
          evalIdx);
      fetcher.hlsl += StringFormat::Fmt(
          "  EvalCacheBuffer[evalIdx+stride*2+%u] = "
          "quadSwizzleHelper(evalCacheVal, quadLaneIndex, 2);\n",
          evalIdx);
      fetcher.hlsl += StringFormat::Fmt(
          "  EvalCacheBuffer[evalIdx+stride*3+%u] = "
          "quadSwizzleHelper(evalCacheVal, quadLaneIndex, 3);\n",
          evalIdx);

      evalIdx++;
    }
  }

  fetcher.hlsl += "\n}\n";
}

void CreateInputFetcher(const DXBC::DXBCContainer *dxbc, const DXBC::DXBCContainer *prevdxbc,
                        const InputFetcherConfig &cfg, InputFetcher &fetcher)
{
  if(cfg.fetchWorkgroup && dxbc->m_Type != DXBC::ShaderType::Compute)
  {
    RDCERR("Can only fetch workgroup inputs for Compute shaders");
    return;
  }

  bool usePrimitiveID = prevdxbc && ((prevdxbc->m_Type != DXBC::ShaderType::Geometry) &&
                                     (prevdxbc->m_Type != DXBC::ShaderType::Mesh));

  rdcarray<rdcstr> floatInputs;
  rdcarray<rdcstr> nonfloatInputs;
  rdcarray<rdcstr> inputVarNames;
  if(dxbc->m_Type == DXBC::ShaderType::Compute)
  {
    fetcher.hlsl += R"(
struct Inputs
{
};
void CopyInputs(out Inputs OUT, in Inputs IN) {}
)";
  }
  else
  {
    DXDebug::GatherInputDataForInitialValues(dxbc, fetcher, prevdxbc, floatInputs, nonfloatInputs,
                                             inputVarNames);
  }

  for(const InputElement &e : fetcher.inputs)
  {
    if(e.sysattribute == ShaderBuiltin::PrimitiveIndex)
    {
      usePrimitiveID = true;
      break;
    }
  }

  uint32_t waveSize = cfg.maxWaveSize;
  uint32_t reqWaveSize = dxbc->GetReflection()->WaveSize;
  if(reqWaveSize != 0)
  {
    if(reqWaveSize < waveSize)
      waveSize = reqWaveSize;
    else
      RDCERR("Invalid requested wave size %u vs device maximum of %u", reqWaveSize, cfg.maxWaveSize);
  }

  fetcher.numLanesPerHit = cfg.fetchWorkgroup ? cfg.groupSize : waveSize;

  fetcher.hlsl += StringFormat::Fmt(
      "#define STAGE_VS %u\n"
      "#define STAGE_PS %u\n"
      "#define STAGE_CS %u\n"
      "#define STAGE %u\n"
      "#define MAXHIT %u\n"
      "#define MAXWAVESIZE %u\n"
      "#define USEPRIM %u\n"
      "#define FETCH_WORKGROUP %u\n",
      DXBC::ShaderType::Vertex, DXBC::ShaderType::Pixel, DXBC::ShaderType::Compute, dxbc->m_Type,
      DXDebug::maxPixelHits, waveSize, usePrimitiveID ? 1 : 0, cfg.fetchWorkgroup);

  if(dxbc->m_Type == DXBC::ShaderType::Vertex)
  {
    fetcher.hlsl += StringFormat::Fmt(
        "#define DEST_VERT %u\n"
        "#define DEST_INST %u\n",
        cfg.vert, cfg.inst);
  }
  else if(dxbc->m_Type == DXBC::ShaderType::Pixel)
  {
    fetcher.hlsl += StringFormat::Fmt(
        "#define DESTX %u.5\n"
        "#define DESTY %u.5\n",
        cfg.x, cfg.y);
  }
  else if(dxbc->m_Type == DXBC::ShaderType::Compute)
  {
    fetcher.hlsl += StringFormat::Fmt(
        "#define DESTX %u\n"
        "#define DESTY %u\n"
        "#define DESTZ %u\n",
        cfg.threadid[0], cfg.threadid[1], cfg.threadid[2]);
    fetcher.hlsl += StringFormat::Fmt("#define NUMTHREADS %u,%u,%u\n",
                                      dxbc->GetReflection()->DispatchThreadsDimension[0],
                                      dxbc->GetReflection()->DispatchThreadsDimension[1],
                                      dxbc->GetReflection()->DispatchThreadsDimension[2]);
    fetcher.hlsl += StringFormat::Fmt(
        "#define GROUPX %u\n"
        "#define GROUPY %u\n"
        "#define GROUPZ %u\n",
        cfg.groupid[0], cfg.groupid[1], cfg.groupid[2]);
  }
  else
  {
    RDCERR("Unexpected type of shader");
  }

  if(cfg.uavspace == 0)
  {
    fetcher.hlsl += StringFormat::Fmt(
        "#define HITBUFFER u%u\n"
        "#define EVALCACHEBUFFER u%u\n"
        "#define LANEBUFFER u%u\n",
        cfg.uavslot, cfg.uavslot + 1, cfg.uavslot + 2);
  }
  else
  {
    fetcher.hlsl += StringFormat::Fmt(
        "#define HITBUFFER u%u, space%u\n"
        "#define EVALCACHEBUFFER u%u, space%u\n"
        "#define LANEBUFFER u%u, space%u\n",
        cfg.uavslot, cfg.uavspace, cfg.uavslot + 1, cfg.uavspace, cfg.uavslot + 2, cfg.uavspace);
  }

  fetcher.hlsl += "\n";

  if(waveSize == 4 && dxbc->m_Type == DXBC::ShaderType::Pixel)
    return CreateLegacyInputFetcher(dxbc, cfg, floatInputs, inputVarNames, fetcher);

  fetcher.hitBufferStride = sizeof(DXDebug::DebugHit);

  if(dxbc->m_Type == DXBC::ShaderType::Vertex)
    fetcher.laneDataBufferStride = sizeof(DXDebug::VSLaneData) + fetcher.laneDataBufferStride;
  else if(dxbc->m_Type == DXBC::ShaderType::Pixel)
    fetcher.laneDataBufferStride = sizeof(DXDebug::PSLaneData) + fetcher.laneDataBufferStride;
  else if(dxbc->m_Type == DXBC::ShaderType::Compute)
    fetcher.laneDataBufferStride = sizeof(DXDebug::CSLaneData) + fetcher.laneDataBufferStride;

  fetcher.hlsl += R"(
struct VSLaneData
{
#if STAGE == STAGE_VS
  uint inst;
  uint vert;
  uint2 pad;
#endif
};

struct PSLaneData
{
#if STAGE == STAGE_PS
  float4 pixelPos;

  uint isHelper;
  uint quadId;
  uint quadLane;
  uint coverage;

  uint sample;
  uint primitive;
  uint isFrontFace;
  uint pad2;
#endif
};

struct CSLaneData
{
#if STAGE == STAGE_CS
  uint3 threadid;
  uint activeSubgroup;
#endif
};

struct LaneData
{
  uint laneIndex;
  uint active;
  uint2 pad2;

  VSLaneData vs;
  PSLaneData ps;
  CSLaneData cs;

  Inputs IN;
};

struct DebugHit
{
  // only used in the first instance
  uint numHits;
  float3 pos_depth; // xy position and depth

  float derivValid;
  uint quadLaneIndex;
  uint laneIndex;
  uint subgroupSize;

  uint sample;
  uint primitive;
  uint2 pad;

  uint4 globalBallot;
  uint4 helperBallot;
};

RWStructuredBuffer<DebugHit> HitBuffer : register(HITBUFFER);
RWStructuredBuffer<LaneData> LaneBuffer : register(LANEBUFFER);

#if STAGE == STAGE_CS
[numthreads(NUMTHREADS)]
#endif

void ExtractInputs(Inputs IN

#if STAGE == STAGE_VS

  #ifndef VERT_VAR
                     , uint vert : SV_VertexID
  #endif
  #ifndef INST_VAR
                     , uint inst : SV_InstanceID
  #endif

#elif STAGE == STAGE_PS


  #ifndef POSITION_VAR
                     , float4 debug_pixelPos : SV_Position
  #endif
  #if USEPRIM && !defined(PRIM_VAR)
                     , uint primitive : SV_PrimitiveID
  #endif
                     // sample, coverage and isFrontFace are deliberately omittted from the
                     // IN struct for SV_ ordering reasons
                     , uint sample : SV_SampleIndex
                     , uint coverage : SV_Coverage
                     , bool isFrontFace : SV_IsFrontFace

#elif STAGE == STAGE_CS

                     , uint3 threadid : SV_GroupThreadID
                     , uint3 dtid : SV_DispatchThreadID
                     , uint3 groupid : SV_GroupID
                     , uint groupindex : SV_GroupIndex

#endif
)
{
#ifdef VERT_VAR
  uint vert = IN.VERT_VAR;
#endif

#ifdef INST_VAR
  uint inst = IN.INST_VAR;
#endif

#ifdef POSITION_VAR
  float4 debug_pixelPos = IN.POSITION_VAR;
#endif

#if USEPRIM && defined(PRIM_VAR)
  uint primitive = IN.PRIM_VAR;
#elif !USEPRIM && STAGE == STAGE_PS
  uint primitive = 0;
#endif

#if STAGE != STAGE_PS
  float4 debug_pixelPos = 0;
  uint primitive = 0;
  uint sample = 0;
  uint isFrontFace = 0;
#endif

  VSLaneData vs = (VSLaneData)0;
  PSLaneData ps = (PSLaneData)0;
  CSLaneData cs = (CSLaneData)0;

  uint isHelper = 0;
  uint quadLaneIndex = 0;
  uint quadId = 0;
  uint4 globalBallot = WaveActiveBallot(true);
  uint laneIndex = WaveGetLaneIndex();
  uint4 helperBallot = 0;
  float derivValid = 1.0f;

#if STAGE == STAGE_VS
  bool candidateThread = (vert == DEST_VERT && inst == DEST_INST);
  bool fetchWorkgroup = false;

  vs.vert = vert;
  vs.inst = inst;
#elif STAGE == STAGE_PS
  bool candidateThread = (abs(debug_pixelPos.x - DESTX) < 0.5f && abs(debug_pixelPos.y - DESTY) < 0.5f);
  bool fetchWorkgroup = false;

  quadLaneIndex = (2u * (uint(debug_pixelPos.y) & 1u)) + (uint(debug_pixelPos.x) & 1u);
  derivValid = ddx(debug_pixelPos.x);
  isHelper = IsHelperLane() ? 1 : 0;

  helperBallot = WaveActiveBallot(isHelper != 0);

  // quadId is a single value that's unique for this quad and uniform across the quad. Degenerate
  // for the simple quad case
  quadId = 1000+QuadReadLaneAt(laneIndex, 0u);

  LaneData helper0data = (LaneData)0;
  LaneData helper1data = (LaneData)0;
  LaneData helper2data = (LaneData)0;
  LaneData helper3data = (LaneData)0;

)";

  for(uint32_t q = 0; q < 4; q++)
  {
    fetcher.hlsl += StringFormat::Fmt("  // quad %u\n", q);
    fetcher.hlsl += "  {\n";
    fetcher.hlsl += StringFormat::Fmt(
        "    helper%udata.ps.pixelPos = QuadReadLaneAt(debug_pixelPos, %uu);\n", q, q);
    fetcher.hlsl +=
        StringFormat::Fmt("    helper%udata.ps.isHelper = QuadReadLaneAt(isHelper, %uu);\n", q, q);
    fetcher.hlsl += StringFormat::Fmt("    helper%udata.ps.quadId = quadId;\n", q);
    fetcher.hlsl += StringFormat::Fmt("    helper%udata.ps.quadLane = %uu;\n", q, q);
    fetcher.hlsl +=
        StringFormat::Fmt("    helper%udata.ps.coverage = QuadReadLaneAt(coverage, %uu);\n", q, q);
    fetcher.hlsl +=
        StringFormat::Fmt("    helper%udata.laneIndex = QuadReadLaneAt(laneIndex, %uu);\n", q, q);
    fetcher.hlsl += StringFormat::Fmt("    helper%udata.active = 1;\n", q, q);
    for(size_t i = 0; i < floatInputs.size(); i++)
    {
      const rdcstr &name = floatInputs[i];
      fetcher.hlsl += StringFormat::Fmt("    helper%udata.IN.%s = QuadReadLaneAt(IN.%s, %u);\n", q,
                                        name.c_str(), name.c_str(), q);
    }
    for(size_t i = 0; i < nonfloatInputs.size(); i++)
    {
      const rdcstr &name = nonfloatInputs[i];
      fetcher.hlsl += StringFormat::Fmt("    helper%udata.IN.%s = QuadReadLaneAt(IN.%s, %u);\n", q,
                                        name.c_str(), name.c_str(), q);
    }
    fetcher.hlsl += "  }\n\n";
  }

  fetcher.hlsl += R"(
  ps.pixelPos = debug_pixelPos;

  ps.isHelper = isHelper;
  ps.quadId = quadId;
  ps.quadLane = quadLaneIndex;
  ps.coverage = coverage;

  ps.sample = sample;
  ps.primitive = primitive;
  ps.isFrontFace = isFrontFace;
#elif STAGE == STAGE_CS
  bool candidateThread = (dtid.x == DESTX && dtid.y == DESTY && dtid.z == DESTZ);
  cs.threadid = threadid;
#endif

#if FETCH_WORKGROUP

#if STAGE != STAGE_CS
#error "Only compute shader fetches whole workgroup"
#endif // #if STAGE != STAGE_CS

  bool candidateGroup = (groupid.x == GROUPX && groupid.y == GROUPY && groupid.z == GROUPZ);
  if (!candidateGroup)
    return;

  bool activeSubgroup = WaveActiveAnyTrue(candidateThread);
  cs.activeSubgroup = activeSubgroup ? 1 : 0;
  if(candidateThread)
  {
    HitBuffer[0].numHits = 1;
    HitBuffer[0].pos_depth = debug_pixelPos.xyz;
    HitBuffer[0].derivValid = derivValid;
    HitBuffer[0].primitive = primitive;
    HitBuffer[0].sample = sample;
    HitBuffer[0].laneIndex = laneIndex;
    HitBuffer[0].quadLaneIndex = quadLaneIndex;
    HitBuffer[0].subgroupSize = WaveGetLaneCount();
    HitBuffer[0].globalBallot = globalBallot;
    HitBuffer[0].helperBallot = helperBallot;
  }
  // Use SV_GroupIndex as the output index
  LaneBuffer[groupindex].laneIndex = laneIndex;
  LaneBuffer[groupindex].active = 1;
  LaneBuffer[groupindex].cs = cs;
#else // #if FETCH_WORKGROUP
  bool activeSubgroup = WaveActiveAnyTrue(candidateThread);
#if STAGE == STAGE_CS
  cs.activeSubgroup = activeSubgroup ? 1 : 0;
#endif
  if (activeSubgroup)
  {
    if(isHelper == 0)
    {
      uint idx = MAXHIT;
      if(WaveIsFirstLane())
      {
        InterlockedAdd(HitBuffer[0].numHits, 1, idx);
      }
      idx = WaveReadLaneFirst(idx);
      if(idx < MAXHIT)
      {
        if(candidateThread)
        {
          HitBuffer[idx].pos_depth = debug_pixelPos.xyz;
          HitBuffer[idx].derivValid = derivValid;
          HitBuffer[idx].primitive = primitive;
          HitBuffer[idx].sample = sample;
          HitBuffer[idx].laneIndex = laneIndex;
          HitBuffer[idx].quadLaneIndex = quadLaneIndex;
          HitBuffer[idx].subgroupSize = WaveGetLaneCount();
          HitBuffer[idx].globalBallot = globalBallot;
          HitBuffer[idx].helperBallot = helperBallot;
        }

#if STAGE == STAGE_PS
        if(helper0data.ps.isHelper)
          LaneBuffer[idx*MAXWAVESIZE+helper0data.laneIndex] = helper0data;

        if(helper1data.ps.isHelper)
          LaneBuffer[idx*MAXWAVESIZE+helper1data.laneIndex] = helper1data;

        if(helper2data.ps.isHelper)
          LaneBuffer[idx*MAXWAVESIZE+helper2data.laneIndex] = helper2data;

        if(helper3data.ps.isHelper)
          LaneBuffer[idx*MAXWAVESIZE+helper3data.laneIndex] = helper3data;
#endif

        LaneBuffer[idx*MAXWAVESIZE+laneIndex].laneIndex = laneIndex;
        LaneBuffer[idx*MAXWAVESIZE+laneIndex].active = 1;
        LaneBuffer[idx*MAXWAVESIZE+laneIndex].vs = vs;
        LaneBuffer[idx*MAXWAVESIZE+laneIndex].ps = ps;
        LaneBuffer[idx*MAXWAVESIZE+laneIndex].cs = cs;
        CopyInputs(LaneBuffer[idx*MAXWAVESIZE+laneIndex].IN, IN);
      }
    }
  }
#endif // #if FETCH_WORKGROUP
}
)";
}

// "NaN has special handling. If one source operand is NaN, then the other source operand is
// returned. If both are NaN, any NaN representation is returned."

float dxbc_min(float a, float b)
{
  if(RDCISNAN(a))
    return b;

  if(RDCISNAN(b))
    return a;

  return a < b ? a : b;
}

double dxbc_min(double a, double b)
{
  if(RDCISNAN(a))
    return b;

  if(RDCISNAN(b))
    return a;

  return a < b ? a : b;
}

float dxbc_max(float a, float b)
{
  if(RDCISNAN(a))
    return b;

  if(RDCISNAN(b))
    return a;

  return a >= b ? a : b;
}

double dxbc_max(double a, double b)
{
  if(RDCISNAN(a))
    return b;

  if(RDCISNAN(b))
    return a;

  return a >= b ? a : b;
}

float round_ne(float x)
{
  if(!RDCISFINITE(x))
    return x;

  float rem = remainderf(x, 1.0f);

  return x - rem;
}

double round_ne(double x)
{
  if(!RDCISFINITE(x))
    return x;

  double rem = remainder(x, 1.0);

  return x - rem;
}

float flush_denorm(const float f)
{
  uint32_t x;
  memcpy(&x, &f, sizeof(f));

  // if any bit is set in the exponent, it's not denormal
  if(x & 0x7F800000)
    return f;

  // keep only the sign bit
  x &= 0x80000000;
  float ret;
  memcpy(&ret, &x, sizeof(ret));
  return ret;
}

uint32_t BitwiseReverseLSB16(uint32_t x)
{
  // Reverse the bits in x, then discard the lower half
  // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
  x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1);
  x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2);
  x = ((x >> 4) & 0x0F0F0F0F) | ((x & 0x0F0F0F0F) << 4);
  x = ((x >> 8) & 0x00FF00FF) | ((x & 0x00FF00FF) << 8);
  return x << 16;
}

uint32_t PopCount(uint32_t x)
{
  // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  x = x - ((x >> 1) & 0x55555555);
  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  return (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
}

void get_sample_position(uint32_t sampleIndex, uint32_t sampleCount, float *position)
{
  // assume standard sample pattern - this might not hold in all cases
  // http://msdn.microsoft.com/en-us/library/windows/desktop/ff476218(v=vs.85).aspx

  if(sampleIndex >= sampleCount)
  {
    // Per HLSL docs, if sampleIndex is out of bounds a zero vector is returned
    RDCWARN("sample index %u is out of bounds on resource bound to sample_pos (%u samples)",
            sampleIndex, sampleCount);
    position[0] = 0.0f;
    position[1] = 0.0f;
    position[2] = 0.0f;
    position[3] = 0.0f;
  }
  else
  {
    const float *sample_pattern = NULL;

// co-ordinates are given as (i,j) in 16ths of a pixel
#define _SMP(c) ((c) / 16.0f)

    if(sampleCount == 1)
    {
      sample_pattern = NULL;
    }
    else if(sampleCount == 2)
    {
      static const float pattern_2x[] = {
          _SMP(4.0f),
          _SMP(4.0f),
          _SMP(-4.0f),
          _SMP(-4.0f),
      };

      sample_pattern = &pattern_2x[0];
    }
    else if(sampleCount == 4)
    {
      static const float pattern_4x[] = {
          _SMP(-2.0f), _SMP(-6.0f), _SMP(6.0f), _SMP(-2.0f),
          _SMP(-6.0f), _SMP(2.0f),  _SMP(2.0f), _SMP(6.0f),
      };

      sample_pattern = &pattern_4x[0];
    }
    else if(sampleCount == 8)
    {
      static const float pattern_8x[] = {
          _SMP(1.0f),  _SMP(-3.0f), _SMP(-1.0f), _SMP(3.0f),  _SMP(5.0f),  _SMP(1.0f),
          _SMP(-3.0f), _SMP(-5.0f), _SMP(-5.0f), _SMP(5.0f),  _SMP(-7.0f), _SMP(-1.0f),
          _SMP(3.0f),  _SMP(7.0f),  _SMP(7.0f),  _SMP(-7.0f),
      };

      sample_pattern = &pattern_8x[0];
    }
    else if(sampleCount == 16)
    {
      static const float pattern_16x[] = {
          _SMP(1.0f),  _SMP(1.0f),  _SMP(-1.0f), _SMP(-3.0f), _SMP(-3.0f), _SMP(2.0f),  _SMP(4.0f),
          _SMP(-1.0f), _SMP(-5.0f), _SMP(-2.0f), _SMP(2.0f),  _SMP(5.0f),  _SMP(5.0f),  _SMP(3.0f),
          _SMP(3.0f),  _SMP(-5.0f), _SMP(-2.0f), _SMP(6.0f),  _SMP(0.0f),  _SMP(-7.0f), _SMP(-4.0f),
          _SMP(-6.0f), _SMP(-6.0f), _SMP(4.0f),  _SMP(-8.0f), _SMP(0.0f),  _SMP(7.0f),  _SMP(-4.0f),
          _SMP(6.0f),  _SMP(7.0f),  _SMP(-7.0f), _SMP(-8.0f),
      };

      sample_pattern = &pattern_16x[0];
    }
    else    // unsupported sample count
    {
      RDCERR("Unsupported sample count on resource for sample_pos: %u", sampleCount);
      sample_pattern = NULL;
    }

    if(sample_pattern == NULL)
    {
      position[0] = 0.0f;
      position[1] = 0.0f;
    }
    else
    {
      position[0] = sample_pattern[sampleIndex * 2 + 0];
      position[1] = sample_pattern[sampleIndex * 2 + 1];
    }
  }
#undef _SMP
}

};    // namespace DXDebug

#if ENABLED(ENABLE_UNIT_TESTS)

#include <limits>
#include "catch/catch.hpp"

using namespace DXDebug;

TEST_CASE("DXBC DXIL shader debugging helpers", "[program]")
{
  const float posinf = std::numeric_limits<float>::infinity();
  const float neginf = -std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float a = 1.0f;
  const float b = 2.0f;

  SECTION("dxbc_min")
  {
    CHECK(dxbc_min(neginf, neginf) == neginf);
    CHECK(dxbc_min(neginf, a) == neginf);
    CHECK(dxbc_min(neginf, posinf) == neginf);
    CHECK(dxbc_min(neginf, nan) == neginf);
    CHECK(dxbc_min(a, neginf) == neginf);
    CHECK(dxbc_min(a, b) == a);
    CHECK(dxbc_min(a, posinf) == a);
    CHECK(dxbc_min(a, nan) == a);
    CHECK(dxbc_min(posinf, neginf) == neginf);
    CHECK(dxbc_min(posinf, a) == a);
    CHECK(dxbc_min(posinf, posinf) == posinf);
    CHECK(dxbc_min(posinf, nan) == posinf);
    CHECK(dxbc_min(nan, neginf) == neginf);
    CHECK(dxbc_min(nan, a) == a);
    CHECK(dxbc_min(nan, posinf) == posinf);
    CHECK(RDCISNAN(dxbc_min(nan, nan)));
  };

  SECTION("dxbc_max")
  {
    CHECK(dxbc_max(neginf, neginf) == neginf);
    CHECK(dxbc_max(neginf, a) == a);
    CHECK(dxbc_max(neginf, posinf) == posinf);
    CHECK(dxbc_max(neginf, nan) == neginf);
    CHECK(dxbc_max(a, neginf) == a);
    CHECK(dxbc_max(a, b) == b);
    CHECK(dxbc_max(a, posinf) == posinf);
    CHECK(dxbc_max(a, nan) == a);
    CHECK(dxbc_max(posinf, neginf) == posinf);
    CHECK(dxbc_max(posinf, a) == posinf);
    CHECK(dxbc_max(posinf, posinf) == posinf);
    CHECK(dxbc_max(posinf, nan) == posinf);
    CHECK(dxbc_max(nan, neginf) == neginf);
    CHECK(dxbc_max(nan, a) == a);
    CHECK(dxbc_max(nan, posinf) == posinf);
    CHECK(RDCISNAN(dxbc_max(nan, nan)));
  };

  SECTION("test denorm flushing")
  {
    float foo = 3.141f;

    // check normal values
    CHECK(flush_denorm(0.0f) == 0.0f);
    CHECK(flush_denorm(foo) == foo);
    CHECK(flush_denorm(-foo) == -foo);

    // check NaN/inf values
    CHECK(RDCISNAN(flush_denorm(nan)));
    CHECK(flush_denorm(neginf) == neginf);
    CHECK(flush_denorm(posinf) == posinf);

    // check zero sign bit - bit more complex
    uint32_t negzero = 0x80000000U;
    float negzerof;
    memcpy(&negzerof, &negzero, sizeof(negzero));

    float flushed = flush_denorm(negzerof);
    CHECK(memcmp(&flushed, &negzerof, sizeof(negzerof)) == 0);

    // check that denormal values are flushed, preserving sign
    foo = 1.12104e-44f;
    CHECK(flush_denorm(foo) != foo);
    CHECK(flush_denorm(-foo) != -foo);
    CHECK(flush_denorm(foo) == 0.0f);
    flushed = flush_denorm(-foo);
    CHECK(memcmp(&flushed, &negzerof, sizeof(negzerof)) == 0);
  };
};

#endif    // ENABLED(ENABLE_UNIT_TESTS)