renderdoc/renderdoc/driver/d3d11/d3d11_shaderdebug.cpp

/******************************************************************************
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2018 Baldur Karlsson
 * Copyright (c) 2014 Crytek
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 ******************************************************************************/

#include "data/resource.h"
#include "driver/d3d11/d3d11_renderstate.h"
#include "driver/d3d11/d3d11_resources.h"
#include "driver/shaders/dxbc/dxbc_debug.h"
#include "maths/formatpacking.h"
#include "maths/vec.h"
#include "strings/string_utils.h"
#include "d3d11_context.h"
#include "d3d11_debug.h"
#include "d3d11_manager.h"
#include "d3d11_shader_cache.h"

// over this number of cycles and things get problematic
#define SHADER_DEBUG_WARN_THRESHOLD 100000

bool PromptDebugTimeout(DXBC::ProgramType prog, uint32_t cycleCounter)
{
  string msg = StringFormat::Fmt(
      "RenderDoc's shader debugging has been running for over %u cycles, which indicates either a "
      "very long-running loop, or possibly an infinite loop. Continuing could lead to extreme "
      "memory allocations, slow UI or even crashes. Would you like to abort debugging to see what "
      "has run so far?\n\n"
      "Hit yes to abort debugging. Note that loading the resulting trace could take several "
      "minutes.",
      cycleCounter);

  int ret = MessageBoxA(NULL, msg.c_str(), "Shader debugging timeout", MB_YESNO | MB_ICONWARNING);

  if(ret == IDYES)
    return true;

  return false;
}

ShaderDebug::State D3D11DebugManager::CreateShaderDebugState(ShaderDebugTrace &trace, int quadIdx,
                                                             DXBC::DXBCFile *dxbc, bytebuf *cbufData)
{
  using namespace DXBC;
  using namespace ShaderDebug;

  State initialState = State(quadIdx, &trace, dxbc, m_pDevice);

  // use pixel shader here to get inputs

  int32_t maxReg = -1;
  for(size_t i = 0; i < dxbc->m_InputSig.size(); i++)
    maxReg = RDCMAX(maxReg, (int32_t)dxbc->m_InputSig[i].regIndex);

  bool inputCoverage = false;

  for(size_t i = 0; i < dxbc->GetNumDeclarations(); i++)
  {
    const DXBC::ASMDecl &decl = dxbc->GetDeclaration(i);

    if(decl.declaration == OPCODE_DCL_INPUT && decl.operand.type == TYPE_INPUT_COVERAGE_MASK)
    {
      inputCoverage = true;
      break;
    }
  }

  if(maxReg >= 0 || inputCoverage)
  {
    trace.inputs.resize(maxReg + 1 + (inputCoverage ? 1 : 0));
    for(size_t i = 0; i < dxbc->m_InputSig.size(); i++)
    {
      SigParameter &sig = dxbc->m_InputSig[i];

      ShaderVariable v;

      v.name = StringFormat::Fmt("v%d (%s)", sig.regIndex, sig.semanticIdxName.c_str());
      v.rows = 1;
      v.columns = sig.regChannelMask & 0x8 ? 4 : sig.regChannelMask & 0x4
                                                     ? 3
                                                     : sig.regChannelMask & 0x2
                                                           ? 2
                                                           : sig.regChannelMask & 0x1 ? 1 : 0;

      if(sig.compType == CompType::UInt)
        v.type = VarType::UInt;
      else if(sig.compType == CompType::SInt)
        v.type = VarType::Int;

      if(trace.inputs[sig.regIndex].columns == 0)
        trace.inputs[sig.regIndex] = v;
      else
        trace.inputs[sig.regIndex].columns = RDCMAX(trace.inputs[sig.regIndex].columns, v.columns);
    }

    if(inputCoverage)
    {
      trace.inputs[maxReg + 1] = ShaderVariable("vCoverage", 0U, 0U, 0U, 0U);
      trace.inputs[maxReg + 1].columns = 1;
    }
  }

  uint32_t specialOutputs = 0;
  maxReg = -1;
  for(size_t i = 0; i < dxbc->m_OutputSig.size(); i++)
  {
    if(dxbc->m_OutputSig[i].regIndex == ~0U)
      specialOutputs++;
    else
      maxReg = RDCMAX(maxReg, (int32_t)dxbc->m_OutputSig[i].regIndex);
  }

  if(maxReg >= 0 || specialOutputs > 0)
  {
    initialState.outputs.resize(maxReg + 1 + specialOutputs);
    for(size_t i = 0; i < dxbc->m_OutputSig.size(); i++)
    {
      SigParameter &sig = dxbc->m_OutputSig[i];

      if(sig.regIndex == ~0U)
        continue;

      ShaderVariable v;

      v.name = StringFormat::Fmt("o%d (%s)", sig.regIndex, sig.semanticIdxName.c_str());
      v.rows = 1;
      v.columns = sig.regChannelMask & 0x8 ? 4 : sig.regChannelMask & 0x4
                                                     ? 3
                                                     : sig.regChannelMask & 0x2
                                                           ? 2
                                                           : sig.regChannelMask & 0x1 ? 1 : 0;

      if(initialState.outputs[sig.regIndex].columns == 0)
        initialState.outputs[sig.regIndex] = v;
      else
        initialState.outputs[sig.regIndex].columns =
            RDCMAX(initialState.outputs[sig.regIndex].columns, v.columns);
    }

    int32_t outIdx = maxReg + 1;

    for(size_t i = 0; i < dxbc->m_OutputSig.size(); i++)
    {
      SigParameter &sig = dxbc->m_OutputSig[i];

      if(sig.regIndex != ~0U)
        continue;

      ShaderVariable v;

      if(sig.systemValue == ShaderBuiltin::OutputControlPointIndex)
        v.name = "vOutputControlPointID";
      else if(sig.systemValue == ShaderBuiltin::DepthOutput)
        v.name = "oDepth";
      else if(sig.systemValue == ShaderBuiltin::DepthOutputLessEqual)
        v.name = "oDepthLessEqual";
      else if(sig.systemValue == ShaderBuiltin::DepthOutputGreaterEqual)
        v.name = "oDepthGreaterEqual";
      else if(sig.systemValue == ShaderBuiltin::MSAACoverage)
        v.name = "oMask";
      // if(sig.systemValue == TYPE_OUTPUT_CONTROL_POINT)							str = "oOutputControlPoint";
      else
      {
        RDCERR("Unhandled output: %s (%d)", sig.semanticName.c_str(), sig.systemValue);
        continue;
      }

      v.rows = 1;
      v.columns = sig.regChannelMask & 0x8 ? 4 : sig.regChannelMask & 0x4
                                                     ? 3
                                                     : sig.regChannelMask & 0x2
                                                           ? 2
                                                           : sig.regChannelMask & 0x1 ? 1 : 0;

      initialState.outputs[outIdx++] = v;
    }
  }

  trace.constantBlocks.resize(dxbc->m_CBuffers.size());
  for(size_t i = 0; i < dxbc->m_CBuffers.size(); i++)
  {
    if(dxbc->m_CBuffers[i].descriptor.type != CBuffer::Descriptor::TYPE_CBUFFER)
      continue;

    std::vector<ShaderVariable> vars;

    size_t dummy = 0;
    FillCBufferVariables("", dummy, dxbc->m_CBuffers[i].variables, vars, true,
                         cbufData[dxbc->m_CBuffers[i].reg]);

    trace.constantBlocks[i].members = vars;

    for(size_t c = 0; c < trace.constantBlocks[i].members.size(); c++)
      trace.constantBlocks[i].members[c].name =
          StringFormat::Fmt("cb%u[%u] (%s)", dxbc->m_CBuffers[i].reg, (uint32_t)c,
                            trace.constantBlocks[i].members[c].name.c_str());
  }

  initialState.Init();

  return initialState;
}

void D3D11DebugManager::CreateShaderGlobalState(ShaderDebug::GlobalState &global,
                                                DXBC::DXBCFile *dxbc, uint32_t UAVStartSlot,
                                                ID3D11UnorderedAccessView **UAVs,
                                                ID3D11ShaderResourceView **SRVs)
{
  for(int i = 0; UAVs != NULL && i + UAVStartSlot < D3D11_1_UAV_SLOT_COUNT; i++)
  {
    int dsti = i + UAVStartSlot;
    if(UAVs[i])
    {
      ID3D11Resource *res = NULL;
      UAVs[i]->GetResource(&res);

      global.uavs[dsti].hiddenCounter = GetStructCount(UAVs[i]);

      D3D11_UNORDERED_ACCESS_VIEW_DESC udesc;
      UAVs[i]->GetDesc(&udesc);

      DXGI_FORMAT format = DXGI_FORMAT_UNKNOWN;

      if(udesc.Format != DXGI_FORMAT_UNKNOWN)
        format = udesc.Format;

      if(format == DXGI_FORMAT_UNKNOWN)
      {
        if(WrappedID3D11Texture1D::IsAlloc(res))
        {
          D3D11_TEXTURE1D_DESC desc;
          ((WrappedID3D11Texture1D *)res)->GetDesc(&desc);
          format = desc.Format;
        }
        else if(WrappedID3D11Texture2D1::IsAlloc(res))
        {
          D3D11_TEXTURE2D_DESC desc;
          ((WrappedID3D11Texture2D1 *)res)->GetDesc(&desc);
          format = desc.Format;
        }
        else if(WrappedID3D11Texture3D1::IsAlloc(res))
        {
          D3D11_TEXTURE3D_DESC desc;
          ((WrappedID3D11Texture3D1 *)res)->GetDesc(&desc);
          format = desc.Format;
        }
      }

      if(format != DXGI_FORMAT_UNKNOWN)
      {
        ResourceFormat fmt = MakeResourceFormat(GetTypedFormat(udesc.Format));

        global.uavs[dsti].format.byteWidth = fmt.compByteWidth;
        global.uavs[dsti].format.numComps = fmt.compCount;
        global.uavs[dsti].format.fmt = fmt.compType;

        if(udesc.Format == DXGI_FORMAT_R11G11B10_FLOAT)
          global.uavs[dsti].format.byteWidth = 11;
        if(udesc.Format == DXGI_FORMAT_R10G10B10A2_UINT ||
           udesc.Format == DXGI_FORMAT_R10G10B10A2_UNORM)
          global.uavs[dsti].format.byteWidth = 10;
      }

      if(udesc.ViewDimension == D3D11_UAV_DIMENSION_BUFFER)
      {
        global.uavs[dsti].firstElement = udesc.Buffer.FirstElement;
        global.uavs[dsti].numElements = udesc.Buffer.NumElements;
      }

      if(res)
      {
        if(WrappedID3D11Buffer::IsAlloc(res))
        {
          GetBufferData((ID3D11Buffer *)res, 0, 0, global.uavs[dsti].data);
        }
        else
        {
          global.uavs[dsti].tex = true;

          uint32_t &rowPitch = global.uavs[dsti].rowPitch;
          uint32_t &depthPitch = global.uavs[dsti].depthPitch;

          bytebuf &data = global.uavs[dsti].data;

          if(udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE1D ||
             udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE1DARRAY)
          {
            D3D11_TEXTURE1D_DESC desc;
            ((WrappedID3D11Texture1D *)res)->GetDesc(&desc);

            desc.MiscFlags = 0;
            desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
            desc.BindFlags = 0;
            desc.Usage = D3D11_USAGE_STAGING;

            ID3D11Texture1D *stagingTex = NULL;
            m_pDevice->CreateTexture1D(&desc, NULL, &stagingTex);

            m_pImmediateContext->CopyResource(stagingTex, res);

            D3D11_MAPPED_SUBRESOURCE mapped;
            m_pImmediateContext->Map(stagingTex, udesc.Texture1D.MipSlice, D3D11_MAP_READ, 0,
                                     &mapped);

            rowPitch = 0;
            depthPitch = 0;
            size_t datasize = GetByteSize(desc.Width, 1, 1, desc.Format, udesc.Texture1D.MipSlice);

            uint32_t numSlices = 1;

            byte *srcdata = (byte *)mapped.pData;
            if(udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE1DARRAY)
            {
              rowPitch = mapped.RowPitch;
              srcdata += udesc.Texture1DArray.FirstArraySlice * rowPitch;
              numSlices = udesc.Texture1DArray.ArraySize;
              datasize = numSlices * rowPitch;
            }

            data.resize(datasize);

            // copy with all padding etc intact
            memcpy(&data[0], srcdata, datasize);

            m_pImmediateContext->Unmap(stagingTex, udesc.Texture1D.MipSlice);

            SAFE_RELEASE(stagingTex);
          }
          else if(udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE2D ||
                  udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE2DARRAY)
          {
            D3D11_TEXTURE2D_DESC desc;
            ((WrappedID3D11Texture2D1 *)res)->GetDesc(&desc);

            desc.MiscFlags = 0;
            desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
            desc.BindFlags = 0;
            desc.Usage = D3D11_USAGE_STAGING;

            ID3D11Texture2D *stagingTex = NULL;
            m_pDevice->CreateTexture2D(&desc, NULL, &stagingTex);

            m_pImmediateContext->CopyResource(stagingTex, res);

            // MipSlice in union is shared between Texture2D and Texture2DArray unions, so safe to
            // use either
            D3D11_MAPPED_SUBRESOURCE mapped;
            m_pImmediateContext->Map(stagingTex, udesc.Texture2D.MipSlice, D3D11_MAP_READ, 0,
                                     &mapped);

            rowPitch = mapped.RowPitch;
            depthPitch = 0;
            size_t datasize = rowPitch * RDCMAX(1U, desc.Height >> udesc.Texture2D.MipSlice);

            uint32_t numSlices = 1;

            byte *srcdata = (byte *)mapped.pData;
            if(udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE2DARRAY)
            {
              depthPitch = mapped.DepthPitch;
              srcdata += udesc.Texture2DArray.FirstArraySlice * depthPitch;
              numSlices = udesc.Texture2DArray.ArraySize;
              datasize = numSlices * depthPitch;
            }

            // copy with all padding etc intact
            data.resize(datasize);

            memcpy(&data[0], srcdata, datasize);

            m_pImmediateContext->Unmap(stagingTex, udesc.Texture2D.MipSlice);

            SAFE_RELEASE(stagingTex);
          }
          else if(udesc.ViewDimension == D3D11_UAV_DIMENSION_TEXTURE3D)
          {
            D3D11_TEXTURE3D_DESC desc;
            ((WrappedID3D11Texture3D1 *)res)->GetDesc(&desc);

            desc.MiscFlags = 0;
            desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
            desc.BindFlags = 0;
            desc.Usage = D3D11_USAGE_STAGING;

            ID3D11Texture3D *stagingTex = NULL;
            m_pDevice->CreateTexture3D(&desc, NULL, &stagingTex);

            m_pImmediateContext->CopyResource(stagingTex, res);

            // MipSlice in union is shared between Texture2D and Texture2DArray unions, so safe to
            // use either
            D3D11_MAPPED_SUBRESOURCE mapped;
            m_pImmediateContext->Map(stagingTex, udesc.Texture3D.MipSlice, D3D11_MAP_READ, 0,
                                     &mapped);

            rowPitch = mapped.RowPitch;
            depthPitch = mapped.DepthPitch;

            byte *srcdata = (byte *)mapped.pData;
            srcdata += udesc.Texture3D.FirstWSlice * mapped.DepthPitch;
            uint32_t numSlices = udesc.Texture3D.WSize;
            size_t datasize = depthPitch * numSlices;

            data.resize(datasize);

            // copy with all padding etc intact
            memcpy(&data[0], srcdata, datasize);

            m_pImmediateContext->Unmap(stagingTex, udesc.Texture3D.MipSlice);

            SAFE_RELEASE(stagingTex);
          }
        }
      }

      SAFE_RELEASE(res);
    }
  }

  for(int i = 0; SRVs != NULL && i < D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT; i++)
  {
    if(SRVs[i])
    {
      ID3D11Resource *res = NULL;
      SRVs[i]->GetResource(&res);

      D3D11_SHADER_RESOURCE_VIEW_DESC sdesc;
      SRVs[i]->GetDesc(&sdesc);

      if(sdesc.Format != DXGI_FORMAT_UNKNOWN)
      {
        ResourceFormat fmt = MakeResourceFormat(sdesc.Format);

        global.srvs[i].format.byteWidth = fmt.compByteWidth;
        global.srvs[i].format.numComps = fmt.compCount;
        global.srvs[i].format.fmt = fmt.compType;

        if(sdesc.Format == DXGI_FORMAT_R11G11B10_FLOAT)
          global.srvs[i].format.byteWidth = 11;
        if(sdesc.Format == DXGI_FORMAT_R10G10B10A2_UINT ||
           sdesc.Format == DXGI_FORMAT_R10G10B10A2_UNORM)
          global.srvs[i].format.byteWidth = 10;
      }
      else
      {
        D3D11_RESOURCE_DIMENSION dim;
        res->GetType(&dim);

        if(dim == D3D11_RESOURCE_DIMENSION_BUFFER)
        {
          ID3D11Buffer *buf = (ID3D11Buffer *)res;
          D3D11_BUFFER_DESC bufdesc;
          buf->GetDesc(&bufdesc);

          global.srvs[i].format.stride = bufdesc.StructureByteStride;

          // if we didn't get a type from the SRV description, try to pull it from the declaration
          for(const DXBC::ShaderInputBind &bind : dxbc->m_SRVs)
          {
            if(bind.reg == (uint32_t)i && bind.dimension == DXBC::ShaderInputBind::DIM_BUFFER &&
               bind.retType < DXBC::RETURN_TYPE_MIXED && bind.retType != DXBC::RETURN_TYPE_UNKNOWN)
            {
              global.srvs[i].format.byteWidth = 4;
              global.srvs[i].format.numComps = bind.numSamples;

              if(bind.retType == DXBC::RETURN_TYPE_UNORM)
                global.srvs[i].format.fmt = CompType::UNorm;
              else if(bind.retType == DXBC::RETURN_TYPE_SNORM)
                global.srvs[i].format.fmt = CompType::SNorm;
              else if(bind.retType == DXBC::RETURN_TYPE_UINT)
                global.srvs[i].format.fmt = CompType::UInt;
              else if(bind.retType == DXBC::RETURN_TYPE_SINT)
                global.srvs[i].format.fmt = CompType::SInt;
              else
                global.srvs[i].format.fmt = CompType::Float;

              break;
            }
          }
        }
      }

      if(sdesc.ViewDimension == D3D11_SRV_DIMENSION_BUFFER)
      {
        // I know this isn't what the docs say, but as best as I can tell
        // this is how it's used.
        global.srvs[i].firstElement = sdesc.Buffer.FirstElement;
        global.srvs[i].numElements = sdesc.Buffer.NumElements;
      }
      else if(sdesc.ViewDimension == D3D11_SRV_DIMENSION_BUFFEREX)
      {
        global.srvs[i].firstElement = sdesc.BufferEx.FirstElement;
        global.srvs[i].numElements = sdesc.BufferEx.NumElements;
      }

      if(res)
      {
        if(WrappedID3D11Buffer::IsAlloc(res))
        {
          GetBufferData((ID3D11Buffer *)res, 0, 0, global.srvs[i].data);
        }
      }

      SAFE_RELEASE(res);
    }
  }

  for(size_t i = 0; i < dxbc->GetNumDeclarations(); i++)
  {
    const DXBC::ASMDecl &decl = dxbc->GetDeclaration(i);

    if(decl.declaration == DXBC::OPCODE_DCL_THREAD_GROUP_SHARED_MEMORY_RAW ||
       decl.declaration == DXBC::OPCODE_DCL_THREAD_GROUP_SHARED_MEMORY_STRUCTURED)
    {
      uint32_t slot = (uint32_t)decl.operand.indices[0].index;

      if(global.groupshared.size() <= slot)
      {
        global.groupshared.resize(slot + 1);

        ShaderDebug::GlobalState::groupsharedMem &mem = global.groupshared[slot];

        mem.structured = (decl.declaration == DXBC::OPCODE_DCL_THREAD_GROUP_SHARED_MEMORY_STRUCTURED);

        mem.count = decl.count;
        if(mem.structured)
          mem.bytestride = decl.stride;
        else
          mem.bytestride = 4;    // raw groupshared is implicitly uint32s

        mem.data.resize(mem.bytestride * mem.count);
      }
    }
  }
}

// struct that saves pointers as we iterate through to where we ultimately
// want to copy the data to
struct DataOutput
{
  DataOutput(int regster, int element, int numWords, ShaderBuiltin attr, bool inc)
  {
    reg = regster;
    elem = element;
    numwords = numWords;
    sysattribute = attr;
    included = inc;
  }

  int reg;
  int elem;
  ShaderBuiltin sysattribute;

  int numwords;

  bool included;
};

struct DebugHit
{
  uint32_t numHits;
  float posx;
  float posy;
  float depth;
  uint32_t primitive;
  uint32_t isFrontFace;
  uint32_t sample;
  uint32_t coverage;
  uint32_t rawdata;    // arbitrary, depending on shader
};

ShaderDebugTrace D3D11Replay::DebugVertex(uint32_t eventId, uint32_t vertid, uint32_t instid,
                                          uint32_t idx, uint32_t instOffset, uint32_t vertOffset)
{
  using namespace DXBC;
  using namespace ShaderDebug;

  D3D11MarkerRegion debugpixRegion(
      StringFormat::Fmt("DebugVertex @ %u of (%u,%u,%u)", eventId, vertid, instid, idx));

  ShaderDebugTrace empty;

  const DrawcallDescription *draw = m_pDevice->GetDrawcall(eventId);

  D3D11RenderStateTracker tracker(m_pImmediateContext);

  ID3D11VertexShader *stateVS = NULL;
  m_pImmediateContext->VSGetShader(&stateVS, NULL, NULL);

  WrappedID3D11Shader<ID3D11VertexShader> *vs = (WrappedID3D11Shader<ID3D11VertexShader> *)stateVS;

  SAFE_RELEASE(stateVS);

  if(!vs)
    return empty;

  DXBCFile *dxbc = vs->GetDXBC();

  if(!dxbc)
    return empty;

  dxbc->GetDisassembly();

  D3D11RenderState *rs = m_pImmediateContext->GetCurrentPipelineState();

  vector<D3D11_INPUT_ELEMENT_DESC> inputlayout = m_pDevice->GetLayoutDesc(rs->IA.Layout);

  set<UINT> vertexbuffers;
  uint32_t trackingOffs[32] = {0};

  UINT MaxStepRate = 1U;

  // need special handling for other step rates
  for(size_t i = 0; i < inputlayout.size(); i++)
  {
    if(inputlayout[i].InputSlotClass == D3D11_INPUT_PER_INSTANCE_DATA &&
       inputlayout[i].InstanceDataStepRate < draw->numInstances)
      MaxStepRate = RDCMAX(inputlayout[i].InstanceDataStepRate, MaxStepRate);

    UINT slot =
        RDCCLAMP(inputlayout[i].InputSlot, 0U, UINT(D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT - 1));

    vertexbuffers.insert(slot);

    if(inputlayout[i].AlignedByteOffset == ~0U)
    {
      inputlayout[i].AlignedByteOffset = trackingOffs[slot];
    }
    else
    {
      trackingOffs[slot] = inputlayout[i].AlignedByteOffset;
    }

    ResourceFormat fmt = MakeResourceFormat(inputlayout[i].Format);

    trackingOffs[slot] += fmt.compByteWidth * fmt.compCount;
  }

  bytebuf vertData[D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT];
  bytebuf *instData = new bytebuf[MaxStepRate * D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT];
  bytebuf staticData[D3D11_IA_VERTEX_INPUT_RESOURCE_SLOT_COUNT];

  for(auto it = vertexbuffers.begin(); it != vertexbuffers.end(); ++it)
  {
    UINT i = *it;
    if(rs->IA.VBs[i])
    {
      GetDebugManager()->GetBufferData(rs->IA.VBs[i],
                                       rs->IA.Offsets[i] + rs->IA.Strides[i] * (vertOffset + idx),
                                       rs->IA.Strides[i], vertData[i]);

      for(UINT isr = 1; isr <= MaxStepRate; isr++)
      {
        GetDebugManager()->GetBufferData(
            rs->IA.VBs[i], rs->IA.Offsets[i] + rs->IA.Strides[i] * (instOffset + (instid / isr)),
            rs->IA.Strides[i], instData[i * MaxStepRate + isr - 1]);
      }

      GetDebugManager()->GetBufferData(rs->IA.VBs[i],
                                       rs->IA.Offsets[i] + rs->IA.Strides[i] * instOffset,
                                       rs->IA.Strides[i], staticData[i]);
    }
  }

  bytebuf cbufData[D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT];

  for(int i = 0; i < D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT; i++)
    if(rs->VS.ConstantBuffers[i])
      GetDebugManager()->GetBufferData(rs->VS.ConstantBuffers[i],
                                       rs->VS.CBOffsets[i] * sizeof(Vec4f), 0, cbufData[i]);

  ShaderDebugTrace ret;

  GlobalState global;
  GetDebugManager()->CreateShaderGlobalState(global, dxbc, 0, NULL, rs->VS.SRVs);
  State initialState = GetDebugManager()->CreateShaderDebugState(ret, -1, dxbc, cbufData);

  for(size_t i = 0; i < ret.inputs.size(); i++)
  {
    if(dxbc->m_InputSig[i].systemValue == ShaderBuiltin::Undefined ||
       dxbc->m_InputSig[i].systemValue ==
           ShaderBuiltin::Position)    // SV_Position seems to get promoted
                                       // automatically, but it's invalid for
                                       // vertex input
    {
      const D3D11_INPUT_ELEMENT_DESC *el = NULL;

      std::string signame = strlower(dxbc->m_InputSig[i].semanticName);

      for(size_t l = 0; l < inputlayout.size(); l++)
      {
        std::string layoutname = strlower(inputlayout[l].SemanticName);

        if(signame == layoutname && dxbc->m_InputSig[i].semanticIndex == inputlayout[l].SemanticIndex)
        {
          el = &inputlayout[l];
          break;
        }
        if(signame == layoutname + ToStr(inputlayout[l].SemanticIndex))
        {
          el = &inputlayout[l];
          break;
        }
      }

      RDCASSERT(el);

      if(!el)
        continue;

      byte *srcData = NULL;
      size_t dataSize = 0;

      if(el->InputSlotClass == D3D11_INPUT_PER_VERTEX_DATA)
      {
        if(vertData[el->InputSlot].size() >= el->AlignedByteOffset)
        {
          srcData = &vertData[el->InputSlot][el->AlignedByteOffset];
          dataSize = vertData[el->InputSlot].size() - el->AlignedByteOffset;
        }
      }
      else
      {
        if(el->InstanceDataStepRate == 0 || el->InstanceDataStepRate >= draw->numInstances)
        {
          srcData = &staticData[el->InputSlot][el->AlignedByteOffset];
          dataSize = staticData[el->InputSlot].size() - el->AlignedByteOffset;
        }
        else
        {
          UINT isrIdx = el->InputSlot * MaxStepRate + (el->InstanceDataStepRate - 1);
          if(instData[isrIdx].size() >= el->AlignedByteOffset)
          {
            srcData = &instData[isrIdx][el->AlignedByteOffset];
            dataSize = instData[isrIdx].size() - el->AlignedByteOffset;
          }
        }
      }

      ResourceFormat fmt = MakeResourceFormat(el->Format);

      // more data needed than is provided
      if(dxbc->m_InputSig[i].compCount > fmt.compCount)
      {
        ret.inputs[i].value.u.w = 1;

        if(fmt.compType == CompType::Float)
          ret.inputs[i].value.f.w = 1.0f;
      }

      // interpret resource format types
      if(fmt.Special())
      {
        Vec3f *v3 = (Vec3f *)ret.inputs[i].value.fv;
        Vec4f *v4 = (Vec4f *)ret.inputs[i].value.fv;

        // only pull in all or nothing from these,
        // if there's only e.g. 3 bytes remaining don't read and unpack some of
        // a 4-byte resource format type
        size_t packedsize = 4;
        if(fmt.type == ResourceFormatType::R5G5B5A1 || fmt.type == ResourceFormatType::R5G6B5 ||
           fmt.type == ResourceFormatType::R4G4B4A4)
          packedsize = 2;

        if(srcData == NULL || packedsize > dataSize)
        {
          ret.inputs[i].value.u.x = ret.inputs[i].value.u.y = ret.inputs[i].value.u.z =
              ret.inputs[i].value.u.w = 0;
        }
        else if(fmt.type == ResourceFormatType::R5G5B5A1)
        {
          RDCASSERT(fmt.bgraOrder);
          uint16_t packed = ((uint16_t *)srcData)[0];
          *v4 = ConvertFromB5G5R5A1(packed);
        }
        else if(fmt.type == ResourceFormatType::R5G6B5)
        {
          RDCASSERT(fmt.bgraOrder);
          uint16_t packed = ((uint16_t *)srcData)[0];
          *v3 = ConvertFromB5G6R5(packed);
        }
        else if(fmt.type == ResourceFormatType::R4G4B4A4)
        {
          RDCASSERT(fmt.bgraOrder);
          uint16_t packed = ((uint16_t *)srcData)[0];
          *v4 = ConvertFromB4G4R4A4(packed);
        }
        else if(fmt.type == ResourceFormatType::R10G10B10A2)
        {
          uint32_t packed = ((uint32_t *)srcData)[0];

          if(fmt.compType == CompType::UInt)
          {
            ret.inputs[i].value.u.z = (packed >> 0) & 0x3ff;
            ret.inputs[i].value.u.y = (packed >> 10) & 0x3ff;
            ret.inputs[i].value.u.x = (packed >> 20) & 0x3ff;
            ret.inputs[i].value.u.w = (packed >> 30) & 0x003;
          }
          else
          {
            *v4 = ConvertFromR10G10B10A2(packed);
          }
        }
        else if(fmt.type == ResourceFormatType::R11G11B10)
        {
          uint32_t packed = ((uint32_t *)srcData)[0];
          *v3 = ConvertFromR11G11B10(packed);
        }
      }
      else
      {
        for(uint32_t c = 0; c < fmt.compCount; c++)
        {
          if(srcData == NULL || fmt.compByteWidth > dataSize)
          {
            ret.inputs[i].value.uv[c] = 0;
            continue;
          }

          dataSize -= fmt.compByteWidth;

          if(fmt.compByteWidth == 1)
          {
            byte *src = srcData + c * fmt.compByteWidth;

            if(fmt.compType == CompType::UInt)
              ret.inputs[i].value.uv[c] = *src;
            else if(fmt.compType == CompType::SInt)
              ret.inputs[i].value.iv[c] = *((int8_t *)src);
            else if(fmt.compType == CompType::UNorm)
              ret.inputs[i].value.fv[c] = float(*src) / 255.0f;
            else if(fmt.compType == CompType::SNorm)
            {
              signed char *schar = (signed char *)src;

              // -128 is mapped to -1, then -127 to -127 are mapped to -1 to 1
              if(*schar == -128)
                ret.inputs[i].value.fv[c] = -1.0f;
              else
                ret.inputs[i].value.fv[c] = float(*schar) / 127.0f;
            }
            else
              RDCERR("Unexpected component type");
          }
          else if(fmt.compByteWidth == 2)
          {
            uint16_t *src = (uint16_t *)(srcData + c * fmt.compByteWidth);

            if(fmt.compType == CompType::Float)
              ret.inputs[i].value.fv[c] = ConvertFromHalf(*src);
            else if(fmt.compType == CompType::UInt)
              ret.inputs[i].value.uv[c] = *src;
            else if(fmt.compType == CompType::SInt)
              ret.inputs[i].value.iv[c] = *((int16_t *)src);
            else if(fmt.compType == CompType::UNorm)
              ret.inputs[i].value.fv[c] = float(*src) / float(UINT16_MAX);
            else if(fmt.compType == CompType::SNorm)
            {
              int16_t *sint = (int16_t *)src;

              // -32768 is mapped to -1, then -32767 to -32767 are mapped to -1 to 1
              if(*sint == -32768)
                ret.inputs[i].value.fv[c] = -1.0f;
              else
                ret.inputs[i].value.fv[c] = float(*sint) / 32767.0f;
            }
            else
              RDCERR("Unexpected component type");
          }
          else if(fmt.compByteWidth == 4)
          {
            uint32_t *src = (uint32_t *)(srcData + c * fmt.compByteWidth);

            if(fmt.compType == CompType::Float || fmt.compType == CompType::UInt ||
               fmt.compType == CompType::SInt)
              memcpy(&ret.inputs[i].value.uv[c], src, 4);
            else
              RDCERR("Unexpected component type");
          }
        }

        if(fmt.bgraOrder)
        {
          RDCASSERT(fmt.compCount == 4);
          std::swap(ret.inputs[i].value.fv[2], ret.inputs[i].value.fv[0]);
        }
      }
    }
    else if(dxbc->m_InputSig[i].systemValue == ShaderBuiltin::VertexIndex)
    {
      uint32_t sv_vertid = vertid;

      if(draw->flags & DrawFlags::Indexed)
        sv_vertid = idx;

      if(dxbc->m_InputSig[i].compType == CompType::Float)
        ret.inputs[i].value.f.x = ret.inputs[i].value.f.y = ret.inputs[i].value.f.z =
            ret.inputs[i].value.f.w = (float)sv_vertid;
      else
        ret.inputs[i].value.u.x = ret.inputs[i].value.u.y = ret.inputs[i].value.u.z =
            ret.inputs[i].value.u.w = sv_vertid;
    }
    else if(dxbc->m_InputSig[i].systemValue == ShaderBuiltin::InstanceIndex)
    {
      if(dxbc->m_InputSig[i].compType == CompType::Float)
        ret.inputs[i].value.f.x = ret.inputs[i].value.f.y = ret.inputs[i].value.f.z =
            ret.inputs[i].value.f.w = (float)instid;
      else
        ret.inputs[i].value.u.x = ret.inputs[i].value.u.y = ret.inputs[i].value.u.z =
            ret.inputs[i].value.u.w = instid;
    }
    else
    {
      RDCERR("Unhandled system value semantic on VS input");
    }
  }

  delete[] instData;

  State last;

  vector<ShaderDebugState> states;

  dxbc->m_DebugInfo->GetStack(0, dxbc->GetInstruction(0).offset, initialState.callstack);

  states.push_back((State)initialState);

  D3D11MarkerRegion simloop("Simulation Loop");

  for(int cycleCounter = 0;; cycleCounter++)
  {
    if(initialState.Finished())
      break;

    initialState = initialState.GetNext(global, NULL);

    {
      const ASMOperation &op = dxbc->GetInstruction((size_t)initialState.nextInstruction);
      dxbc->m_DebugInfo->GetStack(initialState.nextInstruction, op.offset, initialState.callstack);
    }

    states.push_back((State)initialState);

    if(cycleCounter == SHADER_DEBUG_WARN_THRESHOLD)
    {
      if(PromptDebugTimeout(DXBC::TYPE_VERTEX, cycleCounter))
        break;
    }
  }

  ret.states = states;

  return ret;
}

ShaderDebugTrace D3D11Replay::DebugPixel(uint32_t eventId, uint32_t x, uint32_t y, uint32_t sample,
                                         uint32_t primitive)
{
  using namespace DXBC;
  using namespace ShaderDebug;

  D3D11MarkerRegion debugpixRegion(
      StringFormat::Fmt("DebugPixel @ %u of (%u,%u) %u / %u", eventId, x, y, sample, primitive));

  ShaderDebugTrace empty;

  D3D11RenderStateTracker tracker(m_pImmediateContext);

  ID3D11PixelShader *statePS = NULL;
  m_pImmediateContext->PSGetShader(&statePS, NULL, NULL);

  WrappedID3D11Shader<ID3D11PixelShader> *ps = (WrappedID3D11Shader<ID3D11PixelShader> *)statePS;

  SAFE_RELEASE(statePS);

  ID3D11GeometryShader *stateGS = NULL;
  m_pImmediateContext->GSGetShader(&stateGS, NULL, NULL);

  WrappedID3D11Shader<ID3D11GeometryShader> *gs =
      (WrappedID3D11Shader<ID3D11GeometryShader> *)stateGS;

  SAFE_RELEASE(stateGS);

  ID3D11DomainShader *stateDS = NULL;
  m_pImmediateContext->DSGetShader(&stateDS, NULL, NULL);

  WrappedID3D11Shader<ID3D11DomainShader> *ds = (WrappedID3D11Shader<ID3D11DomainShader> *)stateDS;

  SAFE_RELEASE(stateDS);

  ID3D11VertexShader *stateVS = NULL;
  m_pImmediateContext->VSGetShader(&stateVS, NULL, NULL);

  WrappedID3D11Shader<ID3D11VertexShader> *vs = (WrappedID3D11Shader<ID3D11VertexShader> *)stateVS;

  SAFE_RELEASE(stateVS);

  if(!ps)
    return empty;

  D3D11RenderState *rs = m_pImmediateContext->GetCurrentPipelineState();

  DXBCFile *dxbc = ps->GetDXBC();

  if(!dxbc)
    return empty;

  dxbc->GetDisassembly();

  DXBCFile *prevdxbc = NULL;

  if(prevdxbc == NULL && gs != NULL)
    prevdxbc = gs->GetDXBC();
  if(prevdxbc == NULL && ds != NULL)
    prevdxbc = ds->GetDXBC();
  if(prevdxbc == NULL && vs != NULL)
    prevdxbc = vs->GetDXBC();

  vector<DataOutput> initialValues;

  string extractHlsl = "struct PSInput\n{\n";

  int structureStride = 0;

  if(dxbc->m_InputSig.empty())
  {
    extractHlsl += "float4 input_dummy : SV_Position;\n";

    initialValues.push_back(DataOutput(-1, 0, 4, ShaderBuiltin::Undefined, true));

    structureStride += 4;
  }

  vector<string> floatInputs;
  vector<pair<string, pair<uint32_t, uint32_t> > >
      arrays;    // name, pair<start semantic index, end semantic index>

  uint32_t nextreg = 0;

  for(size_t i = 0; i < dxbc->m_InputSig.size(); i++)
  {
    extractHlsl += "  ";

    bool included = true;

    // handled specially to account for SV_ ordering
    if(dxbc->m_InputSig[i].systemValue == ShaderBuiltin::PrimitiveIndex ||
       dxbc->m_InputSig[i].systemValue == ShaderBuiltin::MSAACoverage ||
       dxbc->m_InputSig[i].systemValue == ShaderBuiltin::IsFrontFace ||
       dxbc->m_InputSig[i].systemValue == ShaderBuiltin::MSAASampleIndex)
    {
      extractHlsl += "//";
      included = false;
    }

    bool arrayElem = false;

    for(size_t a = 0; a < arrays.size(); a++)
    {
      if(dxbc->m_InputSig[i].semanticName == arrays[a].first &&
         arrays[a].second.first <= dxbc->m_InputSig[i].semanticIndex &&
         arrays[a].second.second >= dxbc->m_InputSig[i].semanticIndex)
      {
        extractHlsl += "//";
        included = false;
        arrayElem = true;
      }
    }

    int missingreg = int(dxbc->m_InputSig[i].regIndex) - int(nextreg);

    // fill in holes from output sig of previous shader if possible, to try and
    // ensure the same register order
    for(int dummy = 0; dummy < missingreg; dummy++)
    {
      bool filled = false;

      if(prevdxbc)
      {
        for(size_t os = 0; os < prevdxbc->m_OutputSig.size(); os++)
        {
          if(prevdxbc->m_OutputSig[os].regIndex == nextreg + dummy)
          {
            filled = true;

            if(prevdxbc->m_OutputSig[os].compType == CompType::Float)
              extractHlsl += "float";
            else if(prevdxbc->m_OutputSig[os].compType == CompType::SInt)
              extractHlsl += "int";
            else if(prevdxbc->m_OutputSig[os].compType == CompType::UInt)
              extractHlsl += "uint";
            else
              RDCERR("Unexpected input signature type: %d", prevdxbc->m_OutputSig[os].compType);

            int numCols = (prevdxbc->m_OutputSig[os].regChannelMask & 0x1 ? 1 : 0) +
                          (prevdxbc->m_OutputSig[os].regChannelMask & 0x2 ? 1 : 0) +
                          (prevdxbc->m_OutputSig[os].regChannelMask & 0x4 ? 1 : 0) +
                          (prevdxbc->m_OutputSig[os].regChannelMask & 0x8 ? 1 : 0);

            structureStride += 4 * numCols;

            initialValues.push_back(DataOutput(-1, 0, numCols, ShaderBuiltin::Undefined, true));

            std::string name = prevdxbc->m_OutputSig[os].semanticIdxName;

            extractHlsl += ToStr((uint32_t)numCols) + " input_" + name + " : " + name + ";\n";
          }
        }
      }

      if(!filled)
      {
        string dummy_reg = "dummy_register";
        dummy_reg += ToStr((uint32_t)nextreg + dummy);
        extractHlsl += "float4 var_" + dummy_reg + " : semantic_" + dummy_reg + ";\n";

        initialValues.push_back(DataOutput(-1, 0, 4, ShaderBuiltin::Undefined, true));

        structureStride += 4 * sizeof(float);
      }
    }

    nextreg = dxbc->m_InputSig[i].regIndex + 1;

    if(dxbc->m_InputSig[i].compType == CompType::Float)
    {
      // if we're packed with ints on either side, we must be nointerpolation
      bool nointerp = false;
      for(size_t j = 0; j < dxbc->m_InputSig.size(); j++)
      {
        if(dxbc->m_InputSig[i].regIndex == dxbc->m_InputSig[j].regIndex &&
           dxbc->m_InputSig[j].compType != CompType::Float)
        {
          nointerp = true;
          break;
        }
      }

      if(nointerp)
        extractHlsl += "nointerpolation ";

      extractHlsl += "float";
    }
    else if(dxbc->m_InputSig[i].compType == CompType::SInt)
      extractHlsl += "nointerpolation int";
    else if(dxbc->m_InputSig[i].compType == CompType::UInt)
      extractHlsl += "nointerpolation uint";
    else
      RDCERR("Unexpected input signature type: %d", dxbc->m_InputSig[i].compType);

    int numCols = (dxbc->m_InputSig[i].regChannelMask & 0x1 ? 1 : 0) +
                  (dxbc->m_InputSig[i].regChannelMask & 0x2 ? 1 : 0) +
                  (dxbc->m_InputSig[i].regChannelMask & 0x4 ? 1 : 0) +
                  (dxbc->m_InputSig[i].regChannelMask & 0x8 ? 1 : 0);

    if(included)
      structureStride += 4 * numCols;

    std::string name = dxbc->m_InputSig[i].semanticIdxName;

    // arrays of interpolators are handled really weirdly. They use cbuffer
    // packing rules where each new value is in a new register (rather than
    // e.g. 2 x float2 in a single register), but that's pointless because
    // you can't dynamically index into input registers.
    // If we declare those elements as a non-array, the float2s or floats
    // will be packed into registers and won't match up to the previous
    // shader.
    // HOWEVER to add an extra bit of fun, fxc will happily pack other
    // parameters not in the array into spare parts of the registers.
    //
    // So I think the upshot is that we can detect arrays reliably by
    // whenever we encounter a float or float2 at the start of a register,
    // search forward to see if the next register has an element that is the
    // same semantic name and one higher semantic index. If so, there's an
    // array, so keep searching to enumerate its length.
    // I think this should be safe if the packing just happens to place those
    // registers together.

    int arrayLength = 0;

    if(included && numCols <= 2 && dxbc->m_InputSig[i].regChannelMask <= 0x3)
    {
      uint32_t nextIdx = dxbc->m_InputSig[i].semanticIndex + 1;

      for(size_t j = i + 1; j < dxbc->m_InputSig.size(); j++)
      {
        // if we've found the 'next' semantic
        if(dxbc->m_InputSig[i].semanticName == dxbc->m_InputSig[j].semanticName &&
           nextIdx == dxbc->m_InputSig[j].semanticIndex)
        {
          int jNumCols = (dxbc->m_InputSig[i].regChannelMask & 0x1 ? 1 : 0) +
                         (dxbc->m_InputSig[i].regChannelMask & 0x2 ? 1 : 0) +
                         (dxbc->m_InputSig[i].regChannelMask & 0x4 ? 1 : 0) +
                         (dxbc->m_InputSig[i].regChannelMask & 0x8 ? 1 : 0);

          // if it's the same size, and it's at the start of the next register
          if(jNumCols == numCols && dxbc->m_InputSig[j].regChannelMask <= 0x3)
          {
            if(arrayLength == 0)
              arrayLength = 2;
            else
              arrayLength++;

            // continue searching now
            nextIdx++;
            j = i + 1;
            continue;
          }
        }
      }

      if(arrayLength > 0)
        arrays.push_back(
            std::make_pair(dxbc->m_InputSig[i].semanticName,
                           std::make_pair(dxbc->m_InputSig[i].semanticIndex, nextIdx - 1)));
    }

    // as another side effect of the above, an element declared as a 1-length array won't be
    // detected but it WILL be put in its own register (not packed together), so detect this
    // case too.
    // Note we have to search *backwards* because we need to know if this register should have
    // been packed into the previous register, but wasn't. float/float2 can be packed after an
    // array just fine.
    if(included && i > 0 && arrayLength == 0 && numCols <= 2 &&
       dxbc->m_InputSig[i].regChannelMask <= 0x3)
    {
      const SigParameter &prev = dxbc->m_InputSig[i - 1];

      if(prev.regIndex != dxbc->m_InputSig[i].regIndex && prev.compCount <= 2 &&
         prev.regChannelMask <= 0x3)
        arrayLength = 1;
    }

    // The compiler is also really annoying and will go to great lengths to rearrange elements
    // and screw up our declaration, to pack things together. E.g.:
    // float2 a : TEXCOORD1;
    // float4 b : TEXCOORD2;
    // float4 c : TEXCOORD3;
    // float2 d : TEXCOORD4;
    // the compiler will move d up and pack it into the last two components of a.
    // To prevent this, we look forward and backward to check that we aren't expecting to pack
    // with anything, and if not then we just make it a 1-length array to ensure no packing.
    // Note the regChannelMask & 0x1 means it is using .x, so it's not the tail-end of a pack
    if(included && arrayLength == 0 && numCols <= 2 && (dxbc->m_InputSig[i].regChannelMask & 0x1))
    {
      if(i == dxbc->m_InputSig.size() - 1)
      {
        // the last element is never packed
        arrayLength = 1;
      }
      else
      {
        // if the next reg is using .x, it wasn't packed with us
        if(dxbc->m_InputSig[i + 1].regChannelMask & 0x1)
          arrayLength = 1;
      }
    }

    extractHlsl += ToStr((uint32_t)numCols) + " input_" + name;
    if(arrayLength > 0)
      extractHlsl += "[" + ToStr(arrayLength) + "]";
    extractHlsl += " : " + name;

    if(included && dxbc->m_InputSig[i].compType == CompType::Float)
    {
      if(arrayLength == 0)
      {
        floatInputs.push_back("input_" + name);
      }
      else
      {
        for(int a = 0; a < arrayLength; a++)
          floatInputs.push_back("input_" + name + "[" + ToStr(a) + "]");
      }
    }

    extractHlsl += ";\n";

    int firstElem = dxbc->m_InputSig[i].regChannelMask & 0x1
                        ? 0
                        : dxbc->m_InputSig[i].regChannelMask & 0x2
                              ? 1
                              : dxbc->m_InputSig[i].regChannelMask & 0x4
                                    ? 2
                                    : dxbc->m_InputSig[i].regChannelMask & 0x8 ? 3 : -1;

    // arrays get added all at once (because in the struct data, they are contiguous even if
    // in the input signature they're not).
    if(!arrayElem)
    {
      if(arrayLength == 0)
      {
        initialValues.push_back(DataOutput(dxbc->m_InputSig[i].regIndex, firstElem, numCols,
                                           dxbc->m_InputSig[i].systemValue, included));
      }
      else
      {
        for(int a = 0; a < arrayLength; a++)
        {
          initialValues.push_back(DataOutput(dxbc->m_InputSig[i].regIndex + a, firstElem, numCols,
                                             dxbc->m_InputSig[i].systemValue, included));
        }
      }
    }
  }

  extractHlsl += "};\n\n";

  uint32_t overdrawLevels = 100;    // maximum number of overdraw levels

  uint32_t uavslot = 0;

  ID3D11DepthStencilView *depthView = NULL;
  ID3D11RenderTargetView *rtView = NULL;
  // preserve at least one render target and/or the depth view, so that
  // we have the right multisample level on output either way
  m_pImmediateContext->OMGetRenderTargets(1, &rtView, &depthView);
  if(rtView != NULL)
    uavslot = 1;

  extractHlsl +=
      "struct PSInitialData { uint hit; float3 pos; uint prim; uint fface; uint sample; uint "
      "covge; float derivValid; PSInput IN; PSInput INddx; PSInput INddy; PSInput INddxfine; "
      "PSInput INddyfine; };\n\n";
  extractHlsl +=
      "RWStructuredBuffer<PSInitialData> PSInitialBuffer : register(u" + ToStr(uavslot) + ");\n\n";
  extractHlsl +=
      "void ExtractInputsPS(PSInput IN, float4 debug_pixelPos : SV_Position, uint prim : "
      "SV_PrimitiveID, uint sample : SV_SampleIndex, uint covge : SV_Coverage, bool fface : "
      "SV_IsFrontFace)\n{\n";
  extractHlsl += "  uint idx = " + ToStr(overdrawLevels) + ";\n";
  extractHlsl += "  if(abs(debug_pixelPos.x - " + ToStr(x) +
                 ".5) < 0.5f && abs(debug_pixelPos.y - " + ToStr(y) + ".5) < 0.5f)\n";
  extractHlsl += "    InterlockedAdd(PSInitialBuffer[0].hit, 1, idx);\n\n";
  extractHlsl += "  idx = min(idx, " + ToStr(overdrawLevels) + ");\n\n";
  extractHlsl += "  PSInitialBuffer[idx].pos = debug_pixelPos.xyz;\n";
  extractHlsl += "  PSInitialBuffer[idx].prim = prim;\n";
  extractHlsl += "  PSInitialBuffer[idx].fface = fface;\n";
  extractHlsl += "  PSInitialBuffer[idx].covge = covge;\n";
  extractHlsl += "  PSInitialBuffer[idx].sample = sample;\n";
  extractHlsl += "  PSInitialBuffer[idx].IN = IN;\n";
  extractHlsl += "  PSInitialBuffer[idx].derivValid = ddx(debug_pixelPos.x);\n";
  extractHlsl += "  PSInitialBuffer[idx].INddx = (PSInput)0;\n";
  extractHlsl += "  PSInitialBuffer[idx].INddy = (PSInput)0;\n";
  extractHlsl += "  PSInitialBuffer[idx].INddxfine = (PSInput)0;\n";
  extractHlsl += "  PSInitialBuffer[idx].INddyfine = (PSInput)0;\n";

  for(size_t i = 0; i < floatInputs.size(); i++)
  {
    const string &name = floatInputs[i];
    extractHlsl += "  PSInitialBuffer[idx].INddx." + name + " = ddx(IN." + name + ");\n";
    extractHlsl += "  PSInitialBuffer[idx].INddy." + name + " = ddy(IN." + name + ");\n";
    extractHlsl += "  PSInitialBuffer[idx].INddxfine." + name + " = ddx_fine(IN." + name + ");\n";
    extractHlsl += "  PSInitialBuffer[idx].INddyfine." + name + " = ddy_fine(IN." + name + ");\n";
  }
  extractHlsl += "\n}";

  ID3D11PixelShader *extract =
      m_pDevice->GetShaderCache()->MakePShader(extractHlsl.c_str(), "ExtractInputsPS", "ps_5_0");

  uint32_t structStride = sizeof(uint32_t)       // uint hit;
                          + sizeof(float) * 3    // float3 pos;
                          + sizeof(uint32_t)     // uint prim;
                          + sizeof(uint32_t)     // uint fface;
                          + sizeof(uint32_t)     // uint sample;
                          + sizeof(uint32_t)     // uint covge;
                          + sizeof(float)        // float derivValid;
                          +
                          structureStride * 5;    // PSInput IN, INddx, INddy, INddxfine, INddyfine;

  HRESULT hr = S_OK;

  D3D11_BUFFER_DESC bdesc;
  bdesc.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
  bdesc.CPUAccessFlags = 0;
  bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
  bdesc.Usage = D3D11_USAGE_DEFAULT;
  bdesc.StructureByteStride = structStride;
  bdesc.ByteWidth = bdesc.StructureByteStride * (overdrawLevels + 1);

  ID3D11Buffer *initialBuf = NULL;
  hr = m_pDevice->CreateBuffer(&bdesc, NULL, &initialBuf);

  if(FAILED(hr))
  {
    RDCERR("Failed to create buffer HRESULT: %s", ToStr(hr).c_str());
    return empty;
  }

  bdesc.BindFlags = 0;
  bdesc.MiscFlags = 0;
  bdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
  bdesc.Usage = D3D11_USAGE_STAGING;
  bdesc.StructureByteStride = 0;

  ID3D11Buffer *stageBuf = NULL;
  hr = m_pDevice->CreateBuffer(&bdesc, NULL, &stageBuf);

  if(FAILED(hr))
  {
    RDCERR("Failed to create buffer HRESULT: %s", ToStr(hr).c_str());
    return empty;
  }

  D3D11_UNORDERED_ACCESS_VIEW_DESC uavdesc;
  uavdesc.Format = DXGI_FORMAT_UNKNOWN;
  uavdesc.Buffer.FirstElement = 0;
  uavdesc.Buffer.Flags = 0;
  uavdesc.Buffer.NumElements = overdrawLevels + 1;
  uavdesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;

  ID3D11UnorderedAccessView *initialUAV = NULL;
  hr = m_pDevice->CreateUnorderedAccessView(initialBuf, &uavdesc, &initialUAV);

  if(FAILED(hr))
  {
    RDCERR("Failed to create buffer HRESULT: %s", ToStr(hr).c_str());
    return empty;
  }

  UINT zero = 0;
  m_pImmediateContext->ClearUnorderedAccessViewUint(initialUAV, &zero);

  UINT count = (UINT)-1;
  m_pImmediateContext->OMSetRenderTargetsAndUnorderedAccessViews(uavslot, &rtView, depthView,
                                                                 uavslot, 1, &initialUAV, &count);
  m_pImmediateContext->PSSetShader(extract, NULL, 0);

  SAFE_RELEASE(rtView);
  SAFE_RELEASE(depthView);

  {
    D3D11MarkerRegion initState("Replaying event for initial states");

    m_pDevice->ReplayLog(0, eventId, eReplay_OnlyDraw);

    m_pImmediateContext->CopyResource(stageBuf, initialBuf);
  }

  D3D11_MAPPED_SUBRESOURCE mapped;
  hr = m_pImmediateContext->Map(stageBuf, 0, D3D11_MAP_READ, 0, &mapped);

  if(FAILED(hr))
  {
    RDCERR("Failed to map stage buff HRESULT: %s", ToStr(hr).c_str());
    return empty;
  }

  byte *initialData = new byte[bdesc.ByteWidth];
  memcpy(initialData, mapped.pData, bdesc.ByteWidth);

  m_pImmediateContext->Unmap(stageBuf, 0);

  SAFE_RELEASE(initialUAV);
  SAFE_RELEASE(initialBuf);
  SAFE_RELEASE(stageBuf);

  SAFE_RELEASE(extract);

  DebugHit *buf = (DebugHit *)initialData;

  D3D11MarkerRegion::Set(StringFormat::Fmt("Got %u hits", buf[0].numHits));

  if(buf[0].numHits == 0)
  {
    RDCLOG("No hit for this event");
    SAFE_DELETE_ARRAY(initialData);
    return empty;
  }

  // if we encounter multiple hits at our destination pixel co-ord (or any other) we
  // check to see if a specific primitive was requested (via primitive parameter not
  // being set to ~0U). If it was, debug that pixel, otherwise do a best-estimate
  // of which fragment was the last to successfully depth test and debug that, just by
  // checking if the depth test is ordered and picking the final fragment in the series

  // our debugging quad. Order is TL, TR, BL, BR
  State quad[4];

  // figure out the TL pixel's coords. Assume even top left (towards 0,0)
  // this isn't spec'd but is a reasonable assumption.
  int xTL = x & (~1);
  int yTL = y & (~1);

  // get the index of our desired pixel
  int destIdx = (x - xTL) + 2 * (y - yTL);

  bytebuf cbufData[D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT];

  for(int i = 0; i < D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT; i++)
    if(rs->PS.ConstantBuffers[i])
      GetDebugManager()->GetBufferData(rs->PS.ConstantBuffers[i],
                                       rs->PS.CBOffsets[i] * sizeof(Vec4f), 0, cbufData[i]);

  D3D11_COMPARISON_FUNC depthFunc = D3D11_COMPARISON_LESS;

  if(rs->OM.DepthStencilState)
  {
    D3D11_DEPTH_STENCIL_DESC desc;
    rs->OM.DepthStencilState->GetDesc(&desc);
    depthFunc = desc.DepthFunc;
  }

  DebugHit *winner = NULL;

  if(sample == ~0U)
    sample = 0;

  if(primitive != ~0U)
  {
    for(size_t i = 0; i < buf[0].numHits && i < overdrawLevels; i++)
    {
      DebugHit *hit = (DebugHit *)(initialData + i * structStride);

      if(hit->primitive == primitive && hit->sample == sample)
        winner = hit;
    }
  }

  if(winner == NULL)
  {
    for(size_t i = 0; i < buf[0].numHits && i < overdrawLevels; i++)
    {
      DebugHit *hit = (DebugHit *)(initialData + i * structStride);

      if(winner == NULL || (winner->sample != sample && hit->sample == sample) ||
         depthFunc == D3D11_COMPARISON_ALWAYS || depthFunc == D3D11_COMPARISON_NEVER ||
         depthFunc == D3D11_COMPARISON_NOT_EQUAL || depthFunc == D3D11_COMPARISON_EQUAL)
      {
        winner = hit;
        continue;
      }

      if((depthFunc == D3D11_COMPARISON_LESS && hit->depth < winner->depth) ||
         (depthFunc == D3D11_COMPARISON_LESS_EQUAL && hit->depth <= winner->depth) ||
         (depthFunc == D3D11_COMPARISON_GREATER && hit->depth > winner->depth) ||
         (depthFunc == D3D11_COMPARISON_GREATER_EQUAL && hit->depth >= winner->depth))
      {
        if(hit->sample == sample)
          winner = hit;
      }
    }
  }

  if(winner == NULL)
  {
    RDCLOG("Couldn't find any pixels that passed depth test at target co-ordinates");
    SAFE_DELETE_ARRAY(initialData);
    return empty;
  }

  ShaderDebugTrace traces[4];

  GlobalState global;
  GetDebugManager()->CreateShaderGlobalState(global, dxbc, rs->OM.UAVStartSlot, rs->OM.UAVs,
                                             rs->PS.SRVs);

  {
    DebugHit *hit = winner;

    State initialState =
        GetDebugManager()->CreateShaderDebugState(traces[destIdx], destIdx, dxbc, cbufData);

    rdcarray<ShaderVariable> &ins = traces[destIdx].inputs;
    if(!ins.empty() && ins.back().name == "vCoverage")
      ins.back().value.u.x = hit->coverage;

    initialState.semantics.coverage = hit->coverage;
    initialState.semantics.primID = hit->primitive;
    initialState.semantics.isFrontFace = hit->isFrontFace;

    uint32_t *data = &hit->rawdata;

    float *ddx = (float *)data;

    // ddx(SV_Position.x) MUST be 1.0
    if(*ddx != 1.0f)
    {
      RDCERR("Derivatives invalid");
      SAFE_DELETE_ARRAY(initialData);
      return empty;
    }

    data++;

    for(size_t i = 0; i < initialValues.size(); i++)
    {
      int32_t *rawout = NULL;

      if(initialValues[i].reg >= 0)
      {
        ShaderVariable &invar = traces[destIdx].inputs[initialValues[i].reg];

        if(initialValues[i].sysattribute == ShaderBuiltin::PrimitiveIndex)
        {
          invar.value.u.x = hit->primitive;
        }
        else if(initialValues[i].sysattribute == ShaderBuiltin::MSAASampleIndex)
        {
          invar.value.u.x = hit->sample;
        }
        else if(initialValues[i].sysattribute == ShaderBuiltin::MSAACoverage)
        {
          invar.value.u.x = hit->coverage;
        }
        else if(initialValues[i].sysattribute == ShaderBuiltin::IsFrontFace)
        {
          invar.value.u.x = hit->isFrontFace ? ~0U : 0;
        }
        else
        {
          rawout = &invar.value.iv[initialValues[i].elem];

          memcpy(rawout, data, initialValues[i].numwords * 4);
        }
      }

      if(initialValues[i].included)
        data += initialValues[i].numwords;
    }

    for(int i = 0; i < 4; i++)
    {
      if(i != destIdx)
        traces[i] = traces[destIdx];
      quad[i] = initialState;
      quad[i].SetTrace(i, &traces[i]);
      if(i != destIdx)
        quad[i].SetHelper();
    }

    // We make the assumption that the coarse derivatives are
    // generated from (0,0) in the quad, and fine derivatives
    // are generated from the destination index and its
    // neighbours in X and Y.
    // This isn't spec'd but we must assume something and this
    // will hopefully get us closest to reproducing actual
    // results.
    //
    // For debugging, we need members of the quad to be able
    // to generate coarse and fine derivatives.
    //
    // For (0,0) we only need the coarse derivatives to get
    // our neighbours (1,0) and (0,1) which will give us
    // coarse and fine derivatives being identical.
    //
    // For the others we will need to use a combination of
    // coarse and fine derivatives to get the diagonal element
    // in the quad. E.g. for (1,1):
    //
    // (1,0) = (1,1) - ddx_fine
    // (0,1) = (1,1) - ddy_fine
    // (0,0) = (1,1) - ddy_fine - ddx_coarse
    //
    // This only works if coarse and fine are calculated as we
    // are assuming.

    ddx = (float *)data;

    for(size_t i = 0; i < initialValues.size(); i++)
    {
      if(!initialValues[i].included)
        continue;

      if(initialValues[i].reg >= 0)
      {
        if(destIdx == 0)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[1].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddx[w];
            traces[3].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddx[w];
          }
        }
        else if(destIdx == 1)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[0].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddx[w];
            traces[2].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddx[w];
          }
        }
        else if(destIdx == 2)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[1].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddx[w];
          }
        }
        else if(destIdx == 3)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[0].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddx[w];
          }
        }
      }

      ddx += initialValues[i].numwords;
    }

    float *ddy = ddx;

    for(size_t i = 0; i < initialValues.size(); i++)
    {
      if(!initialValues[i].included)
        continue;

      if(initialValues[i].reg >= 0)
      {
        if(destIdx == 0)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[2].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddy[w];
            traces[3].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddy[w];
          }
        }
        else if(destIdx == 1)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[2].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddy[w];
          }
        }
        else if(destIdx == 2)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[0].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddy[w];
            traces[1].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddy[w];
          }
        }
      }

      ddy += initialValues[i].numwords;
    }

    float *ddxfine = ddy;

    for(size_t i = 0; i < initialValues.size(); i++)
    {
      if(!initialValues[i].included)
        continue;

      if(initialValues[i].reg >= 0)
      {
        if(destIdx == 2)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[3].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddxfine[w];
          }
        }
        else if(destIdx == 3)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[2].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddxfine[w];
          }
        }
      }

      ddxfine += initialValues[i].numwords;
    }

    float *ddyfine = ddxfine;

    for(size_t i = 0; i < initialValues.size(); i++)
    {
      if(!initialValues[i].included)
        continue;

      if(initialValues[i].reg >= 0)
      {
        if(destIdx == 1)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[3].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] += ddyfine[w];
          }
        }
        else if(destIdx == 3)
        {
          for(int w = 0; w < initialValues[i].numwords; w++)
          {
            traces[1].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddyfine[w];
            traces[0].inputs[initialValues[i].reg].value.fv[initialValues[i].elem + w] -= ddyfine[w];
          }
        }
      }

      ddyfine += initialValues[i].numwords;
    }
  }

  SAFE_DELETE_ARRAY(initialData);

  vector<ShaderDebugState> states;

  dxbc->m_DebugInfo->GetStack(0, dxbc->GetInstruction(0).offset, quad[destIdx].callstack);

  states.push_back((State)quad[destIdx]);

  // ping pong between so that we can have 'current' quad to update into new one
  State quad2[4];

  State *curquad = quad;
  State *newquad = quad2;

  // marks any threads stalled waiting for others to catch up
  bool activeMask[4] = {true, true, true, true};

  int cycleCounter = 0;

  D3D11MarkerRegion simloop("Simulation Loop");

  // simulate lockstep until all threads are finished
  bool finished = true;
  do
  {
    for(size_t i = 0; i < 4; i++)
    {
      if(activeMask[i])
        newquad[i] = curquad[i].GetNext(global, curquad);
      else
        newquad[i] = curquad[i];
    }

    State *a = curquad;
    curquad = newquad;
    newquad = a;

    // if our destination quad is paused don't record multiple identical states.
    if(activeMask[destIdx])
    {
      State &s = curquad[destIdx];

      {
        const ASMOperation &op = dxbc->GetInstruction((size_t)s.nextInstruction);
        dxbc->m_DebugInfo->GetStack(s.nextInstruction, op.offset, s.callstack);
      }

      states.push_back(s);
    }

    // we need to make sure that control flow which converges stays in lockstep so that
    // derivatives are still valid. While diverged, we don't have to keep threads in lockstep
    // since using derivatives is invalid.

    // Threads diverge either in ifs, loops, or switches. Due to the nature of the bytecode,
    // all threads *must* pass through the same exit instruction for each, there's no jumping
    // around with gotos. Note also for the same reason, the only time threads are on earlier
    // instructions is if they are still catching up to a thread that has exited the control
    // flow.

    // So the scheme is as follows:
    // * If all threads have the same nextInstruction, just continue we are still in lockstep.
    // * If threads are out of lockstep, find any thread which has nextInstruction pointing
    //   immediately *after* an ENDIF, ENDLOOP or ENDSWITCH. Pointing directly at one is not
    //   an indication the thread is done, as the next step for an ENDLOOP will jump back to
    //   the matching LOOP and continue iterating.
    // * Pause any thread matching the above until all threads are pointing to the same
    //   instruction. By the assumption above, all threads will eventually pass through this
    //   terminating instruction so we just pause any other threads and don't do anything
    //   until the control flow has converged and we can continue stepping in lockstep.

    // mark all threads as active again.
    // if we've converged, or we were never diverged, this keeps everything ticking
    activeMask[0] = activeMask[1] = activeMask[2] = activeMask[3] = true;

    if(curquad[0].nextInstruction != curquad[1].nextInstruction ||
       curquad[0].nextInstruction != curquad[2].nextInstruction ||
       curquad[0].nextInstruction != curquad[3].nextInstruction)
    {
      // this isn't *perfect* but it will still eventually continue. We look for the most
      // advanced thread, and check to see if it's just finished a control flow. If it has
      // then we assume it's at the convergence point and wait for every other thread to
      // catch up, pausing any threads that reach the convergence point before others.

      // Note this might mean we don't have any threads paused even within divergent flow.
      // This is fine and all we care about is pausing to make sure threads don't run ahead
      // into code that should be lockstep. We don't care at all about what they do within
      // the code that is divergent.

      // The reason this isn't perfect is that the most advanced thread could be on an
      // inner loop or inner if, not the convergence point, and we could be pausing it
      // fruitlessly. Worse still - it could be on a branch none of the other threads will
      // take so they will never reach that exact instruction.
      // But we know that all threads will eventually go through the convergence point, so
      // even in that worst case if we didn't pick the right waiting point, another thread
      // will overtake and become the new most advanced thread and the previous waiting
      // thread will resume. So in this case we caused a thread to wait more than it should
      // have but that's not a big deal as it's within divergent flow so they don't have to
      // stay in lockstep. Also if all threads will eventually pass that point we picked,
      // we just waited to converge even in technically divergent code which is also
      // harmless.

      // Phew!

      uint32_t convergencePoint = 0;

      // find which thread is most advanced
      for(size_t i = 0; i < 4; i++)
        if(curquad[i].nextInstruction > convergencePoint)
          convergencePoint = curquad[i].nextInstruction;

      if(convergencePoint > 0)
      {
        OpcodeType op = dxbc->GetInstruction(convergencePoint - 1).operation;

        // if the most advnaced thread hasn't just finished control flow, then all
        // threads are still running, so don't converge
        if(op != OPCODE_ENDIF && op != OPCODE_ENDLOOP && op != OPCODE_ENDSWITCH)
          convergencePoint = 0;
      }

      // pause any threads at that instruction (could be none)
      for(size_t i = 0; i < 4; i++)
        if(curquad[i].nextInstruction == convergencePoint)
          activeMask[i] = false;
    }

    finished = curquad[destIdx].Finished();

    cycleCounter++;

    if(cycleCounter == SHADER_DEBUG_WARN_THRESHOLD)
    {
      if(PromptDebugTimeout(DXBC::TYPE_VERTEX, cycleCounter))
        break;
    }
  } while(!finished);

  traces[destIdx].states = states;

  return traces[destIdx];
}

ShaderDebugTrace D3D11Replay::DebugThread(uint32_t eventId, const uint32_t groupid[3],
                                          const uint32_t threadid[3])
{
  using namespace DXBC;
  using namespace ShaderDebug;

  D3D11MarkerRegion simloop(StringFormat::Fmt("DebugThread @ %u: [%u, %u, %u] (%u, %u, %u)",
                                              eventId, groupid[0], groupid[1], groupid[2],
                                              threadid[0], threadid[1], threadid[2]));

  ShaderDebugTrace empty;

  D3D11RenderStateTracker tracker(m_pImmediateContext);

  ID3D11ComputeShader *stateCS = NULL;
  m_pImmediateContext->CSGetShader(&stateCS, NULL, NULL);

  WrappedID3D11Shader<ID3D11ComputeShader> *cs = (WrappedID3D11Shader<ID3D11ComputeShader> *)stateCS;

  SAFE_RELEASE(stateCS);

  if(!cs)
    return empty;

  DXBCFile *dxbc = cs->GetDXBC();

  if(!dxbc)
    return empty;

  dxbc->GetDisassembly();

  D3D11RenderState *rs = m_pImmediateContext->GetCurrentPipelineState();

  bytebuf cbufData[D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT];

  for(int i = 0; i < D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT; i++)
    if(rs->CS.ConstantBuffers[i])
      GetDebugManager()->GetBufferData(rs->CS.ConstantBuffers[i],
                                       rs->CS.CBOffsets[i] * sizeof(Vec4f), 0, cbufData[i]);

  ShaderDebugTrace ret;

  GlobalState global;
  GetDebugManager()->CreateShaderGlobalState(global, dxbc, 0, rs->CSUAVs, rs->CS.SRVs);
  State initialState = GetDebugManager()->CreateShaderDebugState(ret, -1, dxbc, cbufData);

  for(int i = 0; i < 3; i++)
  {
    initialState.semantics.GroupID[i] = groupid[i];
    initialState.semantics.ThreadID[i] = threadid[i];
  }

  vector<ShaderDebugState> states;

  dxbc->m_DebugInfo->GetStack(0, dxbc->GetInstruction(0).offset, initialState.callstack);

  states.push_back((State)initialState);

  for(int cycleCounter = 0;; cycleCounter++)
  {
    if(initialState.Finished())
      break;

    initialState = initialState.GetNext(global, NULL);

    {
      const ASMOperation &op = dxbc->GetInstruction((size_t)initialState.nextInstruction);
      dxbc->m_DebugInfo->GetStack(initialState.nextInstruction, op.offset, initialState.callstack);
    }

    states.push_back((State)initialState);

    if(cycleCounter == SHADER_DEBUG_WARN_THRESHOLD)
    {
      if(PromptDebugTimeout(DXBC::TYPE_VERTEX, cycleCounter))
        break;
    }
  }

  ret.states = states;

  return ret;
}