renderdoc/renderdoc/driver/vulkan/vk_postvs.cpp

/******************************************************************************
 * The MIT License (MIT)
 *
 * Copyright (c) 2018 Baldur Karlsson
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 ******************************************************************************/

#include <float.h>
#include "3rdparty/glslang/SPIRV/spirv.hpp"
#include "driver/shaders/spirv/spirv_common.h"
#include "driver/shaders/spirv/spirv_editor.h"
#include "vk_core.h"
#include "vk_debug.h"
#include "vk_shader_cache.h"

// Entry point name for the patched compute shader. NOTE(review): not referenced in the visible part
// of this file; presumably it must match the "rdc" name that ConvertToMeshOutputCompute writes into
// the rewritten OpEntryPoint - confirm against the pipeline creation code.
static const char *PatchedMeshOutputEntryPoint = "rdc";
// Local size in X declared (via OpExecutionMode LocalSize) for the patched compute entry point.
static const uint32_t MeshOutputDispatchWidth = 128;
// Number of elements declared in each of the float/uint/sint texel buffer arrays that the patched
// shader fetches vertex inputs from.
static const uint32_t MeshOutputTBufferArraySize = 16;

// Patches a vertex shader SPIR-V module into a compute shader that emulates vertex processing and
// writes every vertex output into a storage buffer.
//
// The patched module:
//  - has all Input/Output storage class pointers and variables rewritten to Private, with the
//    builtin/location/block/invariant decorations stripped;
//  - declares, in descriptor set 'descSet':
//      binding 0      - the output buffer (runtime array of structs, one member per output),
//      binding 1      - the index texel buffer (only if the draw uses an index buffer),
//      bindings 2/3/4 - arrays of float/uint/sint texel buffers used to fetch vertex inputs;
//  - has a single GLCompute entry point named "rdc" with local size (MeshOutputDispatchWidth,1,1)
//    which works out the instance and vertex from gl_GlobalInvocationID.x, fills in the (now
//    private) inputs, calls the original entry point function and copies the outputs to the buffer.
//
// refl          - reflection for the shader, inputSignature/outputSignature drive what is patched.
// patchData     - for each signature element, the ID of its variable and the access chain (if any)
//                 to reach it inside a composite.
// entryName     - name of the original entry point whose function is called by the wrapper.
// isInstanced   - indexed by input location; true means the input is fetched with the instance
//                 index instead of the vertex index. Locations past the end count as per-vertex.
// descSet       - [out] the descriptor set index used, one past the highest set decorated in the
//                 shader.
// draw          - the drawcall; its flags, numInstances, vertexOffset and instanceOffset are used.
//                 Must not be NULL.
// indexOffset   - constant added to the output array slot of every vertex.
// numFetchVerts - number of invocations (vertices to process) per instance.
// numVerts      - number of output array slots reserved per instance.
// modSpirv      - the SPIR-V words, edited through a SPIRVEditor constructed over this vector.
// bufStride     - [out] byte stride of one vertex struct in the output buffer.
static void ConvertToMeshOutputCompute(const ShaderReflection &refl, const SPIRVPatchData &patchData,
                                       const char *entryName, std::vector<bool> isInstanced,
                                       uint32_t &descSet, const DrawcallDescription *draw,
                                       int32_t indexOffset, uint64_t numFetchVerts, uint32_t numVerts,
                                       std::vector<uint32_t> &modSpirv, uint32_t &bufStride)
{
  SPIRVEditor editor(modSpirv);

  uint32_t numInputs = (uint32_t)refl.inputSignature.size();

  uint32_t numOutputs = (uint32_t)refl.outputSignature.size();
  RDCASSERT(numOutputs > 0);

  descSet = 0;

  for(SPIRVIterator it = editor.BeginDecorations(), end = editor.EndDecorations(); it != end; ++it)
  {
    // we will use the descriptor set immediately after the last set statically used by the shader.
    // This means we don't have to worry about if the descriptor set layout declares more sets which
    // might be invalid and un-bindable, we just trample over the next set that's unused.
    // This is much easier than trying to add a new bind to an existing descriptor set (which would
    // cascade into a new descriptor set layout, new pipeline layout, etc etc!). However, this might
    // push us over the limit on number of descriptor sets.
    if(it.opcode() == spv::OpDecorate && it.word(2) == spv::DecorationDescriptorSet)
      descSet = RDCMAX(descSet, it.word(3) + 1);
  }

  // tbuffer types, the values are the descriptor bindings
  enum tbufferType
  {
    tbuffer_undefined,
    tbuffer_float = 2,
    tbuffer_uint = 3,
    tbuffer_sint = 4,
    tbuffer_count,
  };

  struct inputOutputIDs
  {
    // if this is a builtin value, what builtin value is expected
    ShaderBuiltin builtin = ShaderBuiltin::Undefined;
    // ID of the variable
    SPIRVId variableID;
    // constant ID for the index of this attribute
    SPIRVId constID;
    // the type ID for this attribute. Must be present already by definition!
    SPIRVId basetypeID;
    // tbuffer type for this input. Stays undefined for component types we can't fetch (doubles)
    tbufferType tbuffer = tbuffer_undefined;
    // gvec4 type for this input, used as result type when fetching from tbuffer
    uint32_t vec4ID = 0;
    // Uniform Pointer ID for this output. Used only for output data, to write to output SSBO
    SPIRVId uniformPtrID;
    // Output Pointer ID for this attribute.
    // For inputs, used to 'write' to the global at the start.
    // For outputs, used to 'read' from the global at the end.
    SPIRVId privatePtrID;
  };
  std::vector<inputOutputIDs> ins;
  ins.resize(numInputs);
  std::vector<inputOutputIDs> outs;
  outs.resize(numOutputs);

  // IDs of input/output pointer types and variables (and output struct types), used below to
  // decide which decorations and names belong to the shader interface
  std::set<SPIRVId> inputs;
  std::set<SPIRVId> outputs;

  // maps a removed input/output pointer type to the pre-existing private pointer type replacing it
  std::map<SPIRVId, SPIRVId> typeReplacements;

  // rewrite any inputs and outputs to be private storage class
  for(SPIRVIterator it = editor.BeginTypes(), end = editor.EndTypes(); it != end; ++it)
  {
    // rewrite any input/output variables to private, and build up inputs/outputs list
    if(it.opcode() == spv::OpTypePointer)
    {
      // stays 0 for pointers that are neither input nor output, which are left untouched
      SPIRVId id = 0;

      if(it.word(2) == spv::StorageClassInput)
      {
        id = it.word(1);
        inputs.insert(id);
      }
      else if(it.word(2) == spv::StorageClassOutput)
      {
        id = it.word(1);
        outputs.insert(id);

        SPIRVId baseId = it.word(3);

        // also track output structs, so their Block decoration gets removed later
        SPIRVIterator baseIt = editor.GetID(baseId);
        if(baseIt && baseIt.opcode() == spv::OpTypeStruct)
          outputs.insert(baseId);
      }

      if(id)
      {
        SPIRVPointer privPtr(it.word(3), spv::StorageClassPrivate);

        SPIRVId origId = editor.GetType(privPtr);

        if(origId)
        {
          // if we already had a private pointer for this type, we have to use that type - we can't
          // create a new type by aliasing. Thus we need to replace any uses of 'id' with 'origId'.
          typeReplacements[id] = origId;

          // and remove this type declaration
          editor.Remove(it);
        }
        else
        {
          editor.PreModify(it);

          it.word(2) = spv::StorageClassPrivate;

          // if we didn't already have this pointer, process the modified type declaration
          editor.PostModify(it);
        }
      }
    }
    else if(it.opcode() == spv::OpVariable)
    {
      bool mod = false;

      if(it.word(3) == spv::StorageClassInput)
      {
        mod = true;
        editor.PreModify(it);
        it.word(3) = spv::StorageClassPrivate;

        inputs.insert(it.word(2));
      }
      else if(it.word(3) == spv::StorageClassOutput)
      {
        mod = true;
        editor.PreModify(it);
        it.word(3) = spv::StorageClassPrivate;

        outputs.insert(it.word(2));
      }

      auto replIt = typeReplacements.find(it.word(1));
      if(replIt != typeReplacements.end())
      {
        // only notify the editor once - PreModify must come before the first change to the op
        if(!mod)
          editor.PreModify(it);
        mod = true;
        it.word(1) = replIt->second;
      }

      if(mod)
        editor.PostModify(it);
    }
    else if(it.opcode() == spv::OpTypeFunction)
    {
      bool mod = false;

      // word 1 is the result ID of the function type itself, word 2 is the return type and words
      // 3 onwards are the parameter types. Any of the latter could be a replaced pointer type.
      for(size_t i = 2; i < it.size(); i++)
      {
        auto replIt = typeReplacements.find(it.word(i));
        if(replIt != typeReplacements.end())
        {
          if(!mod)
            editor.PreModify(it);
          mod = true;
          it.word(i) = replIt->second;
        }
      }

      if(mod)
        editor.PostModify(it);
    }
    else if(it.opcode() == spv::OpConstantNull)
    {
      auto replIt = typeReplacements.find(it.word(1));
      if(replIt != typeReplacements.end())
      {
        editor.PreModify(it);
        it.word(1) = replIt->second;
        editor.PostModify(it);
      }
    }
  }

  for(SPIRVIterator it = editor.BeginFunctions(); it; ++it)
  {
    // identify functions with result types we might want to replace
    if(it.opcode() == spv::OpFunction || it.opcode() == spv::OpFunctionParameter ||
       it.opcode() == spv::OpVariable || it.opcode() == spv::OpAccessChain ||
       it.opcode() == spv::OpInBoundsAccessChain || it.opcode() == spv::OpBitcast ||
       it.opcode() == spv::OpUndef || it.opcode() == spv::OpExtInst ||
       it.opcode() == spv::OpFunctionCall || it.opcode() == spv::OpPhi)
    {
      editor.PreModify(it);

      // for all of these opcodes word 1 is the result type
      uint32_t &id = it.word(1);
      auto replIt = typeReplacements.find(id);
      if(replIt != typeReplacements.end())
        id = replIt->second;

      editor.PostModify(it);
    }
  }

  // detect builtin inputs or outputs, and remove builtin decorations
  for(SPIRVIterator it = editor.BeginDecorations(), end = editor.EndDecorations(); it != end; ++it)
  {
    // remove any builtin decorations
    if(it.opcode() == spv::OpDecorate && it.word(2) == spv::DecorationBuiltIn)
    {
      SPIRVId id = it.word(1);

      if(outputs.find(id) != outputs.end())
      {
        // outputs we don't have to do anything, discard the builtin information
      }
      else if(inputs.find(id) != inputs.end())
      {
        // for inputs, record the variable ID for this builtin
        for(size_t i = 0; i < refl.inputSignature.size(); i++)
        {
          const SigParameter &sig = refl.inputSignature[i];

          if(sig.systemValue ==
             BuiltInToSystemAttribute(ShaderStage::Vertex, (spv::BuiltIn)it.word(3)))
          {
            ins[i].variableID = id;
            break;
          }
        }
      }

      editor.Remove(it);
    }

    if(it.opcode() == spv::OpMemberDecorate && it.word(3) == spv::DecorationBuiltIn)
      editor.Remove(it);

    // remove block decoration from input or output structs
    if(it.opcode() == spv::OpDecorate && it.word(2) == spv::DecorationBlock)
    {
      SPIRVId id = it.word(1);

      if(outputs.find(id) != outputs.end() || inputs.find(id) != inputs.end())
        editor.Remove(it);
    }

    // remove all invariant decorations
    if(it.opcode() == spv::OpDecorate && it.word(2) == spv::DecorationInvariant)
    {
      editor.Remove(it);
    }

    if(it.opcode() == spv::OpDecorate && it.word(2) == spv::DecorationLocation)
    {
      SPIRVId id = it.word(1);

      if(outputs.find(id) != outputs.end())
      {
        // outputs we don't have to do anything, discard the location information
      }
      else if(inputs.find(id) != inputs.end())
      {
        // for inputs, record the variable ID for this location
        for(size_t i = 0; i < refl.inputSignature.size(); i++)
        {
          const SigParameter &sig = refl.inputSignature[i];

          if(sig.systemValue == ShaderBuiltin::Undefined && sig.regIndex == it.word(3))
          {
            ins[i].variableID = id;
            break;
          }
        }
      }

      editor.Remove(it);
    }
  }

  // ID of the function for the original entry point that the wrapper will call
  SPIRVId entryID = 0;

  // IDs of all original entry points, whose debug names are removed below
  std::set<SPIRVId> entries;

  for(const SPIRVEntry &entry : editor.GetEntries())
  {
    if(entry.name == entryName)
      entryID = entry.id;

    entries.insert(entry.id);
  }

  RDCASSERT(entryID);

  for(SPIRVIterator it = editor.BeginDebug(), end2 = editor.EndDebug(); it != end2; ++it)
  {
    // rename the former inputs/outputs so it's clear they are now emulated
    if(it.opcode() == spv::OpName &&
       (inputs.find(it.word(1)) != inputs.end() || outputs.find(it.word(1)) != outputs.end()))
    {
      SPIRVId id = it.word(1);
      std::string oldName = (const char *)&it.word(2);
      editor.Remove(it);
      editor.SetName(id, ("emulated_" + oldName).c_str());
    }

    // remove any OpName for the old entry points
    if(it.opcode() == spv::OpName && entries.find(it.word(1)) != entries.end())
      editor.Remove(it);
  }

  // declare necessary variables per-output, types and constants. We do this last so that we don't
  // add a private pointer that we later try and deduplicate when collapsing output/input pointers
  // to private
  for(uint32_t i = 0; i < numOutputs; i++)
  {
    inputOutputIDs &io = outs[i];

    io.builtin = refl.outputSignature[i].systemValue;

    // constant for this index
    io.constID = editor.AddConstantImmediate(i);

    io.variableID = patchData.outputs[i].ID;

    // base type - either a scalar or a vector, since matrix outputs are decayed to vectors
    {
      SPIRVScalar scalarType = scalar<uint32_t>();

      if(refl.outputSignature[i].compType == CompType::UInt)
        scalarType = scalar<uint32_t>();
      else if(refl.outputSignature[i].compType == CompType::SInt)
        scalarType = scalar<int32_t>();
      else if(refl.outputSignature[i].compType == CompType::Float)
        scalarType = scalar<float>();
      else if(refl.outputSignature[i].compType == CompType::Double)
        scalarType = scalar<double>();

      io.vec4ID = editor.DeclareType(SPIRVVector(scalarType, 4));

      if(refl.outputSignature[i].compCount > 1)
        io.basetypeID =
            editor.DeclareType(SPIRVVector(scalarType, refl.outputSignature[i].compCount));
      else
        io.basetypeID = editor.DeclareType(scalarType);
    }

    io.uniformPtrID = editor.DeclareType(SPIRVPointer(io.basetypeID, spv::StorageClassUniform));
    io.privatePtrID = editor.DeclareType(SPIRVPointer(io.basetypeID, spv::StorageClassPrivate));

    RDCASSERT(io.basetypeID && io.vec4ID && io.constID && io.privatePtrID && io.uniformPtrID,
              io.basetypeID, io.vec4ID, io.constID, io.privatePtrID, io.uniformPtrID);
  }

  // repeat for inputs
  for(uint32_t i = 0; i < numInputs; i++)
  {
    inputOutputIDs &io = ins[i];

    io.builtin = refl.inputSignature[i].systemValue;

    // constant for this index
    io.constID = editor.AddConstantImmediate(i);

    SPIRVScalar scalarType = scalar<uint32_t>();

    // base type - either a scalar or a vector, since matrix inputs are decayed to vectors
    if(refl.inputSignature[i].compType == CompType::UInt)
    {
      scalarType = scalar<uint32_t>();
      io.tbuffer = tbuffer_uint;
    }
    else if(refl.inputSignature[i].compType == CompType::SInt)
    {
      scalarType = scalar<int32_t>();
      io.tbuffer = tbuffer_sint;
    }
    else if(refl.inputSignature[i].compType == CompType::Float)
    {
      scalarType = scalar<float>();
      io.tbuffer = tbuffer_float;
    }
    else if(refl.inputSignature[i].compType == CompType::Double)
    {
      // io.tbuffer is left as tbuffer_undefined, so no fetch is generated for this input
      RDCERR("Double inputs are not supported, will be undefined");
      scalarType = scalar<double>();
    }

    io.vec4ID = editor.DeclareType(SPIRVVector(scalarType, 4));

    if(refl.inputSignature[i].compCount > 1)
      io.basetypeID = editor.DeclareType(SPIRVVector(scalarType, refl.inputSignature[i].compCount));
    else
      io.basetypeID = editor.DeclareType(scalarType);

    io.privatePtrID = editor.DeclareType(SPIRVPointer(io.basetypeID, spv::StorageClassPrivate));

    RDCASSERT(io.basetypeID && io.vec4ID && io.constID && io.privatePtrID, io.basetypeID, io.vec4ID,
              io.constID, io.privatePtrID);
  }

  // IDs for each texel buffer array, indexed by tbufferType. Zero-initialised so that the unused
  // slots (including tbuffer_undefined) never contain garbage IDs.
  struct tbufferIDs
  {
    uint32_t imageTypeID;
    uint32_t imageSampledTypeID;
    uint32_t pointerTypeID;
    uint32_t variableID;
  } tbuffers[tbuffer_count] = {};

  uint32_t arraySize = editor.AddConstantImmediate<uint32_t>(MeshOutputTBufferArraySize);

  for(tbufferType tb : {tbuffer_float, tbuffer_sint, tbuffer_uint})
  {
    SPIRVScalar scalarType = scalar<float>();
    const char *name = "float_vbuffers";

    if(tb == tbuffer_sint)
    {
      scalarType = scalar<int32_t>();
      name = "int_vbuffers";
    }
    else if(tb == tbuffer_uint)
    {
      scalarType = scalar<uint32_t>();
      name = "uint_vbuffers";
    }

    tbuffers[tb].imageTypeID = editor.DeclareType(
        SPIRVImage(scalarType, spv::DimBuffer, 0, 0, 0, 1, spv::ImageFormatUnknown));
    tbuffers[tb].imageSampledTypeID = editor.DeclareType(SPIRVSampledImage(tbuffers[tb].imageTypeID));

    // sampledimage xxx_vbuffers[MeshOutputTBufferArraySize];
    uint32_t arrayType = editor.MakeId();
    editor.AddType(
        SPIRVOperation(spv::OpTypeArray, {arrayType, tbuffers[tb].imageSampledTypeID, arraySize}));

    uint32_t arrayPtrType =
        editor.DeclareType(SPIRVPointer(arrayType, spv::StorageClassUniformConstant));

    // pointer to a single element, used as the result type when access-chaining into the array
    tbuffers[tb].pointerTypeID = editor.DeclareType(
        SPIRVPointer(tbuffers[tb].imageSampledTypeID, spv::StorageClassUniformConstant));

    tbuffers[tb].variableID = editor.MakeId();
    editor.AddVariable(SPIRVOperation(
        spv::OpVariable, {arrayPtrType, tbuffers[tb].variableID, spv::StorageClassUniformConstant}));

    editor.SetName(tbuffers[tb].variableID, name);

    // the binding is the tbufferType enum value itself
    editor.AddDecoration(SPIRVOperation(
        spv::OpDecorate, {tbuffers[tb].variableID, (uint32_t)spv::DecorationDescriptorSet, descSet}));
    editor.AddDecoration(SPIRVOperation(
        spv::OpDecorate, {tbuffers[tb].variableID, (uint32_t)spv::DecorationBinding, (uint32_t)tb}));
  }

  SPIRVId uint32Vec4ID = 0;
  SPIRVId idxImageTypeID = 0;
  SPIRVId idxImagePtr = 0;
  SPIRVId idxSampledTypeID = 0;

  // for indexed draws, declare the uint texel buffer containing the indices at binding 1
  if(draw->flags & DrawFlags::UseIBuffer)
  {
    uint32Vec4ID = editor.DeclareType(SPIRVVector(scalar<uint32_t>(), 4));

    idxImageTypeID = editor.DeclareType(
        SPIRVImage(scalar<uint32_t>(), spv::DimBuffer, 0, 0, 0, 1, spv::ImageFormatUnknown));
    idxSampledTypeID = editor.DeclareType(SPIRVSampledImage(idxImageTypeID));

    uint32_t idxImagePtrType =
        editor.DeclareType(SPIRVPointer(idxSampledTypeID, spv::StorageClassUniformConstant));

    idxImagePtr = editor.MakeId();
    editor.AddVariable(SPIRVOperation(
        spv::OpVariable, {idxImagePtrType, idxImagePtr, spv::StorageClassUniformConstant}));

    editor.SetName(idxImagePtr, "ibuffer");

    editor.AddDecoration(SPIRVOperation(
        spv::OpDecorate, {idxImagePtr, (uint32_t)spv::DecorationDescriptorSet, descSet}));
    editor.AddDecoration(
        SPIRVOperation(spv::OpDecorate, {idxImagePtr, (uint32_t)spv::DecorationBinding, 1}));
  }

  if(numInputs > 0)
  {
    editor.AddCapability(spv::CapabilitySampledBuffer);
  }

  SPIRVId outBufferVarID = 0;
  SPIRVId numFetchVertsConstID = editor.AddConstantImmediate((int32_t)numFetchVerts);
  SPIRVId numVertsConstID = editor.AddConstantImmediate((int32_t)numVerts);
  SPIRVId numInstConstID = editor.AddConstantImmediate((int32_t)draw->numInstances);

  editor.SetName(numFetchVertsConstID, "numFetchVerts");
  editor.SetName(numVertsConstID, "numVerts");
  editor.SetName(numInstConstID, "numInsts");

  // declare the output buffer and its type
  {
    std::vector<uint32_t> words;
    for(uint32_t o = 0; o < numOutputs; o++)
      words.push_back(outs[o].basetypeID);

    // struct vertex { ... outputs };
    SPIRVId vertStructID = editor.DeclareStructType(words);
    editor.SetName(vertStructID, "vertex_struct");

    // vertex vertArray[];
    SPIRVId runtimeArrayID =
        editor.AddType(SPIRVOperation(spv::OpTypeRuntimeArray, {editor.MakeId(), vertStructID}));
    editor.SetName(runtimeArrayID, "vertex_array");

    // struct meshOutput { vertex vertArray[]; };
    SPIRVId outputStructID = editor.DeclareStructType({runtimeArrayID});
    editor.SetName(outputStructID, "meshOutput");

    // meshOutput *
    SPIRVId outputStructPtrID =
        editor.DeclareType(SPIRVPointer(outputStructID, spv::StorageClassUniform));
    editor.SetName(outputStructPtrID, "meshOutput_ptr");

    // meshOutput *outputData;
    outBufferVarID = editor.AddVariable(SPIRVOperation(
        spv::OpVariable, {outputStructPtrID, editor.MakeId(), spv::StorageClassUniform}));
    editor.SetName(outBufferVarID, "outputData");

    uint32_t memberOffset = 0;
    for(uint32_t o = 0; o < numOutputs; o++)
    {
      uint32_t elemSize = 0;
      if(refl.outputSignature[o].compType == CompType::Double)
        elemSize = 8;
      else if(refl.outputSignature[o].compType == CompType::SInt ||
              refl.outputSignature[o].compType == CompType::UInt ||
              refl.outputSignature[o].compType == CompType::Float)
        elemSize = 4;
      else
        RDCERR("Unexpected component type for output signature element");

      uint32_t numComps = refl.outputSignature[o].compCount;

      // ensure member is std430 packed (vec4 alignment for vec3/vec4)
      if(numComps == 2)
        memberOffset = AlignUp(memberOffset, 2U * elemSize);
      else if(numComps > 2)
        memberOffset = AlignUp(memberOffset, 4U * elemSize);

      // apply decoration to each member in the struct with its offset in the struct
      editor.AddDecoration(SPIRVOperation(spv::OpMemberDecorate,
                                          {vertStructID, o, spv::DecorationOffset, memberOffset}));

      memberOffset += elemSize * refl.outputSignature[o].compCount;
    }

    // align to 16 bytes (vec4) since we will almost certainly have
    // a vec4 in the struct somewhere, and even in std430 alignment,
    // the base struct alignment is still the largest base alignment
    // of any member
    bufStride = AlignUp16(memberOffset);

    // the array is the only element in the output struct, so
    // it's at offset 0
    editor.AddDecoration(
        SPIRVOperation(spv::OpMemberDecorate, {outputStructID, 0, spv::DecorationOffset, 0}));

    // set array stride
    editor.AddDecoration(
        SPIRVOperation(spv::OpDecorate, {runtimeArrayID, spv::DecorationArrayStride, bufStride}));

    // set object type
    editor.AddDecoration(
        SPIRVOperation(spv::OpDecorate, {outputStructID, spv::DecorationBufferBlock}));

    // set binding
    editor.AddDecoration(
        SPIRVOperation(spv::OpDecorate, {outBufferVarID, spv::DecorationDescriptorSet, descSet}));
    editor.AddDecoration(SPIRVOperation(spv::OpDecorate, {outBufferVarID, spv::DecorationBinding, 0}));
  }

  // declare gl_GlobalInvocationID, the only real input of the compute shader
  SPIRVId uint32Vec3ID = editor.DeclareType(SPIRVVector(scalar<uint32_t>(), 3));
  SPIRVId invocationPtr = editor.DeclareType(SPIRVPointer(uint32Vec3ID, spv::StorageClassInput));
  SPIRVId invocationId = editor.AddVariable(
      SPIRVOperation(spv::OpVariable, {invocationPtr, editor.MakeId(), spv::StorageClassInput}));
  editor.AddDecoration(SPIRVOperation(
      spv::OpDecorate, {invocationId, spv::DecorationBuiltIn, spv::BuiltInGlobalInvocationId}));

  editor.SetName(invocationId, "rdoc_invocation");

  // make a new entry point that will call the old function, then when it returns extract & write
  // the outputs.
  SPIRVId wrapperEntry = editor.MakeId();
  // don't set a debug name, as some drivers get confused when this doesn't match the entry point
  // name :(.
  // editor.SetName(wrapperEntry, "RenderDoc_MeshFetch_Wrapper_Entrypoint");

  // we remove all entry points and just create one of our own.
  SPIRVIterator it = editor.BeginEntries();

  {
    // there should already have been at least one entry point
    RDCASSERT(it.opcode() == spv::OpEntryPoint);
    // and it should have been at least 5 words (if not more) since a vertex shader cannot function
    // without at least one interface ID. We only need one, so there should be plenty space.
    RDCASSERT(it.size() >= 5);

    editor.PreModify(it);

    SPIRVOperation op(it);

    // nop out everything past the first 5 words, which we overwrite in place
    op.nopRemove(5);

    op[1] = spv::ExecutionModelGLCompute;
    op[2] = wrapperEntry;
    // the entry point name "rdc" packed into a single word, including the NULL terminator
    op[3] = MAKE_FOURCC('r', 'd', 'c', 0);
    op[4] = invocationId;

    editor.PostModify(it);

    ++it;
  }

  for(SPIRVIterator end = editor.EndEntries(); it != end; ++it)
    editor.Remove(it);

  editor.AddOperation(
      it, SPIRVOperation(spv::OpExecutionMode, {wrapperEntry, spv::ExecutionModeLocalSize,
                                                MeshOutputDispatchWidth, 1, 1}));

  SPIRVId uint32ID = editor.DeclareType(scalar<uint32_t>());
  SPIRVId sint32ID = editor.DeclareType(scalar<int32_t>());

  // add the wrapper function
  {
    std::vector<SPIRVOperation> ops;

    SPIRVId voidType = editor.DeclareType(scalar<void>());
    SPIRVId funcType = editor.DeclareType(SPIRVFunction(voidType, {}));

    ops.push_back(SPIRVOperation(spv::OpFunction,
                                 {voidType, wrapperEntry, spv::FunctionControlMaskNone, funcType}));

    ops.push_back(SPIRVOperation(spv::OpLabel, {editor.MakeId()}));
    {
      // uint3 invocationVec = gl_GlobalInvocationID;
      uint32_t invocationVector = editor.MakeId();
      ops.push_back(SPIRVOperation(spv::OpLoad, {uint32Vec3ID, invocationVector, invocationId}));

      // uint invocation = invocationVec.x
      uint32_t invocationID = editor.MakeId();
      ops.push_back(
          SPIRVOperation(spv::OpCompositeExtract, {uint32ID, invocationID, invocationVector, 0U}));

      // int intInvocationID = int(invocation);
      uint32_t intInvocationID = editor.MakeId();
      ops.push_back(SPIRVOperation(spv::OpBitcast, {sint32ID, intInvocationID, invocationID}));

      editor.SetName(intInvocationID, "invocation");

      // int inst = intInvocationID / numFetchVerts
      uint32_t instID = editor.MakeId();
      ops.push_back(
          SPIRVOperation(spv::OpSDiv, {sint32ID, instID, intInvocationID, numFetchVertsConstID}));

      editor.SetName(instID, "instanceID");

      // bool inBounds = inst < numInstances;
      uint32_t inBounds = editor.MakeId();
      ops.push_back(SPIRVOperation(
          spv::OpULessThan, {editor.DeclareType(scalar<bool>()), inBounds, instID, numInstConstID}));

      // if(inBounds) goto continueLabel; else goto killLabel;
      uint32_t killLabel = editor.MakeId();
      uint32_t continueLabel = editor.MakeId();
      ops.push_back(SPIRVOperation(spv::OpSelectionMerge, {killLabel, spv::SelectionControlMaskNone}));
      ops.push_back(SPIRVOperation(spv::OpBranchConditional, {inBounds, continueLabel, killLabel}));

      // continueLabel:
      ops.push_back(SPIRVOperation(spv::OpLabel, {continueLabel}));

      // int vtx = intInvocationID % numFetchVerts
      // this must use the same divisor as the instance calculation above, so that (inst, vtx)
      // is a consistent decomposition of the linear invocation index.
      uint32_t vtx = editor.MakeId();
      ops.push_back(
          SPIRVOperation(spv::OpSMod, {sint32ID, vtx, intInvocationID, numFetchVertsConstID}));

      editor.SetName(vtx, "vertexID");

      uint32_t vertexIndex = vtx;

      // if we're indexing, look up the index buffer. We don't have to apply vertexOffset - it was
      // already applied when we read back and uniq-ified the index buffer.
      if(draw->flags & DrawFlags::UseIBuffer)
      {
        // sampledimage idximg = *idximgPtr;
        uint32_t loaded = editor.MakeId();
        ops.push_back(SPIRVOperation(spv::OpLoad, {idxSampledTypeID, loaded, idxImagePtr}));

        // image rawimg = imageFromSampled(idximg);
        uint32_t rawimg = editor.MakeId();
        ops.push_back(SPIRVOperation(spv::OpImage, {idxImageTypeID, rawimg, loaded}));

        // uvec4 result = texelFetch(rawimg, vtxID);
        uint32_t result = editor.MakeId();
        ops.push_back(SPIRVOperation(spv::OpImageFetch, {uint32Vec4ID, result, rawimg, vertexIndex}));

        // uint vtxID = result.x;
        uint32_t uintIndex = editor.MakeId();
        ops.push_back(SPIRVOperation(spv::OpCompositeExtract, {uint32ID, uintIndex, result, 0}));

        // int vertexIndex = int(vtxID);
        vertexIndex = editor.MakeId();
        ops.push_back(SPIRVOperation(spv::OpBitcast, {sint32ID, vertexIndex, uintIndex}));
      }

      // int arraySlotID = inst * numVerts;
      uint32_t arraySlotTempID = editor.MakeId();
      ops.push_back(SPIRVOperation(spv::OpIMul, {sint32ID, arraySlotTempID, instID, numVertsConstID}));

      // arraySlotID = arraySlotID + vertexIndex;
      uint32_t arraySlotTemp2ID = editor.MakeId();
      ops.push_back(
          SPIRVOperation(spv::OpIAdd, {sint32ID, arraySlotTemp2ID, arraySlotTempID, vertexIndex}));

      // arraySlotID = arraySlotID + indexOffset;
      uint32_t arraySlotID = editor.MakeId();
      ops.push_back(SPIRVOperation(spv::OpIAdd, {sint32ID, arraySlotID, arraySlotTemp2ID,
                                                 editor.AddConstantImmediate(indexOffset)}));

      editor.SetName(arraySlotID, "arraySlot");

      // we use the current value of vertexIndex (before any vertexOffset is added below) and the
      // 0-based instID to look up per-vertex and per-instance attributes. This is because when we
      // fetched the vertex data, we already advanced by (in non-indexed draws) vertexOffset, and
      // by instanceOffset, rather than fetching data that's only used as padding to be skipped
      // over by these offsets.
      uint32_t vertexLookup = vertexIndex;
      uint32_t instanceLookup = instID;

      if(!(draw->flags & DrawFlags::UseIBuffer))
      {
        // for non-indexed draws, we manually apply the vertex offset, but here after we used the
        // 0-based one to calculate the array slot
        vertexIndex = editor.MakeId();
        ops.push_back(SPIRVOperation(
            spv::OpIAdd, {sint32ID, vertexIndex, vtx,
                          editor.AddConstantImmediate(int32_t(draw->vertexOffset & 0x7fffffff))}));
      }
      editor.SetName(vertexIndex, "vertexIndex");

      // instIndex = inst + instOffset
      uint32_t instIndex = editor.MakeId();
      ops.push_back(SPIRVOperation(
          spv::OpIAdd, {sint32ID, instIndex, instID,
                        editor.AddConstantImmediate(int32_t(draw->instanceOffset & 0x7fffffff))}));
      editor.SetName(instIndex, "instanceIndex");

      // uint32 constants used as array/struct indices in access chains, created on demand and
      // cached by value. A map is used so that no index value can overflow a fixed-size table.
      std::map<uint32_t, uint32_t> indexConsts;
      auto getIndexConst = [&editor, &indexConsts](uint32_t value) -> uint32_t {
        auto found = indexConsts.find(value);
        if(found != indexConsts.end())
          return found->second;

        uint32_t constId = editor.AddConstantImmediate<uint32_t>(value);
        indexConsts[value] = constId;
        return constId;
      };

      for(size_t i = 0; i < refl.inputSignature.size(); i++)
      {
        ShaderBuiltin builtin = refl.inputSignature[i].systemValue;

        if(builtin == ShaderBuiltin::VertexIndex)
        {
          ops.push_back(SPIRVOperation(spv::OpStore, {ins[i].variableID, vertexIndex}));
        }
        else if(builtin == ShaderBuiltin::InstanceIndex)
        {
          ops.push_back(SPIRVOperation(spv::OpStore, {ins[i].variableID, instIndex}));
        }
        else if(builtin != ShaderBuiltin::Undefined)
        {
          RDCERR("Unsupported/unsupported built-in input %s", ToStr(builtin).c_str());
        }
        else
        {
          // inputs with no texel buffer to fetch from (doubles - an error was already printed when
          // declaring the input) are left undefined instead of emitting operations with invalid IDs
          if(ins[i].tbuffer == tbuffer_undefined)
            continue;

          tbufferIDs tb = tbuffers[ins[i].tbuffer];

          uint32_t location = refl.inputSignature[i].regIndex;

          uint32_t ptrId = editor.MakeId();
          // sampledimage *imgPtr = xxx_tbuffers[location];
          ops.push_back(SPIRVOperation(spv::OpAccessChain, {tb.pointerTypeID, ptrId, tb.variableID,
                                                            getIndexConst(location)}));

          // sampledimage img = *imgPtr;
          uint32_t loaded = editor.MakeId();
          ops.push_back(SPIRVOperation(spv::OpLoad, {tb.imageSampledTypeID, loaded, ptrId}));

          // image rawimg = imageFromSampled(img);
          uint32_t rawimg = editor.MakeId();
          ops.push_back(SPIRVOperation(spv::OpImage, {tb.imageTypeID, rawimg, loaded}));

          // vec4 result = texelFetch(rawimg, vtxID or instID);
          uint32_t idx = location < isInstanced.size() && isInstanced[location] ? instanceLookup
                                                                                : vertexLookup;
          uint32_t result = editor.MakeId();
          ops.push_back(SPIRVOperation(spv::OpImageFetch, {ins[i].vec4ID, result, rawimg, idx}));

          // for one component, extract x, for less than 4, extract the sub-vector, otherwise
          // leave alone (4 components)
          if(refl.inputSignature[i].compCount == 1)
          {
            uint32_t swizzleIn = result;
            result = editor.MakeId();

            // baseType value = result.x;
            ops.push_back(
                SPIRVOperation(spv::OpCompositeExtract, {ins[i].basetypeID, result, swizzleIn, 0}));
          }
          else if(refl.inputSignature[i].compCount != 4)
          {
            uint32_t swizzleIn = result;
            result = editor.MakeId();

            std::vector<uint32_t> words = {ins[i].basetypeID, result, swizzleIn, swizzleIn};

            for(uint32_t c = 0; c < refl.inputSignature[i].compCount; c++)
              words.push_back(c);

            // baseTypeN value = result.xyz;
            ops.push_back(SPIRVOperation(spv::OpVectorShuffle, words));
          }

          // not a composite type, we can store directly
          if(patchData.inputs[i].accessChain.empty())
          {
            // *global = value
            ops.push_back(SPIRVOperation(spv::OpStore, {ins[i].variableID, result}));
          }
          else
          {
            // for composite types we need to access chain first
            uint32_t subElement = editor.MakeId();
            std::vector<uint32_t> words = {ins[i].privatePtrID, subElement, patchData.inputs[i].ID};

            for(uint32_t accessIdx : patchData.inputs[i].accessChain)
              words.push_back(getIndexConst(accessIdx));

            ops.push_back(SPIRVOperation(spv::OpAccessChain, words));

            ops.push_back(SPIRVOperation(spv::OpStore, {subElement, result}));
          }
        }
      }

      // real_main();
      ops.push_back(SPIRVOperation(spv::OpFunctionCall, {voidType, editor.MakeId(), entryID}));

      SPIRVId zero = editor.AddConstantImmediate<uint32_t>(0);

      for(uint32_t o = 0; o < numOutputs; o++)
      {
        uint32_t loaded = 0;

        // not a structure member or array child, can load directly
        if(patchData.outputs[o].accessChain.empty())
        {
          loaded = editor.MakeId();
          // type loaded = *globalvar;
          ops.push_back(
              SPIRVOperation(spv::OpLoad, {outs[o].basetypeID, loaded, patchData.outputs[o].ID}));
        }
        else
        {
          uint32_t readPtr = editor.MakeId();
          loaded = editor.MakeId();

          // structure member, need to access chain first
          std::vector<uint32_t> words = {outs[o].privatePtrID, readPtr, patchData.outputs[o].ID};

          // the access chain indices are unrelated to the number of outputs, so fetch a constant
          // for each index value directly instead of borrowing the per-output constants
          for(uint32_t idx : patchData.outputs[o].accessChain)
            words.push_back(getIndexConst(idx));

          // type *readPtr = globalvar.globalsub...;
          ops.push_back(SPIRVOperation(spv::OpAccessChain, words));
          // type loaded = *readPtr;
          ops.push_back(SPIRVOperation(spv::OpLoad, {outs[o].basetypeID, loaded, readPtr}));
        }

        // access chain the destination
        // type *writePtr = outBuffer.verts[arraySlot].outputN
        uint32_t writePtr = editor.MakeId();
        ops.push_back(SPIRVOperation(
            spv::OpAccessChain,
            {outs[o].uniformPtrID, writePtr, outBufferVarID, zero, arraySlotID, outs[o].constID}));

        // *writePtr = loaded;
        ops.push_back(SPIRVOperation(spv::OpStore, {writePtr, loaded}));
      }

      // goto killLabel;
      ops.push_back(SPIRVOperation(spv::OpBranch, {killLabel}));

      // killLabel:
      ops.push_back(SPIRVOperation(spv::OpLabel, {killLabel}));
    }
    ops.push_back(SPIRVOperation(spv::OpReturn, {}));

    ops.push_back(SPIRVOperation(spv::OpFunctionEnd, {}));

    editor.AddFunction(ops.data(), ops.size());
  }

  // remove the nops left behind by all the operations removed above
  editor.StripNops();
}

void VulkanReplay::ClearPostVSCache()
{
  VkDevice dev = m_Device;

  for(auto it = m_PostVSData.begin(); it != m_PostVSData.end(); ++it)
  {
    m_pDriver->vkDestroyBuffer(dev, it->second.vsout.buf, NULL);
    m_pDriver->vkFreeMemory(dev, it->second.vsout.bufmem, NULL);
  }

  m_PostVSData.clear();
}

// Fetches the vertex shader outputs for the drawcall at eventId and caches them in m_PostVSData.
//
// The vertex shader is patched into a compute shader (see ConvertToMeshOutputCompute) which reads
// its inputs from texel buffers and writes every output into a storage buffer. This function:
//  - resolves event aliasing and returns early if the event is already cached or there is no
//    graphics pipeline / render pass / vertex shader bound, or the draw is empty,
//  - for indexed draws, builds a sorted list of the unique indices referenced,
//  - reads back the bound vertex buffers and uploads one compacted buffer per attribute,
//  - creates a compute pipeline from the patched SPIR-V, dispatches it and copies the results to a
//    readback buffer,
//  - estimates near/far planes from the output positions, and
//  - stores the resulting buffer and metadata in m_PostVSData[eventId].
//
// The mesh output buffer and its memory stay alive in the cache (released by ClearPostVSCache);
// every other object created here is destroyed before returning.
void VulkanReplay::InitPostVSBuffers(uint32_t eventId)
{
  // go through any aliasing
  if(m_PostVSAlias.find(eventId) != m_PostVSAlias.end())
    eventId = m_PostVSAlias[eventId];

  if(m_PostVSData.find(eventId) != m_PostVSData.end())
    return;

  const VulkanRenderState &state = m_pDriver->m_RenderState;
  VulkanCreationInfo &creationInfo = m_pDriver->m_CreationInfo;

  if(state.graphics.pipeline == ResourceId() || state.renderPass == ResourceId())
    return;

  const VulkanCreationInfo::Pipeline &pipeInfo = creationInfo.m_Pipeline[state.graphics.pipeline];

  if(pipeInfo.shaders[0].module == ResourceId())
    return;

  const VulkanCreationInfo::ShaderModule &moduleInfo =
      creationInfo.m_ShaderModule[pipeInfo.shaders[0].module];

  ShaderReflection *refl = pipeInfo.shaders[0].refl;

  // no outputs from this shader? unexpected but theoretically possible (dummy VS before
  // tessellation maybe). Just fill out an empty data set
  if(refl->outputSignature.empty())
  {
    // empty vertex output signature
    auto &emptyData = m_PostVSData[eventId];

    emptyData.vsin.topo = pipeInfo.topology;
    emptyData.vsout.buf = VK_NULL_HANDLE;
    emptyData.vsout.instStride = 0;
    emptyData.vsout.vertStride = 0;
    emptyData.vsout.nearPlane = 0.0f;
    emptyData.vsout.farPlane = 0.0f;
    emptyData.vsout.useIndices = false;
    emptyData.vsout.hasPosOut = false;
    emptyData.vsout.idxBuf = ResourceId();

    emptyData.vsout.topo = pipeInfo.topology;

    return;
  }

  const DrawcallDescription *drawcall = m_pDriver->GetDrawcall(eventId);

  if(drawcall == NULL || drawcall->numIndices == 0 || drawcall->numInstances == 0)
    return;

  // the SPIR-V patching will determine the next descriptor set to use, after all sets statically
  // used by the shader. This gets around the problem where the shader only uses 0 and 1, but the
  // layout declares 0-4, and 2,3,4 are invalid at bind time and we are unable to bind our new set
  // 5. Instead we'll notice that only 0 and 1 are used and just use 2 ourselves (although it was
  // in the original set layout, we know it's statically unused by the shader so we can safely
  // steal it).
  uint32_t descSet = 0;

  // we go through the driver for all these creations since they need to be properly
  // registered in order to be put in the partial replay state
  VkResult vkr = VK_SUCCESS;
  VkDevice dev = m_Device;

  VkPipelineLayout pipeLayout;

  VkGraphicsPipelineCreateInfo pipeCreateInfo;

  // get pipeline create info
  m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(pipeCreateInfo, state.graphics.pipeline);

  VkBuffer meshBuffer = VK_NULL_HANDLE, readbackBuffer = VK_NULL_HANDLE;
  VkDeviceMemory meshMem = VK_NULL_HANDLE, readbackMem = VK_NULL_HANDLE;

  VkBuffer uniqIdxBuf = VK_NULL_HANDLE;
  VkDeviceMemory uniqIdxBufMem = VK_NULL_HANDLE;
  VkBufferView uniqIdxBufView = VK_NULL_HANDLE;

  uint32_t numVerts = drawcall->numIndices;
  uint64_t numFetchVerts = drawcall->numIndices;
  VkDeviceSize bufSize = 0;

  uint32_t idxsize = state.ibuffer.bytewidth;

  int32_t baseVertex = 0;

  uint32_t minIndex = 0, maxIndex = RDCMAX(drawcall->baseVertex, 0) + numVerts - 1;

  uint32_t maxInstance = drawcall->instanceOffset + drawcall->numInstances - 1;

  if(drawcall->flags & DrawFlags::UseIBuffer)
  {
    bool index16 = (idxsize == 2);
    bytebuf idxdata;
    std::vector<uint32_t> indices;
    uint16_t *idx16 = NULL;
    uint32_t *idx32 = NULL;

    // fetch ibuffer
    GetBufferData(state.ibuffer.buf, state.ibuffer.offs + drawcall->indexOffset * idxsize,
                  uint64_t(drawcall->numIndices) * idxsize, idxdata);

    // figure out what the maximum index could be, so we can clamp our index buffer to something
    // sane
    uint32_t maxIdx = 0;

    // if there are no active bindings assume the vertex shader is generating its own data
    // and don't clamp the indices
    if(pipeCreateInfo.pVertexInputState->vertexBindingDescriptionCount == 0)
      maxIdx = ~0U;

    for(uint32_t b = 0; b < pipeCreateInfo.pVertexInputState->vertexBindingDescriptionCount; b++)
    {
      const VkVertexInputBindingDescription &input =
          pipeCreateInfo.pVertexInputState->pVertexBindingDescriptions[b];
      // only vertex inputs (not instance inputs) count
      if(input.inputRate == VK_VERTEX_INPUT_RATE_VERTEX)
      {
        // bound vertex buffers are looked up by binding number, which does not have to match the
        // position of the description within the array
        if(input.binding >= state.vbuffers.size())
          continue;

        ResourceId buf = state.vbuffers[input.binding].buf;
        VkDeviceSize offs = state.vbuffers[input.binding].offs;

        VkDeviceSize bufsize = creationInfo.m_Buffer[buf].size;

        // the maximum valid index on this particular input is the one that reaches
        // the end of the buffer. The maximum valid index at all is the one that reads
        // off the end of ALL buffers (so we max it with any other maxindex value
        // calculated).
        if(input.stride > 0)
          maxIdx = RDCMAX(maxIdx, uint32_t((bufsize - offs) / input.stride));
      }
    }

    // in case the vertex buffers were set but had invalid stride (0), max with the number
    // of vertices too. This is fine since the max here is just a conservative limit
    maxIdx = RDCMAX(maxIdx, drawcall->numIndices);

    // do ibuffer rebasing/remapping

    // use data() rather than &idxdata[0] so an empty fetch doesn't index an empty buffer. The
    // pointers are never dereferenced in that case since numIndices below will be 0.
    idx16 = (uint16_t *)idxdata.data();
    idx32 = (uint32_t *)idxdata.data();

    // only read as many indices as were available in the buffer
    uint32_t numIndices =
        RDCMIN(uint32_t(index16 ? idxdata.size() / 2 : idxdata.size() / 4), drawcall->numIndices);

    uint32_t idxclamp = 0;
    if(drawcall->baseVertex < 0)
      idxclamp = uint32_t(-drawcall->baseVertex);

    // grab all vertex indices referenced. Duplicates are removed in one pass afterwards, which
    // avoids a quadratic number of element moves from inserting into a sorted vector.
    indices.reserve(numIndices);

    for(uint32_t i = 0; i < numIndices; i++)
    {
      uint32_t i32 = index16 ? uint32_t(idx16[i]) : idx32[i];

      // apply baseVertex but clamp to 0 (don't allow index to become negative)
      if(i32 < idxclamp)
        i32 = 0;
      else if(drawcall->baseVertex < 0)
        i32 -= idxclamp;
      else if(drawcall->baseVertex > 0)
        i32 += drawcall->baseVertex;

      // we clamp to maxIdx here, to avoid any invalid indices like 0xffffffff
      // from filtering through. Worst case we index to the end of the vertex
      // buffers which is generally much more reasonable
      i32 = RDCMIN(maxIdx, i32);

      indices.push_back(i32);
    }

    // reduce to the sorted set of unique indices
    std::sort(indices.begin(), indices.end());
    indices.erase(std::unique(indices.begin(), indices.end()), indices.end());

    // if we read out of bounds, we'll also have a 0 index being referenced
    // (as 0 is read). Don't insert 0 if we already have 0 though
    if(numIndices < drawcall->numIndices && (indices.empty() || indices[0] != 0))
      indices.insert(indices.begin(), 0);

    // indices can't be empty here: either at least one index was read, or numIndices was 0 which
    // is less than drawcall->numIndices (checked non-zero above) so a 0 was inserted.
    minIndex = indices[0];
    maxIndex = indices[indices.size() - 1];

    // set numVerts
    numVerts = maxIndex - minIndex + 1;
    numFetchVerts = (uint64_t)indices.size();

    // An index buffer could be something like: 500, 520, 518, 553, 554, 556
    // but in our vertex buffer that will be: 0, 20, 18, 53, 54, 56
    // so we add -minIndex as the baseVertex when rendering. The existing baseVertex was 'applied'
    // when we fetched the mesh output so it can be discarded.
    baseVertex = -(int32_t)minIndex;

    // create buffer with unique 0-based indices
    VkBufferCreateInfo bufInfo = {
        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        NULL,
        0,
        indices.size() * sizeof(uint32_t),
        VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    };

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &uniqIdxBuf);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    VkMemoryRequirements mrq = {0};
    m_pDriver->vkGetBufferMemoryRequirements(dev, uniqIdxBuf, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, NULL, mrq.size,
        m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
    };

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &uniqIdxBufMem);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    vkr = m_pDriver->vkBindBufferMemory(dev, uniqIdxBuf, uniqIdxBufMem, 0);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    VkBufferViewCreateInfo viewInfo = {
        VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
        NULL,
        0,
        uniqIdxBuf,
        VK_FORMAT_R32_UINT,
        0,
        VK_WHOLE_SIZE,
    };

    vkr = m_pDriver->vkCreateBufferView(dev, &viewInfo, NULL, &uniqIdxBufView);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    byte *idxData = NULL;
    vkr = m_pDriver->vkMapMemory(m_Device, uniqIdxBufMem, 0, VK_WHOLE_SIZE, 0, (void **)&idxData);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    memcpy(idxData, &indices[0], indices.size() * sizeof(uint32_t));

    m_pDriver->vkUnmapMemory(m_Device, uniqIdxBufMem);
  }

  uint32_t bufStride = 0;
  vector<uint32_t> modSpirv = moduleInfo.spirv.spirv;

  struct CompactedAttrBuffer
  {
    VkDeviceMemory mem;
    VkBuffer buf;
    VkBufferView view;
  };

  std::vector<bool> attrIsInstanced;
  CompactedAttrBuffer vbuffers[64];
  RDCEraseEl(vbuffers);

  {
    VkWriteDescriptorSet descWrites[64];
    uint32_t numWrites = 0;

    RDCEraseEl(descWrites);

    const VkPipelineVertexInputStateCreateInfo *vi = pipeCreateInfo.pVertexInputState;

    RDCASSERT(vi->vertexAttributeDescriptionCount <= MeshOutputTBufferArraySize);

    // we fetch the vertex buffer data up front here since there's a very high chance of either
    // overlap due to interleaved attributes, or no overlap and no wastage due to separate compact
    // attributes.
    bytebuf origVBs[16];

    for(uint32_t vb = 0; vb < vi->vertexBindingDescriptionCount; vb++)
    {
      if(vb >= ARRAY_COUNT(origVBs))
      {
        RDCERR("Too many vertex bindings! Resize array.");
        break;
      }

      const VkVertexInputBindingDescription &vbDesc = vi->pVertexBindingDescriptions[vb];

      // bound vertex buffers are looked up by binding number, not by the position of the
      // description. If nothing is bound there, leave the data for this description empty.
      if(vbDesc.binding >= state.vbuffers.size())
        continue;

      VkDeviceSize offs = state.vbuffers[vbDesc.binding].offs;
      uint64_t len = 0;

      // lengths and offsets are computed in 64-bit so large draws can't wrap around 32 bits
      if(vbDesc.inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
      {
        len = (uint64_t(maxInstance) + 1) * vbDesc.stride;

        offs += uint64_t(drawcall->instanceOffset) * vbDesc.stride;
      }
      else
      {
        len = (uint64_t(maxIndex) + 1) * vbDesc.stride;

        offs += uint64_t(drawcall->vertexOffset) * vbDesc.stride;
      }

      GetBufferData(state.vbuffers[vbDesc.binding].buf, offs, len, origVBs[vb]);
    }

    for(uint32_t i = 0; i < vi->vertexAttributeDescriptionCount; i++)
    {
      const VkVertexInputAttributeDescription &attrDesc = vi->pVertexAttributeDescriptions[i];
      uint32_t attr = attrDesc.location;

      RDCASSERT(attr < 64);
      if(attr >= ARRAY_COUNT(vbuffers))
      {
        RDCERR("Attribute index too high! Resize array.");
        continue;
      }

      bool isInstanced = false;
      size_t stride = 1;

      const byte *origVBBegin = NULL;
      const byte *origVBEnd = NULL;

      // find the fetched data for the binding this attribute reads from
      for(uint32_t vb = 0; vb < vi->vertexBindingDescriptionCount && vb < ARRAY_COUNT(origVBs); vb++)
      {
        const VkVertexInputBindingDescription &vbDesc = vi->pVertexBindingDescriptions[vb];
        if(vbDesc.binding == attrDesc.binding)
        {
          origVBBegin = origVBs[vb].data() + attrDesc.offset;
          origVBEnd = origVBs[vb].data() + origVBs[vb].size();
          stride = vbDesc.stride;
          isInstanced = (vbDesc.inputRate == VK_VERTEX_INPUT_RATE_INSTANCE);
          break;
        }
      }

      RDCASSERT(origVBEnd);

      // in some limited cases, provided we added the UNIFORM_TEXEL_BUFFER usage bit, we could use
      // the original buffers here as-is and read out of them. However it is likely that the offset
      // is not a multiple of the minimum texel buffer offset for at least some of the buffers if
      // not all of them, so we simplify the code here by *always* reading back the vertex buffer
      // data and uploading a compacted version.
      uint32_t elemSize = GetByteSize(1, 1, 1, attrDesc.format, 0);

      {
        VkBufferCreateInfo bufInfo = {
            VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            NULL,
            0,
            uint64_t(elemSize) * (uint64_t(maxIndex) + 1),
            VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        };

        if(isInstanced)
          bufInfo.size = uint64_t(elemSize) * (uint64_t(maxInstance) + 1);

        vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &vbuffers[attr].buf);
        RDCASSERTEQUAL(vkr, VK_SUCCESS);

        VkMemoryRequirements mrq = {0};
        m_pDriver->vkGetBufferMemoryRequirements(dev, vbuffers[attr].buf, &mrq);

        VkMemoryAllocateInfo allocInfo = {
            VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, NULL, mrq.size,
            m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
        };

        vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &vbuffers[attr].mem);
        RDCASSERTEQUAL(vkr, VK_SUCCESS);

        vkr = m_pDriver->vkBindBufferMemory(dev, vbuffers[attr].buf, vbuffers[attr].mem, 0);
        RDCASSERTEQUAL(vkr, VK_SUCCESS);

        byte *compactedData = NULL;
        vkr = m_pDriver->vkMapMemory(m_Device, vbuffers[attr].mem, 0, VK_WHOLE_SIZE, 0,
                                     (void **)&compactedData);
        RDCASSERTEQUAL(vkr, VK_SUCCESS);

        if(compactedData && origVBEnd)
        {
          // pull this attribute's element out of each strided vertex and pack them tightly
          const byte *src = origVBBegin;
          byte *dst = compactedData;
          const byte *dstEnd = dst + bufInfo.size;
          while(src < origVBEnd && dst < dstEnd)
          {
            // the last element may be cut short by the end of the fetched data. Copy only the
            // bytes that exist and zero the rest instead of reading out of bounds.
            const size_t avail = size_t(origVBEnd - src);

            if(avail >= elemSize)
            {
              memcpy(dst, src, elemSize);
            }
            else
            {
              memcpy(dst, src, avail);
              memset(dst + avail, 0, elemSize - avail);
            }

            dst += elemSize;
            src += stride;
          }
        }

        m_pDriver->vkUnmapMemory(m_Device, vbuffers[attr].mem);
      }

      VkBufferViewCreateInfo info = {
          VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
          NULL,
          0,
          vbuffers[attr].buf,
          attrDesc.format,
          0,
          VK_WHOLE_SIZE,
      };

      vkr = m_pDriver->vkCreateBufferView(dev, &info, NULL, &vbuffers[attr].view);
      RDCASSERTEQUAL(vkr, VK_SUCCESS);

      attrIsInstanced.push_back(isInstanced);

      // the destination binding is selected by the attribute's numeric class: 4 for signed
      // integer formats, 3 for unsigned integer formats, 2 for everything else
      descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
      descWrites[numWrites].dstSet = m_MeshFetchDescSet;
      if(IsSIntFormat(attrDesc.format))
        descWrites[numWrites].dstBinding = 4;
      else if(IsUIntFormat(attrDesc.format))
        descWrites[numWrites].dstBinding = 3;
      else
        descWrites[numWrites].dstBinding = 2;
      descWrites[numWrites].dstArrayElement = i;
      descWrites[numWrites].descriptorCount = 1;
      descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
      descWrites[numWrites].pTexelBufferView = &vbuffers[attr].view;
      numWrites++;
    }

    // add a write of the index buffer
    if(uniqIdxBufView != VK_NULL_HANDLE)
    {
      descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
      descWrites[numWrites].dstSet = m_MeshFetchDescSet;
      descWrites[numWrites].dstBinding = 1;
      descWrites[numWrites].dstArrayElement = 0;
      descWrites[numWrites].descriptorCount = 1;
      descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
      descWrites[numWrites].pTexelBufferView = &uniqIdxBufView;
      numWrites++;
    }

    m_pDriver->vkUpdateDescriptorSets(dev, numWrites, descWrites, 0, NULL);
  }

  ConvertToMeshOutputCompute(*refl, *pipeInfo.shaders[0].patchData,
                             pipeInfo.shaders[0].entryPoint.c_str(), attrIsInstanced, descSet,
                             drawcall, baseVertex, numFetchVerts, numVerts, modSpirv, bufStride);

  VkComputePipelineCreateInfo compPipeInfo = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};

  {
    // descSet will be the index of our new descriptor set
    std::vector<VkDescriptorSetLayout> descSetLayouts(descSet + 1);

    for(uint32_t i = 0; i < descSet; i++)
      descSetLayouts[i] = m_pDriver->GetResourceManager()->GetCurrentHandle<VkDescriptorSetLayout>(
          creationInfo.m_PipelineLayout[pipeInfo.layout].descSetLayouts[i]);

    // this layout just says it has one storage buffer
    descSetLayouts[descSet] = m_MeshFetchDescSetLayout;

    std::vector<VkPushConstantRange> push = creationInfo.m_PipelineLayout[pipeInfo.layout].pushRanges;

    // ensure the push range is visible to the compute shader
    for(VkPushConstantRange &range : push)
      range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

    VkPipelineLayoutCreateInfo pipeLayoutInfo = {
        VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        NULL,
        0,
        descSet + 1,
        descSetLayouts.data(),
        (uint32_t)push.size(),
        push.empty() ? NULL : &push[0],
    };

    // create pipeline layout with same descriptor set layouts, plus our mesh output set
    vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    // repoint pipeline layout
    compPipeInfo.layout = pipeLayout;
  }

  // create vertex shader with modified code
  VkShaderModuleCreateInfo moduleCreateInfo = {
      VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, NULL,         0,
      modSpirv.size() * sizeof(uint32_t),          &modSpirv[0],
  };

  VkShaderModule module;
  vkr = m_pDriver->vkCreateShaderModule(dev, &moduleCreateInfo, NULL, &module);
  RDCASSERTEQUAL(vkr, VK_SUCCESS);

  compPipeInfo.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
  compPipeInfo.stage.module = module;
  compPipeInfo.stage.pName = PatchedMeshOutputEntryPoint;
  compPipeInfo.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;

  // create new pipeline
  VkPipeline pipe;
  vkr = m_pDriver->vkCreateComputePipelines(m_Device, VK_NULL_HANDLE, 1, &compPipeInfo, NULL, &pipe);
  RDCASSERTEQUAL(vkr, VK_SUCCESS);

  // make copy of state to draw from
  VulkanRenderState modifiedstate = state;

  // bind created pipeline to partial replay state
  modifiedstate.compute.pipeline = GetResID(pipe);

  // move graphics descriptor sets onto the compute pipe.
  modifiedstate.compute.descSets = modifiedstate.graphics.descSets;

  // push back extra descriptor set to partial replay state
  // note that we examined the used pipeline layout above and inserted our descriptor set
  // after any the application used. So there might be more bound, but we want to ensure to
  // bind to the slot we're using
  modifiedstate.compute.descSets.resize(descSet + 1);
  modifiedstate.compute.descSets[descSet].descSet = GetResID(m_MeshFetchDescSet);

  {
    // create buffer of sufficient size
    // this can't just be bufStride * num unique indices per instance, as we don't
    // have a compact 0-based index to index into the buffer. We must use
    // index-minIndex which is 0-based but potentially sparse, so this buffer may
    // be more or less wasteful.
    // The size is computed in 64-bit so that large meshes don't wrap around 32 bits.
    bufSize = uint64_t(numVerts) * uint64_t(drawcall->numInstances) * uint64_t(bufStride);

    VkBufferCreateInfo bufInfo = {
        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, NULL, 0, bufSize, 0,
    };

    bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &meshBuffer);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &readbackBuffer);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    VkMemoryRequirements mrq = {0};
    m_pDriver->vkGetBufferMemoryRequirements(dev, meshBuffer, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, NULL, mrq.size,
        m_pDriver->GetGPULocalMemoryIndex(mrq.memoryTypeBits),
    };

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &meshMem);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    vkr = m_pDriver->vkBindBufferMemory(dev, meshBuffer, meshMem, 0);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    m_pDriver->vkGetBufferMemoryRequirements(dev, readbackBuffer, &mrq);

    // the readback buffer has different usage flags so its requirements may differ from the mesh
    // buffer's. Take both the size and the memory type from its own requirements.
    allocInfo.allocationSize = mrq.size;
    allocInfo.memoryTypeIndex = m_pDriver->GetReadbackMemoryIndex(mrq.memoryTypeBits);

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &readbackMem);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    vkr = m_pDriver->vkBindBufferMemory(dev, readbackBuffer, readbackMem, 0);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    VkCommandBuffer cmd = m_pDriver->GetNextCmd();

    VkCommandBufferBeginInfo beginInfo = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL,
                                          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT};

    vkr = ObjDisp(dev)->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    // fill destination buffer with a recognisable marker value (0xbaadf00d) so that any vertices
    // the dispatch doesn't write contain known data rather than uninitialised memory
    ObjDisp(dev)->CmdFillBuffer(Unwrap(cmd), Unwrap(meshBuffer), 0, bufInfo.size, 0xbaadf00d);

    VkBufferMemoryBarrier meshbufbarrier = {
        VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        NULL,
        VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
        VK_QUEUE_FAMILY_IGNORED,
        VK_QUEUE_FAMILY_IGNORED,
    };

    meshbufbarrier.size = VK_WHOLE_SIZE;

    VkMemoryBarrier globalbarrier = {
        VK_STRUCTURE_TYPE_MEMORY_BARRIER, NULL,
        VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
    };

    // wait for uploads of index buffer (if used), compacted vertex buffers, and the above fill to
    // finish.
    DoPipelineBarrier(cmd, 1, &globalbarrier);

    // vkUpdateDescriptorSet desc set to point to buffer
    VkDescriptorBufferInfo fetchdesc = {0};
    fetchdesc.buffer = meshBuffer;
    fetchdesc.offset = 0;
    fetchdesc.range = bufInfo.size;

    VkWriteDescriptorSet write = {
        VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, NULL, m_MeshFetchDescSet, 0,   0, 1,
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,      NULL, &fetchdesc,         NULL};
    m_pDriver->vkUpdateDescriptorSets(dev, 1, &write, 0, NULL);

    // do single draw
    modifiedstate.BindPipeline(cmd, VulkanRenderState::BindCompute, true);
    uint64_t totalVerts = numFetchVerts * uint64_t(drawcall->numInstances);

    // the validation layers will probably complain about this dispatch saying some arrays aren't
    // fully updated. That's because they don't statically analyse that only fixed indices are
    // referred to. It's safe to leave unused array indices as invalid descriptors.
    ObjDisp(cmd)->CmdDispatch(Unwrap(cmd), uint32_t(totalVerts / MeshOutputDispatchWidth) + 1, 1, 1);

    // wait for mesh output writing to finish
    meshbufbarrier.buffer = Unwrap(meshBuffer);
    meshbufbarrier.size = bufSize;
    meshbufbarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    meshbufbarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;

    DoPipelineBarrier(cmd, 1, &meshbufbarrier);

    VkBufferCopy bufcopy = {
        0, 0, bufInfo.size,
    };

    // copy to readback buffer
    ObjDisp(dev)->CmdCopyBuffer(Unwrap(cmd), Unwrap(meshBuffer), Unwrap(readbackBuffer), 1, &bufcopy);

    meshbufbarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    meshbufbarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
    meshbufbarrier.buffer = Unwrap(readbackBuffer);

    // wait for copy to finish
    DoPipelineBarrier(cmd, 1, &meshbufbarrier);

    vkr = ObjDisp(dev)->EndCommandBuffer(Unwrap(cmd));
    RDCASSERTEQUAL(vkr, VK_SUCCESS);

    // submit & flush so that we don't have to keep pipeline around for a while
    m_pDriver->SubmitCmds();
    m_pDriver->FlushQ();
  }

  // the compacted attribute buffers are only needed for the dispatch above
  for(CompactedAttrBuffer attrBuf : vbuffers)
  {
    m_pDriver->vkDestroyBufferView(dev, attrBuf.view, NULL);
    m_pDriver->vkDestroyBuffer(dev, attrBuf.buf, NULL);
    m_pDriver->vkFreeMemory(dev, attrBuf.mem, NULL);
  }

  // readback mesh data
  byte *byteData = NULL;
  vkr = m_pDriver->vkMapMemory(m_Device, readbackMem, 0, VK_WHOLE_SIZE, 0, (void **)&byteData);
  RDCASSERTEQUAL(vkr, VK_SUCCESS);

  // do near/far calculations

  float nearp = 0.1f;
  float farp = 100.0f;

  const bool hasPosOut = refl->outputSignature[0].systemValue == ShaderBuiltin::Position;

  Vec4f *pos0 = (Vec4f *)byteData;

  bool found = false;

  // expect position at the start of the buffer, as system values are sorted first
  // and position is the first value. If the map failed there is no data to inspect, so the
  // defaults above are kept.

  for(uint32_t i = 1; byteData && hasPosOut && i < numVerts; i++)
  {
    //////////////////////////////////////////////////////////////////////////////////
    // derive near/far, assuming a standard perspective matrix
    //
    // the transformation from from pre-projection {Z,W} to post-projection {Z,W}
    // is linear. So we can say Zpost = Zpre*m + c . Here we assume Wpre = 1
    // and we know Wpost = Zpre from the perspective matrix.
    // we can then see from the perspective matrix that
    // m = F/(F-N)
    // c = -(F*N)/(F-N)
    //
    // with re-arranging and substitution, we then get:
    // N = -c/m
    // F = c/(1-m)
    //
    // so if we can derive m and c then we can determine N and F. We can do this with
    // two points, and we pick them reasonably distinct on z to reduce floating-point
    // error

    Vec4f *pos = (Vec4f *)(byteData + size_t(i) * bufStride);

    // skip invalid vertices (w=0)
    if(pos->w != 0.0f && fabs(pos->w - pos0->w) > 0.01f && fabs(pos->z - pos0->z) > 0.01f)
    {
      Vec2f A(pos0->w, pos0->z);
      Vec2f B(pos->w, pos->z);

      float m = (B.y - A.y) / (B.x - A.x);
      float c = B.y - B.x * m;

      if(m == 1.0f)
        continue;

      if(-c / m <= 0.000001f)
        continue;

      nearp = -c / m;
      farp = c / (1 - m);

      found = true;

      break;
    }
  }

  // if we didn't find anything, all z's and w's were identical.
  // If the z is positive and w greater for the first element then
  // we detect this projection as reversed z with infinite far plane
  if(!found && byteData && pos0->z > 0.0f && pos0->w > pos0->z)
  {
    nearp = pos0->z;
    farp = FLT_MAX;
  }

  // only unmap if the map above actually succeeded
  if(byteData)
    m_pDriver->vkUnmapMemory(m_Device, readbackMem);

  // clean up temporary memories
  m_pDriver->vkDestroyBuffer(m_Device, readbackBuffer, NULL);
  m_pDriver->vkFreeMemory(m_Device, readbackMem, NULL);

  if(uniqIdxBuf != VK_NULL_HANDLE)
  {
    // destroy the view before the buffer it refers to, then release the backing memory. This is
    // the same order used for the compacted attribute buffers.
    m_pDriver->vkDestroyBufferView(m_Device, uniqIdxBufView, NULL);
    m_pDriver->vkDestroyBuffer(m_Device, uniqIdxBuf, NULL);
    m_pDriver->vkFreeMemory(m_Device, uniqIdxBufMem, NULL);
  }

  // fill out m_PostVSData
  auto &postvs = m_PostVSData[eventId];

  postvs.vsin.topo = pipeCreateInfo.pInputAssemblyState->topology;
  postvs.vsout.topo = pipeCreateInfo.pInputAssemblyState->topology;
  postvs.vsout.buf = meshBuffer;
  postvs.vsout.bufmem = meshMem;

  postvs.vsout.baseVertex = baseVertex + drawcall->baseVertex;

  postvs.vsout.vertStride = bufStride;
  postvs.vsout.nearPlane = nearp;
  postvs.vsout.farPlane = farp;

  postvs.vsout.useIndices = bool(drawcall->flags & DrawFlags::UseIBuffer);
  postvs.vsout.numVerts = drawcall->numIndices;

  postvs.vsout.instStride = 0;
  if(drawcall->flags & DrawFlags::Instanced)
    postvs.vsout.instStride = uint32_t(bufSize / drawcall->numInstances);

  postvs.vsout.idxBuf = ResourceId();
  if(postvs.vsout.useIndices && state.ibuffer.buf != ResourceId())
  {
    postvs.vsout.idxBuf = GetResourceManager()->GetOriginalID(state.ibuffer.buf);
    postvs.vsout.idxOffset = state.ibuffer.offs + drawcall->indexOffset * idxsize;
    postvs.vsout.idxFmt = idxsize == 2 ? VK_INDEX_TYPE_UINT16 : VK_INDEX_TYPE_UINT32;
  }

  postvs.vsout.hasPosOut = hasPosOut;

  // delete pipeline layout
  m_pDriver->vkDestroyPipelineLayout(dev, pipeLayout, NULL);

  // delete pipeline
  m_pDriver->vkDestroyPipeline(dev, pipe, NULL);

  // delete shader/shader module
  m_pDriver->vkDestroyShaderModule(dev, module, NULL);
}
// Drawcall callback that registers itself on the driver for its lifetime and unregisters on
// destruction. Before each draw whose event ID is in the supplied list it asks the replay to fetch
// post-VS buffers, and it forwards aliasing of listed events. All other hooks are no-ops.
struct VulkanInitPostVSCallback : public VulkanDrawcallCallback
{
  VulkanInitPostVSCallback(WrappedVulkan *vk, const vector<uint32_t> &events)
      : m_pDriver(vk), m_Events(events)
  {
    m_pDriver->SetDrawcallCB(this);
  }
  ~VulkanInitPostVSCallback() { m_pDriver->SetDrawcallCB(NULL); }
  void PreDraw(uint32_t eid, VkCommandBuffer cmd)
  {
    if(!IsListedEvent(eid))
      return;

    m_pDriver->GetReplay()->InitPostVSBuffers(eid);
  }

  bool PostDraw(uint32_t eid, VkCommandBuffer cmd) { return false; }
  void PostRedraw(uint32_t eid, VkCommandBuffer cmd) {}
  // Dispatches don't rasterize, so do nothing
  void PreDispatch(uint32_t eid, VkCommandBuffer cmd) {}
  bool PostDispatch(uint32_t eid, VkCommandBuffer cmd) { return false; }
  void PostRedispatch(uint32_t eid, VkCommandBuffer cmd) {}
  // Ditto copy/etc
  void PreMisc(uint32_t eid, DrawFlags flags, VkCommandBuffer cmd) {}
  bool PostMisc(uint32_t eid, DrawFlags flags, VkCommandBuffer cmd) { return false; }
  void PostRemisc(uint32_t eid, DrawFlags flags, VkCommandBuffer cmd) {}
  void AliasEvent(uint32_t primary, uint32_t alias)
  {
    if(!IsListedEvent(primary))
      return;

    m_pDriver->GetReplay()->AliasPostVSBuffers(primary, alias);
  }

  WrappedVulkan *m_pDriver;
  const std::vector<uint32_t> &m_Events;

private:
  // true if the given event ID appears anywhere in the list of events we were asked to process
  bool IsListedEvent(uint32_t eid) const
  {
    for(uint32_t listed : m_Events)
    {
      if(listed == eid)
        return true;
    }

    return false;
  }
};

// Fetches post-VS buffers for a list of events by replaying from the first to the last event with
// a VulkanInitPostVSCallback installed. The replay range is events.front() to events.back().
void VulkanReplay::InitPostVSBuffers(const vector<uint32_t> &events)
{
  // nothing to do for an empty list, and front()/back() on an empty vector is undefined behaviour
  if(events.empty())
    return;

  // first we must replay up to the first event without replaying it. This ensures any
  // non-command buffer calls like memory unmaps etc all happen correctly before this
  // command buffer
  m_pDriver->ReplayLog(0, events.front(), eReplay_WithoutDraw);

  // the callback registers itself with the driver here and unregisters when it goes out of scope
  VulkanInitPostVSCallback cb(m_pDriver, events);

  // now we replay the events, which are guaranteed (because we generated them in
  // GetPassEvents above) to come from the same command buffer, so the event IDs are
  // still locally continuous, even if we jump into replaying.
  m_pDriver->ReplayLog(events.front(), events.back(), eReplay_Full);
}

// Builds a MeshFormat describing the cached post-VS data for the given event, instance and stage.
// If nothing is cached for the event (after resolving aliases) the description is built from
// zero-initialised data.
MeshFormat VulkanReplay::GetPostVSBuffers(uint32_t eventId, uint32_t instID, MeshDataStage stage)
{
  // go through any aliasing
  auto aliasIt = m_PostVSAlias.find(eventId);
  if(aliasIt != m_PostVSAlias.end())
    eventId = aliasIt->second;

  VulkanPostVSData postvs;
  RDCEraseEl(postvs);

  // use the cached data if there is any, otherwise stick with the zeroed struct
  auto cachedIt = m_PostVSData.find(eventId);
  if(cachedIt != m_PostVSData.end())
    postvs = cachedIt->second;

  VulkanPostVSData::StageData s = postvs.GetStage(stage);

  MeshFormat ret;

  // index data is only reported when the draw was indexed and an index buffer was recorded
  const bool hasIndexBuffer = s.useIndices && s.idxBuf != ResourceId();

  ret.indexResourceId = hasIndexBuffer ? s.idxBuf : ResourceId();
  ret.indexByteStride = hasIndexBuffer ? (s.idxFmt == VK_INDEX_TYPE_UINT16 ? 2 : 4) : 0;
  ret.indexByteOffset = s.idxOffset;
  ret.baseVertex = s.baseVertex;

  ret.vertexResourceId = (s.buf != VK_NULL_HANDLE) ? GetResID(s.buf) : ResourceId();

  // each instance's outputs start instStride bytes after the previous instance's
  ret.vertexByteOffset = s.instStride * instID;
  ret.vertexByteStride = s.vertStride;

  // the format reported is always a 4-component 32-bit float
  ret.format.compCount = 4;
  ret.format.compByteWidth = 4;
  ret.format.compType = CompType::Float;
  ret.format.type = ResourceFormatType::Regular;
  ret.format.bgraOrder = false;

  ret.showAlpha = false;

  ret.topology = MakePrimitiveTopology(s.topo, 1);
  ret.numIndices = s.numVerts;

  ret.unproject = s.hasPosOut;
  ret.nearPlane = s.nearPlane;
  ret.farPlane = s.farPlane;

  return ret;
}