/******************************************************************************
* The MIT License (MIT)
*
* Copyright (c) 2020-2026 Baldur Karlsson
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
******************************************************************************/
#include "spirv_debug.h"
#include "common/formatting.h"
#include "common/threading.h"
#include "core/settings.h"
#include "replay/common/var_dispatch_helpers.h"
#include "spirv_op_helpers.h"
#include "spirv_reflect.h"
RDOC_CONFIG(bool, Shader_Debug_SPIRVUseDebugColumnInformation, false,
"Control whether column information should be read from Vulkan debug info.");
RDOC_DEBUG_CONFIG(bool, Shader_Debug_UseJobSystemJobs, false,
"Use individual job system jobs to run shader debugging simulation.");
using namespace rdcshaders;
// this could be cleaner if ShaderVariable wasn't a very public struct, but it's not worth it so
// we just reserve value slots that we know won't be used in opaque variables.
// there's significant wasted space to keep things simple with one property = one slot
static const uint32_t OpaquePointerTypeID = 0x0dd0beef;
enum class PointerFlags
{
RowMajorMatrix = 0x1,
SSBO = 0x2,
GlobalArrayBinding = 0x4,
DereferencedPhysical = 0x8,
};
BITMASK_OPERATORS(PointerFlags);
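// for quick reference, the slot layout used by the accessors below (each
// property occupies one u64 slot in ShaderVariable::value):
//   slot 0 - the actual pointer value
//   slot 1 - type ID (OpaquePointerTypeID, or the real GPU pointer's type)
//   slot 2 - scalar indices carried around from dereferences
//   slot 3 - base ID of the structure, for registering pointer changes
//   slot 4 - PointerFlags
//   slot 5 - matrix stride
//   slot 6 - relative byte offset, or texture type (uses are disjoint)
//   slot 7 - binding array index for global binding arrays
//   slot 8 - ID of the pointer's type, for further buffer type chasing
//   slot 9 - array stride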
// slot 0 for the actual pointer. Shares the same slot as the actual pointer value for GPU pointers
ShaderVariable *getPointer(ShaderVariable &var)
{
return (ShaderVariable *)(uintptr_t)var.value.u64v[0];
}
const ShaderVariable *getPointer(const ShaderVariable &var)
{
return (const ShaderVariable *)(uintptr_t)var.value.u64v[0];
}
void setPointer(ShaderVariable &var, const ShaderVariable *ptr)
{
var.value.u64v[0] = (uint64_t)(uintptr_t)ptr;
}
// slot 1 is the type ID, for opaque pointers this is OpaquePointerTypeID and for real GPU pointers
// this is the type ID of the pointer. We only display this properly for base pointers -
// dereferenced pointers just show the value behind them (otherwise we'd need a pointer type for
// every child element of any pointer type, which is feasible but probably unnecessary)
// slot 2 contains the scalar indices that we carry around from dereferences
void setScalars(ShaderVariable &var, uint8_t scalar0, uint8_t scalar1)
{
var.value.u64v[2] = (scalar0 << 8) | scalar1;
}
rdcpair<uint8_t, uint8_t> getScalars(const ShaderVariable &var)
{
return {uint8_t((var.value.u64v[2] >> 8) & 0xff), uint8_t(var.value.u64v[2] & 0xff)};
}
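// for example, setScalars(var, 1, 2) packs 0x0102 into slot 2, and
// getScalars(var) then returns {1, 2}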
// slot 3 contains the base ID of the structure, for registering pointer changes
void setBaseId(ShaderVariable &var, rdcspv::Id id)
{
var.value.u64v[3] = id.value();
}
rdcspv::Id getBaseId(const ShaderVariable &var)
{
return rdcspv::Id::fromWord((uint32_t)var.value.u64v[3]);
}
bool isUndefPointer(const ShaderVariable &var)
{
return var.value.u64v[4] == 0xccccccccccccccccULL;
}
// slot 4 has the different flags we keep track of
void setPointerFlags(ShaderVariable &var, PointerFlags flags)
{
var.value.u64v[4] = uint32_t(flags);
}
PointerFlags getPointerFlags(const ShaderVariable &var)
{
return (PointerFlags)var.value.u64v[4];
}
void enablePointerFlags(ShaderVariable &var, PointerFlags flags)
{
var.value.u64v[4] = uint32_t((PointerFlags)var.value.u64v[4] | flags);
}
void disablePointerFlags(ShaderVariable &var, PointerFlags flags)
{
var.value.u64v[4] = uint32_t(PointerFlags((PointerFlags)var.value.u64v[4] & ~flags));
}
bool checkPointerFlags(const ShaderVariable &var, PointerFlags flags)
{
return ((PointerFlags)var.value.u64v[4] & flags) == flags;
}
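// note checkPointerFlags() requires that *all* of the given flags are set:
// after enablePointerFlags(var, PointerFlags::SSBO) alone, checking
// PointerFlags::SSBO | PointerFlags::GlobalArrayBinding still returns false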
// slot 5 has the matrix stride
void setMatrixStride(ShaderVariable &var, uint32_t stride)
{
var.value.u64v[5] = stride;
}
uint32_t getMatrixStride(const ShaderVariable &var)
{
return (uint32_t)var.value.u64v[5];
}
// slot 6 has the relative byte offset. For plain bindings the global is created with an offset 0
// and then it's added to for access chains
void setByteOffset(ShaderVariable &var, uint64_t offset)
{
var.value.u64v[6] = offset;
}
uint64_t getByteOffset(const ShaderVariable &var)
{
return var.value.u64v[6];
}
// we also use slot 6 for the texture type (because textures and buffers requiring a byte offset are
// disjoint)
void setTextureType(ShaderVariable &var, rdcspv::DebugAPIWrapper::TextureType type)
{
var.value.u64v[6] = type;
}
rdcspv::DebugAPIWrapper::TextureType getTextureType(const ShaderVariable &var)
{
return (rdcspv::DebugAPIWrapper::TextureType)var.value.u64v[6];
}
// slot 7 contains the binding array index if we indexed into a global binding array
void setBindArrayIndex(ShaderVariable &var, uint32_t arrayIndex)
{
var.value.u64v[7] = arrayIndex;
}
uint32_t getBindArrayIndex(const ShaderVariable &var)
{
return (uint32_t)var.value.u64v[7];
}
// slot 8 contains the ID of the pointer's type, for further buffer type chasing
void setBufferTypeId(ShaderVariable &var, rdcspv::Id id)
{
var.value.u64v[8] = id.value();
}
rdcspv::Id getBufferTypeId(const ShaderVariable &var)
{
return rdcspv::Id::fromWord((uint32_t)var.value.u64v[8]);
}
// slot 9 is the array stride. Can't be shared with matrix stride (slot 5) in the case of matrix arrays.
void setArrayStride(ShaderVariable &var, uint32_t stride)
{
var.value.u64v[9] = stride;
}
uint32_t getArrayStride(const ShaderVariable &var)
{
return (uint32_t)var.value.u64v[9];
}
static ShaderVariable *pointerIfMutable(const ShaderVariable &var)
{
return NULL;
}
static ShaderVariable *pointerIfMutable(ShaderVariable &var)
{
return &var;
}
static uint32_t VarByteSize(const ShaderVariable &var)
{
return VarTypeByteSize(var.type) * RDCMAX(1U, (uint32_t)var.rows) *
RDCMAX(1U, (uint32_t)var.columns);
}
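// e.g. a 4x4 matrix of VarType::Float is 4*4*4 = 64 bytes, while a scalar
// (rows == columns == 1) is just VarTypeByteSize() for its type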
static void *VarElemPointer(ShaderVariable &var, uint32_t comp)
{
RDCASSERTNOTEQUAL(var.type, VarType::Unknown);
byte *ret = (byte *)var.value.u8v.data();
return ret + comp * VarTypeByteSize(var.type);
}
static const void *VarElemPointer(const ShaderVariable &var, uint32_t comp)
{
RDCASSERTNOTEQUAL(var.type, VarType::Unknown);
const byte *ret = (const byte *)var.value.u8v.data();
return ret + comp * VarTypeByteSize(var.type);
}
namespace rdcspv
{
ShaderVariable ThreadDebugBreak(ThreadState &state, uint32_t, const rdcarray<Id> &)
{
state.DebugBreak();
return ShaderVariable("void", 0U, 0U, 0U, 0U);
}
rdcstr GetRawName(Id id)
{
// 32-bit value means at most 10 decimal digits, plus a preceding _, plus trailing NULL.
char name[12] = {};
char *ptr = name + 10;
uint32_t val = id.value();
do
{
*ptr = char('0' + (val % 10));
ptr--;
val /= 10;
} while(val);
*ptr = '_';
return ptr;
}
Id ParseRawName(const rdcstr &name)
{
if(name[0] != '_')
return Id();
uint32_t val = 0;
for(int i = 1; i < name.count(); i++)
{
if(name[i] < '0' || name[i] > '9')
return Id();
val *= 10;
val += uint32_t(name[i] - '0');
}
return Id::fromWord(val);
}
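// these two round-trip: GetRawName(Id::fromWord(1234)) produces "_1234", and
// ParseRawName("_1234") recovers Id::fromWord(1234). Anything that isn't an
// underscore followed by digits parses as the invalid Id()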
void AssignValue(ShaderVariable &dst, const ShaderVariable &src)
{
dst.value = src.value;
RDCASSERTEQUAL(dst.members.size(), src.members.size());
for(size_t i = 0; i < src.members.size(); i++)
AssignValue(dst.members[i], src.members[i]);
}
#if ENABLED(RDOC_RELEASE)
#define CHECK_DEBUGGER_THREAD() \
do \
{ \
} while((void)0, 0)
#else
#define CHECK_DEBUGGER_THREAD() \
RDCASSERTMSG("Debugger function called from non-device thread!", IsDeviceThread());
#endif // #if ENABLED(RDOC_RELEASE)
Debugger::Debugger() : deviceThreadID(Threading::GetCurrentID())
{
}
Debugger::~Debugger()
{
AtomicStore(&atomic_simulationFinished, 1);
Threading::JobSystem::SyncAllJobs();
SAFE_DELETE(apiWrapper);
}
void Debugger::ClampScalars(const ShaderVariable &var, uint8_t &scalar0) const
{
if(scalar0 > var.columns && scalar0 != 0xff)
{
AddDebugMessage(MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
StringFormat::Fmt("Invalid scalar index %u at %u-vector %s. Clamping to %u",
scalar0, var.columns, var.name.c_str(), var.columns - 1));
scalar0 = RDCMIN((uint8_t)1, var.columns) - 1;
}
}
void Debugger::ClampScalars(const ShaderVariable &var, uint8_t &scalar0, uint8_t &scalar1) const
{
if(scalar0 > var.columns && scalar0 != 0xff)
{
AddDebugMessage(
MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
StringFormat::Fmt("Invalid scalar index %u at matrix %s with %u columns. Clamping to %u",
scalar0, var.name.c_str(), var.columns, var.columns - 1));
scalar0 = RDCMIN((uint8_t)1, var.columns) - 1;
}
if(scalar1 > var.rows && scalar1 != 0xff)
{
AddDebugMessage(
MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
StringFormat::Fmt("Invalid scalar index %u at matrix %s with %u rows. Clamping to %u",
scalar1, var.name.c_str(), var.rows, var.rows - 1));
scalar1 = RDCMIN((uint8_t)1, var.rows) - 1;
}
}
void Debugger::Parse(const rdcarray<uint32_t> &spirvWords)
{
Processor::Parse(spirvWords);
}
ConstIter Debugger::GetIterForInstruction(uint32_t inst) const
{
return ConstIter(m_SPIRV, instructionOffsets[inst]);
}
uint32_t Debugger::GetInstructionForIter(ConstIter it) const
{
return instructionOffsets.indexOf(it.offs());
}
uint32_t Debugger::GetInstructionForFunction(Id id) const
{
return instructionOffsets.indexOf(functions[id].begin);
}
uint32_t Debugger::GetInstructionForLabel(Id id) const
{
uint32_t ret = labelInstruction[id];
RDCASSERT(ret);
return ret;
}
const rdcspv::DataType &Debugger::GetType(Id typeId) const
{
return dataTypes[typeId];
}
const rdcspv::DataType &Debugger::GetTypeForId(Id ssaId) const
{
return dataTypes[idTypes[ssaId]];
}
const Decorations &Debugger::GetDecorations(Id typeId) const
{
return decorations[typeId];
}
void Debugger::MakeSignatureNames(const rdcarray<SPIRVInterfaceAccess> &sigList,
rdcarray<rdcstr> &sigNames)
{
for(const SPIRVInterfaceAccess &sig : sigList)
{
rdcstr name = GetRawName(sig.ID);
const DataType *type = &dataTypes[idTypes[sig.ID]];
RDCASSERT(type->type == DataType::PointerType);
type = &dataTypes[type->InnerType()];
for(uint32_t chain : sig.accessChain)
{
if(type->type == DataType::ArrayType)
{
name += StringFormat::Fmt("[%u]", chain);
type = &dataTypes[type->InnerType()];
}
else if(type->type == DataType::StructType)
{
if(!type->children[chain].name.empty())
name += "." + type->children[chain].name;
else
name += StringFormat::Fmt("._child%u", chain);
type = &dataTypes[type->children[chain].type];
}
else if(type->type == DataType::MatrixType)
{
name += StringFormat::Fmt(".col%u", chain);
type = &dataTypes[type->InnerType()];
}
else
{
RDCERR("Got access chain with non-aggregate type in interface.");
break;
}
}
sigNames.push_back(name);
}
}
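// as a sketch of the names generated above: a hypothetical input id 12
// pointing to struct { vec4 pos; float fade[2]; } with access chain {1, 0}
// becomes "_12.fade[0]" (or "_12._child1[0]" if the member is unnamed)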
// this function is implemented here to keep it next to the code we might need to update, even
// though it's checked at reflection time.
void Reflector::CheckDebuggable(bool &debuggable, rdcstr &debugStatus) const
{
debuggable = true;
debugStatus.clear();
if(m_MajorVersion > 1 || m_MinorVersion > 6)
{
debugStatus +=
StringFormat::Fmt("Unsupported SPIR-V version %u.%u\n", m_MajorVersion, m_MinorVersion);
debuggable = false;
}
for(const Variable &g : globals)
{
if(g.storage == StorageClass::TaskPayloadWorkgroupEXT)
{
debugStatus += "Unsupported Task payload\n";
debuggable = false;
}
}
const rdcstr whitelist[] = {
// KHR extensions
"SPV_KHR_16bit_storage",
"SPV_KHR_8bit_storage",
"SPV_KHR_bit_instructions",
"SPV_KHR_device_group",
"SPV_KHR_expect_assume",
"SPV_KHR_float_controls",
"SPV_KHR_maximal_reconvergence",
"SPV_KHR_multiview",
"SPV_KHR_no_integer_wrap_decoration",
"SPV_KHR_non_semantic_info",
"SPV_KHR_physical_storage_buffer",
"SPV_KHR_post_depth_coverage",
"SPV_KHR_quad_control",
"SPV_KHR_relaxed_extended_instruction",
"SPV_KHR_shader_atomic_counter_ops",
"SPV_KHR_shader_ballot",
"SPV_KHR_shader_clock",
"SPV_KHR_shader_draw_parameters",
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_subgroup_rotate",
"SPV_KHR_subgroup_uniform_control_flow",
"SPV_KHR_subgroup_vote",
"SPV_KHR_terminate_invocation",
"SPV_KHR_vulkan_memory_model",
"SPV_KHR_compute_shader_derivatives",
// EXT extensions
"SPV_EXT_demote_to_helper_invocation",
"SPV_EXT_descriptor_indexing",
"SPV_EXT_fragment_fully_covered",
"SPV_EXT_fragment_invocation_density",
"SPV_EXT_mesh_shader",
"SPV_EXT_physical_storage_buffer",
"SPV_EXT_shader_atomic_float_add",
"SPV_EXT_shader_atomic_float_min_max",
"SPV_EXT_shader_atomic_float16_add",
"SPV_EXT_shader_image_int64",
"SPV_EXT_shader_stencil_export",
"SPV_EXT_shader_viewport_index_layer",
// vendor extensions
"SPV_GOOGLE_decorate_string",
"SPV_GOOGLE_hlsl_functionality1",
"SPV_GOOGLE_user_type",
"SPV_NV_compute_shader_derivatives",
};
// whitelist supported extensions
for(const rdcstr &ext : extensions)
{
bool supported = false;
for(const rdcstr &check : whitelist)
{
if(ext == check)
{
supported = true;
break;
}
}
if(supported)
continue;
debuggable = false;
debugStatus += StringFormat::Fmt("Unsupported SPIR-V extension %s\n", ext.c_str());
}
for(Capability c : capabilities)
{
bool supported = false;
switch(c)
{
case Capability::Matrix:
case Capability::Shader:
// we "support" geometry/tessellation in case the module contains other entry points, but
// these can't be debugged right now.
case Capability::Geometry:
case Capability::Tessellation:
case Capability::AtomicStorage:
case Capability::TessellationPointSize:
case Capability::GeometryPointSize:
case Capability::ImageGatherExtended:
case Capability::StorageImageMultisample:
case Capability::UniformBufferArrayDynamicIndexing:
case Capability::SampledImageArrayDynamicIndexing:
case Capability::StorageBufferArrayDynamicIndexing:
case Capability::StorageImageArrayDynamicIndexing:
case Capability::ClipDistance:
case Capability::CullDistance:
case Capability::ImageCubeArray:
case Capability::SampleRateShading:
case Capability::ImageRect:
case Capability::SampledRect:
case Capability::InputAttachment:
case Capability::MinLod:
case Capability::Sampled1D:
case Capability::Image1D:
case Capability::SampledCubeArray:
case Capability::SampledBuffer:
case Capability::ImageBuffer:
case Capability::ImageMSArray:
case Capability::StorageImageExtendedFormats:
case Capability::ImageQuery:
case Capability::DerivativeControl:
case Capability::TransformFeedback:
case Capability::GeometryStreams:
case Capability::StorageImageReadWithoutFormat:
case Capability::StorageImageWriteWithoutFormat:
case Capability::MultiViewport:
case Capability::ShaderLayer:
case Capability::ShaderViewportIndex:
case Capability::DrawParameters:
case Capability::DeviceGroup:
case Capability::MultiView:
case Capability::AtomicStorageOps:
case Capability::SampleMaskPostDepthCoverage:
case Capability::StencilExportEXT:
case Capability::ShaderClockKHR:
case Capability::ShaderViewportIndexLayerEXT:
case Capability::FragmentFullyCoveredEXT:
case Capability::FragmentDensityEXT:
case Capability::ShaderNonUniform:
case Capability::RuntimeDescriptorArray:
case Capability::InputAttachmentArrayDynamicIndexing:
case Capability::UniformTexelBufferArrayDynamicIndexing:
case Capability::StorageTexelBufferArrayDynamicIndexing:
case Capability::UniformBufferArrayNonUniformIndexing:
case Capability::SampledImageArrayNonUniformIndexing:
case Capability::StorageBufferArrayNonUniformIndexing:
case Capability::StorageImageArrayNonUniformIndexing:
case Capability::InputAttachmentArrayNonUniformIndexing:
case Capability::UniformTexelBufferArrayNonUniformIndexing:
case Capability::StorageTexelBufferArrayNonUniformIndexing:
case Capability::VulkanMemoryModel:
case Capability::VulkanMemoryModelDeviceScope:
case Capability::DemoteToHelperInvocation:
case Capability::AtomicFloat32AddEXT:
case Capability::AtomicFloat32MinMaxEXT:
case Capability::AtomicFloat16AddEXT:
case Capability::AtomicFloat16MinMaxEXT:
case Capability::AtomicFloat64AddEXT:
case Capability::AtomicFloat64MinMaxEXT:
case Capability::Float16Buffer:
case Capability::Float16:
case Capability::Int64:
case Capability::Int16:
case Capability::Int8:
case Capability::StorageBuffer16BitAccess:
case Capability::UniformAndStorageBuffer16BitAccess:
case Capability::StoragePushConstant16:
case Capability::StorageInputOutput16:
case Capability::StorageBuffer8BitAccess:
case Capability::UniformAndStorageBuffer8BitAccess:
case Capability::StoragePushConstant8:
case Capability::Float64:
case Capability::Int64Atomics:
case Capability::Int64ImageEXT:
case Capability::ExpectAssumeKHR:
case Capability::BitInstructions:
case Capability::UniformDecoration:
case Capability::SignedZeroInfNanPreserve:
case Capability::PhysicalStorageBufferAddresses:
case Capability::MeshShadingEXT:
case Capability::QuadControlKHR:
case Capability::GroupNonUniform:
case Capability::GroupNonUniformArithmetic:
case Capability::GroupNonUniformBallot:
case Capability::GroupNonUniformClustered:
case Capability::GroupNonUniformQuad:
case Capability::GroupNonUniformRotateKHR:
case Capability::GroupNonUniformShuffle:
case Capability::GroupNonUniformShuffleRelative:
case Capability::GroupNonUniformVote:
case Capability::SubgroupBallotKHR:
case Capability::SubgroupVoteKHR:
case Capability::ComputeDerivativeGroupQuadsKHR:
case Capability::ComputeDerivativeGroupLinearKHR:
{
supported = true;
break;
}
// we plan to support these but they need additional testing/proving
// SPIR-V 1.0 MSAA custom interpolation
case Capability::InterpolationFunction:
{
supported = false;
break;
}
// SPIR-V 1.0 Sparse Operations
case Capability::SparseResidency:
{
supported = false;
break;
}
// SPIR-V 1.4 / SPV_KHR_float_controls
case Capability::DenormPreserve:
case Capability::DenormFlushToZero:
case Capability::RoundingModeRTE:
case Capability::RoundingModeRTZ:
{
supported = false;
break;
}
// SPIR-V 1.6 / SPV_KHR_integer_dot_product
case Capability::DotProduct:
case Capability::DotProductInput4x8Bit:
case Capability::DotProductInput4x8BitPacked:
case Capability::DotProductInputAll:
{
supported = false;
break;
}
// SPV_KHR_bfloat16
case Capability::BFloat16TypeKHR:
case Capability::BFloat16DotProductKHR:
{
supported = false;
break;
}
// SPV_KHR_float_controls2
case Capability::FloatControls2:
{
supported = false;
break;
}
// SPV_KHR_fma
case Capability::FMAKHR:
{
supported = false;
break;
}
// SPV_KHR_fragment_shader_barycentric
case Capability::FragmentBarycentricKHR:
{
supported = false;
break;
}
// SPV_KHR_fragment_shading_rate
case Capability::FragmentShadingRateKHR:
{
supported = false;
break;
}
// SPV_KHR_untyped_pointers
case Capability::UntypedPointersKHR:
{
supported = false;
break;
}
// SPV_KHR_variable_pointers
case Capability::VariablePointersStorageBuffer:
case Capability::VariablePointers:
{
supported = false;
break;
}
// SPV_KHR_workgroup_memory_explicit_layout
case Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR:
case Capability::WorkgroupMemoryExplicitLayout8BitAccessKHR:
case Capability::WorkgroupMemoryExplicitLayoutKHR:
{
supported = false;
break;
}
// Ray tracing
case Capability::RayCullMaskKHR:
case Capability::RayQueryKHR:
case Capability::RayQueryPositionFetchKHR:
case Capability::RayTracingKHR:
case Capability::RayTracingPositionFetchKHR:
case Capability::RayTraversalPrimitiveCullingKHR:
case Capability::RayTracingOpacityMicromapEXT:
{
supported = false;
break;
}
// SPV_EXT_float8
case Capability::Float8EXT:
{
supported = false;
break;
}
// SPV_EXT_fragment_shader_interlock
case Capability::FragmentShaderSampleInterlockEXT:
case Capability::FragmentShaderShadingRateInterlockEXT:
case Capability::FragmentShaderPixelInterlockEXT:
{
supported = false;
break;
}
case Capability::ReplicatedCompositesEXT:
{
supported = false;
break;
}
// SPV_EXT_shader_64bit_indexing
case Capability::Shader64BitIndexingEXT:
{
supported = false;
break;
}
// SPV_EXT_shader_tile_image
case Capability::TileImageColorReadAccessEXT:
case Capability::TileImageDepthReadAccessEXT:
case Capability::TileImageStencilReadAccessEXT:
{
supported = false;
break;
}
// no plans to support these - mostly Kernel/OpenCL related or vendor extensions
case Capability::Addresses:
case Capability::Linkage:
case Capability::Kernel:
case Capability::Groups:
case Capability::Vector16:
case Capability::ImageBasic:
case Capability::ImageReadWrite:
case Capability::ImageMipmap:
case Capability::Pipes:
case Capability::DeviceEnqueue:
case Capability::LiteralSampler:
case Capability::GenericPointer:
case Capability::SubgroupDispatch:
case Capability::NamedBarrier:
case Capability::PipeStorage:
case Capability::Float16ImageAMD:
case Capability::ImageGatherBiasLodAMD:
case Capability::FragmentMaskAMD:
case Capability::ImageReadWriteLodAMD:
case Capability::SampleMaskOverrideCoverageNV:
case Capability::GeometryShaderPassthroughNV:
case Capability::ShaderViewportMaskNV:
case Capability::ShaderStereoViewNV:
case Capability::PerViewAttributesNV:
case Capability::MeshShadingNV:
case Capability::ImageFootprintNV:
case Capability::GroupNonUniformPartitionedNV:
case Capability::CooperativeMatrixNV:
case Capability::ShaderSMBuiltinsNV:
case Capability::SubgroupShuffleINTEL:
case Capability::SubgroupBufferBlockIOINTEL:
case Capability::SubgroupImageBlockIOINTEL:
case Capability::SubgroupImageMediaBlockIOINTEL:
case Capability::IntegerFunctions2INTEL:
case Capability::SubgroupAvcMotionEstimationINTEL:
case Capability::SubgroupAvcMotionEstimationIntraINTEL:
case Capability::SubgroupAvcMotionEstimationChromaINTEL:
case Capability::FunctionPointersINTEL:
case Capability::IndirectReferencesINTEL:
case Capability::FPGAKernelAttributesINTEL:
case Capability::FPGALoopControlsALTERA:
case Capability::FPGAMemoryAttributesALTERA:
case Capability::FPGARegALTERA:
case Capability::UnstructuredLoopControlsINTEL:
case Capability::KernelAttributesINTEL:
case Capability::BlockingPipesALTERA:
case Capability::RayTracingMotionBlurNV:
case Capability::RoundToInfinityINTEL:
case Capability::FloatingPointModeINTEL:
case Capability::AsmINTEL:
case Capability::VectorAnyINTEL:
case Capability::VectorComputeINTEL:
case Capability::VariableLengthArrayINTEL:
case Capability::FunctionFloatControlINTEL:
case Capability::FPFastMathModeINTEL:
case Capability::ArbitraryPrecisionFixedPointALTERA:
case Capability::ArbitraryPrecisionFloatingPointALTERA:
case Capability::ArbitraryPrecisionIntegersALTERA:
case Capability::FPGAMemoryAccessesALTERA:
case Capability::FPGAClusterAttributesALTERA:
case Capability::LoopFuseALTERA:
case Capability::FPGABufferLocationALTERA:
case Capability::USMStorageClassesALTERA:
case Capability::IOPipesALTERA:
case Capability::LongCompositesINTEL:
case Capability::DebugInfoModuleINTEL:
case Capability::BindlessTextureNV:
case Capability::MemoryAccessAliasingINTEL:
case Capability::SplitBarrierINTEL:
case Capability::GroupUniformArithmeticKHR:
case Capability::CoreBuiltinsARM:
case Capability::FPGADSPControlALTERA:
case Capability::FPGAInvocationPipeliningAttributesALTERA:
case Capability::RuntimeAlignedAttributeALTERA:
case Capability::TextureSampleWeightedQCOM:
case Capability::TextureBoxFilterQCOM:
case Capability::TextureBlockMatchQCOM:
case Capability::BFloat16ConversionINTEL:
case Capability::FPGAKernelAttributesv2INTEL:
case Capability::FPGALatencyControlALTERA:
case Capability::FPGAArgumentInterfacesALTERA:
case Capability::TextureBlockMatch2QCOM:
case Capability::ShaderEnqueueAMDX:
case Capability::DisplacementMicromapNV:
case Capability::AtomicFloat16VectorNV:
case Capability::RayTracingDisplacementMicromapNV:
case Capability::CooperativeMatrixKHR:
case Capability::CooperativeVectorNV:
case Capability::CooperativeVectorTrainingNV:
case Capability::CooperativeMatrixReductionsNV:
case Capability::CooperativeMatrixConversionsNV:
case Capability::CooperativeMatrixPerElementOperationsNV:
case Capability::CooperativeMatrixTensorAddressingNV:
case Capability::CooperativeMatrixBlockLoadsNV:
case Capability::FPGAClusterAttributesV2ALTERA:
case Capability::FPMaxErrorINTEL:
case Capability::GlobalVariableFPGADecorationsALTERA:
case Capability::MaskedGatherScatterINTEL:
case Capability::CacheControlsINTEL:
case Capability::RegisterLimitsINTEL:
case Capability::GlobalVariableHostAccessINTEL:
case Capability::SubgroupBufferPrefetchINTEL:
case Capability::Subgroup2DBlockIOINTEL:
case Capability::Subgroup2DBlockTransformINTEL:
case Capability::Subgroup2DBlockTransposeINTEL:
case Capability::SubgroupMatrixMultiplyAccumulateINTEL:
case Capability::CooperativeMatrixLayoutsARM:
case Capability::RawAccessChainsNV:
case Capability::RayTracingSpheresGeometryNV:
case Capability::RayTracingLinearSweptSpheresGeometryNV:
case Capability::RayTracingClusterAccelerationStructureNV:
case Capability::TensorAddressingNV:
case Capability::OptNoneEXT:
case Capability::ArithmeticFenceEXT:
case Capability::TensorsARM:
case Capability::StorageTensorArrayDynamicIndexingARM:
case Capability::StorageTensorArrayNonUniformIndexingARM:
case Capability::TileShadingQCOM:
case Capability::Int4TypeINTEL:
case Capability::Int4CooperativeMatrixINTEL:
case Capability::TaskSequenceALTERA:
case Capability::TernaryBitwiseFunctionINTEL:
case Capability::TensorFloat32RoundingINTEL:
case Capability::GraphARM:
case Capability::BFloat16CooperativeMatrixKHR:
case Capability::Float8CooperativeMatrixEXT:
case Capability::CooperativeMatrixConversionQCOM:
case Capability::UntypedVariableLengthArrayINTEL:
case Capability::SpecConditionalINTEL:
case Capability::FunctionVariantsINTEL:
case Capability::BindlessImagesINTEL:
case Capability::RayTracingNV:
case Capability::ShaderInvocationReorderNV:
case Capability::Max:
case Capability::Invalid:
{
supported = false;
break;
}
// deprecated provisional raytracing
case Capability::RayQueryProvisionalKHR:
case Capability::RayTracingProvisionalKHR:
{
supported = false;
break;
}
}
if(!supported)
{
debuggable = false;
debugStatus += StringFormat::Fmt("Unsupported capability '%s'\n", ToStr(c).c_str());
}
}
for(auto it = extSets.begin(); it != extSets.end(); it++)
{
Id id = it->first;
const rdcstr &setname = it->second;
if(setname == "GLSL.std.450" || setname.beginsWith("NonSemantic."))
continue;
debuggable = false;
debugStatus += StringFormat::Fmt("Unsupported extended instruction set: '%s'\n", setname.c_str());
}
debugStatus.trim();
}
void Debugger::SetStructArrayNames(ShaderVariable &c, const DataType *typeWalk,
const rdcarray<SpecConstant> &specInfo)
{
if(typeWalk->type == DataType::StructType)
{
RDCASSERTEQUAL(c.members.size(), typeWalk->children.size());
for(size_t i = 0; i < c.members.size(); ++i)
{
const DataType::Child &child = typeWalk->children[i];
const DataType *childType = &dataTypes[child.type];
if(!child.name.empty())
c.members[i].name = child.name;
else
c.members[i].name = StringFormat::Fmt("_child%u", (uint32_t)i);
SetStructArrayNames(c.members[i], childType, specInfo);
}
}
else if(typeWalk->type == DataType::ArrayType)
{
uint32_t arraySize = EvaluateConstant(typeWalk->length, specInfo).value.u32v[0];
const DataType *childType = &dataTypes[typeWalk->InnerType()];
for(size_t i = 0; i < arraySize; ++i)
{
c.members[i].name = StringFormat::Fmt("[%u]", (uint32_t)i);
SetStructArrayNames(c.members[i], childType, specInfo);
}
}
}
ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage shaderStage,
const rdcstr &entryPoint,
const rdcarray<SpecConstant> &specInfo,
const std::map<size_t, uint32_t> &instructionLines,
const SPIRVPatchData &patchData, uint32_t activeIndex,
uint32_t threadsInWorkgroup, uint32_t threadsInSubgroup)
{
Id entryId = entryLookup[ShaderEntryPoint(entryPoint, shaderStage)];
if(entryId == Id())
{
RDCERR("Invalid entry point '%s'", entryPoint.c_str());
return new ShaderDebugTrace;
}
rdcarray<Id> entryInterface;
for(const EntryPoint &e : entries)
{
if(e.id == entryId)
{
entryInterface = e.usedIds;
break;
}
}
global.clock = uint64_t(time(NULL)) << 32;
for(auto it = extSets.begin(); it != extSets.end(); it++)
{
Id id = it->first;
const rdcstr &setname = it->second;
if(setname == "GLSL.std.450")
{
ExtInstDispatcher extinst;
extinst.name = setname;
ConfigureGLSLStd450(extinst);
global.extInsts[id] = extinst;
}
else if(setname == "NonSemantic.DebugBreak")
{
ExtInstDispatcher extinst;
extinst.name = setname;
// idx 0 is unused, fill with a dummy function
extinst.names.push_back("__");
extinst.functions.push_back(&ThreadDebugBreak);
extinst.names.push_back("DebugBreak");
extinst.functions.push_back(&ThreadDebugBreak);
global.extInsts[id] = extinst;
RDCLOG("extinst set %u is debug break", id.value());
}
else if(setname.beginsWith("NonSemantic."))
{
ExtInstDispatcher extinst;
extinst.name = setname;
extinst.skippedNonsemantic = true;
global.extInsts[id] = extinst;
}
}
ShaderDebugTrace *ret = new ShaderDebugTrace;
ret->debugger = this;
ret->stage = shaderStage;
activeLaneIndex = activeIndex;
subgroupSize = threadsInSubgroup;
stage = shaderStage;
apiWrapper = api;
ShaderFeatures shaderFeatures = ShaderFeatures::None;
if((stage == ShaderStage::Fragment) ||
((stage == ShaderStage::Compute) && patchData.derivativeMode != ComputeDerivativeMode::None))
shaderFeatures |= ShaderFeatures::Derivatives;
queuedDeviceThreadSteps.resize(threadsInWorkgroup);
queuedGpuMathOps.resize(threadsInWorkgroup);
queuedGpuSampleGatherOps.resize(threadsInWorkgroup);
pendingLanes.resize(threadsInWorkgroup);
queuedJobs.resize(threadsInWorkgroup);
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
workgroup.push_back(ThreadState(*this, global, stage, shaderFeatures));
queuedDeviceThreadSteps[i] = false;
queuedGpuMathOps[i] = false;
queuedGpuSampleGatherOps[i] = false;
pendingLanes[i] = false;
queuedJobs[i] = 0;
}
ThreadState &active = GetActiveLane();
active.nextInstruction = instructionOffsets.indexOf(functions[entryId].begin);
active.ids.resize(idOffsets.size());
// array names and struct member names are not set when constants are created
for(auto it = constants.begin(); it != constants.end(); ++it)
{
Constant &c = it->second;
const DataType *typeWalk = &dataTypes[c.type];
SetStructArrayNames(c.value, typeWalk, specInfo);
}
// evaluate all constants
for(auto it = constants.begin(); it != constants.end(); it++)
{
active.ids[it->first] = EvaluateConstant(it->first, specInfo);
active.ids[it->first].name = GetRawName(it->first);
}
rdcarray<rdcstr> inputSigNames, outputSigNames;
MakeSignatureNames(patchData.inputs, inputSigNames);
MakeSignatureNames(patchData.outputs, outputSigNames);
struct PointerId
{
PointerId(Id i, rdcarray<ShaderVariable> GlobalState::*th, rdcarray<ShaderVariable> &storage)
: id(i), globalStorage(th), globalIndex(storage.size() - 1)
{
}
PointerId(Id i, rdcarray<ShaderVariable> ThreadState::*th, rdcarray<ShaderVariable> &storage)
: id(i), threadStorage(th), threadIndex(storage.size() - 1)
{
}
PointerId(Id i, rdcarray<ShaderVariable> GlobalState::*global,
rdcarray<ShaderVariable> &globalVars, rdcarray<ShaderVariable> ThreadState::*thread,
rdcarray<ShaderVariable> &threadVars)
: id(i),
globalStorage(global),
globalIndex(globalVars.size() - 1),
threadStorage(thread),
threadIndex(threadVars.size() - 1)
{
}
void Set(Debugger &d, const GlobalState &global, ThreadState &lane, bool forceLocalGSM) const
{
const bool isGlobal = (globalIndex != UINT_MAX);
const bool isGSM = isGlobal && (threadIndex != UINT_MAX);
const bool useLocal = (forceLocalGSM && isGSM) || !isGlobal;
if(!useLocal)
lane.ids[id] = d.MakePointerVariable(id, &(global.*globalStorage)[globalIndex]);
else
lane.ids[id] = d.MakePointerVariable(id, &(lane.*threadStorage)[threadIndex]);
}
Id id;
rdcarray<ShaderVariable> GlobalState::*globalStorage = NULL;
rdcarray<ShaderVariable> ThreadState::*threadStorage = NULL;
size_t globalIndex = UINT_MAX;
size_t threadIndex = UINT_MAX;
};
#define GLOBAL_POINTER(id, list) PointerId(id, &GlobalState::list, global.list)
#define THREAD_POINTER(id, list) PointerId(id, &ThreadState::list, active.list)
#define GSM_POINTER(id, globalList, threadList) \
PointerId(id, &GlobalState::globalList, global.globalList, &ThreadState::threadList, \
active.threadList)
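// e.g. GLOBAL_POINTER(v.id, samplers) records that v.id should point at the
// ShaderVariable just pushed onto global.samplers. The pointer itself is only
// created later in PointerId::Set(), once the storage arrays stop moving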
rdcarray<PointerId> pointerIDs;
// tracking for any GL bare uniforms
uint32_t uniformsCBuffer = ~0U;
rdcarray<rdcpair<rdcspv::Id, size_t>> bareUniformPointers;
// allocate storage for globals with opaque storage classes, and prepare to set up pointers to
// them for the global variables themselves
for(const Variable &v : globals)
{
if(v.storage == StorageClass::Input || v.storage == StorageClass::Output)
{
if(!entryInterface.contains(v.id))
continue;
const bool isInput = (v.storage == StorageClass::Input);
ShaderVariable var;
var.name = GetRawName(v.id);
rdcstr rawName = var.name;
rdcstr sourceName = GetHumanName(v.id);
const DataType &type = dataTypes[v.type];
// global variables should all be pointers into opaque storage
RDCASSERT(type.type == DataType::PointerType);
// if we don't have a good human name, generate a better one using the interface information
// we have
if(sourceName == var.name)
{
if(decorations[v.id].flags & Decorations::HasBuiltIn)
{
sourceName = StringFormat::Fmt("_%s", ToStr(decorations[v.id].builtIn).c_str());
}
else if(decorations[v.id].flags & Decorations::HasLocation)
{
sourceName =
StringFormat::Fmt("_%s%u", isInput ? "input" : "output", decorations[v.id].location);
}
else
{
sourceName = StringFormat::Fmt("_sig%u", v.id.value());
// on GL, detect and name gl_PerVertex as the builtin struct
if(api->GetGraphicsAPI() == GraphicsAPI::OpenGL)
{
if(!dataTypes[type.InnerType()].children.empty() &&
dataTypes[type.InnerType()].children[0].decorations.flags & Decorations::HasBuiltIn)
sourceName = "gl_PerVertex";
}
}
for(const DecorationAndParamData &d : decorations[v.id].others)
{
if(d.value == Decoration::Component)
sourceName += StringFormat::Fmt("_%u", d.component);
}
}
const rdcarray<rdcstr> &sigNames = isInput ? inputSigNames : outputSigNames;
bool addSource = m_DebugInfo.valid ? m_DebugInfo.globals.contains(v.id) : true;
// modified in a loop below as needed
uint32_t laneIndex = 0;
// fill the interface variable
auto fillInputCallback = [this, isInput, addSource, ret, &sigNames, &rawName, &sourceName,
&laneIndex](ShaderVariable &var, const Decorations &curDecorations,
const DataType &type, uint64_t location,
const rdcstr &accessSuffix) {
if(!var.members.empty())
return;
if(isInput)
{
uint32_t component = 0;
for(const DecorationAndParamData &dec : curDecorations.others)
{
if(dec.value == Decoration::Component)
{
component = dec.component;
break;
}
}
ShaderBuiltin builtin = ShaderBuiltin::Undefined;
if(curDecorations.flags & Decorations::HasBuiltIn)
builtin = MakeShaderBuiltin(stage, curDecorations.builtIn);
this->apiWrapper->FillInputValue(var, builtin, laneIndex, (uint32_t)location, component);
}
else
{
// make it obvious when uninitialised outputs are written
memset(&var.value, 0xcc, sizeof(var.value));
}
bool addSourceVar = false;
if(!isInput && addSource)
addSourceVar = true;
if(isInput && laneIndex == activeLaneIndex)
addSourceVar = true;
if(sourceName != rawName && addSourceVar)
{
rdcstr debugVarName = rawName + accessSuffix;
SourceVariableMapping sourceVar;
sourceVar.name = sourceName + accessSuffix;
sourceVar.offset = (uint32_t)location;
sourceVar.type = var.type;
sourceVar.rows = var.rows;
sourceVar.columns = var.columns;
sourceVar.signatureIndex = sigNames.indexOf(debugVarName);
StripCommonGLPrefixes(sourceVar.name);
for(uint32_t x = 0; x < uint32_t(var.rows) * var.columns; x++)
sourceVar.variables.push_back(DebugVariableReference(
isInput ? DebugVariableType::Input : DebugVariableType::Variable, debugVarName, x));
if(addSourceVar)
ret->sourceVars.push_back(sourceVar);
}
};
if(isInput)
{
for(laneIndex = 0; laneIndex < threadsInWorkgroup; laneIndex++)
{
// create the opaque storage
workgroup[laneIndex].inputs.push_back(var);
WalkVariable<ShaderVariable, true>(decorations[v.id], dataTypes[type.InnerType()], ~0U,
false, workgroup[laneIndex].inputs.back(), rdcstr(),
fillInputCallback);
}
// then make sure we know which ID to set up for the pointer
pointerIDs.push_back(THREAD_POINTER(v.id, inputs));
}
else
{
WalkVariable<ShaderVariable, true>(decorations[v.id], dataTypes[type.InnerType()], ~0U,
false, var, rdcstr(), fillInputCallback);
active.outputs.push_back(var);
liveGlobals.push_back(v.id);
pointerIDs.push_back(THREAD_POINTER(v.id, outputs));
}
}
// pick up uniform globals, which could be cbuffers, and push constants
else if(v.storage == StorageClass::Uniform || v.storage == StorageClass::StorageBuffer ||
v.storage == StorageClass::PushConstant)
{
if(!patchData.usedIds.contains(v.id))
continue;
ShaderVariable var;
var.name = GetRawName(v.id);
rdcstr sourceName = GetHumanName(v.id);
const DataType &type = dataTypes[v.type];
// global variables should all be pointers into opaque storage
RDCASSERT(type.type == DataType::PointerType);
const DataType *innertype = &dataTypes[type.InnerType()];
if(sourceName == var.name)
sourceName = GetHumanName(innertype->id);
bool isArray = false;
uint32_t arraySize = 1;
if(innertype->type == DataType::ArrayType)
{
isArray = true;
if(innertype->length == Id())
arraySize = ~0U;
else
arraySize = EvaluateConstant(innertype->length, specInfo).value.u32v[0];
innertype = &dataTypes[innertype->InnerType()];
}
const bool ssbo = (v.storage == StorageClass::StorageBuffer) ||
(decorations[innertype->id].flags & Decorations::BufferBlock);
if(innertype->type == DataType::StructType)
{
// if we don't have a good human name, generate a better one using the interface information
// we have
if(sourceName == var.name)
{
if(v.storage == StorageClass::PushConstant)
sourceName = "_pushconsts";
else if(ssbo)
sourceName = StringFormat::Fmt("_buffer_set%u_bind%u", decorations[v.id].set,
decorations[v.id].binding);
else
sourceName = StringFormat::Fmt("_cbuffer_set%u_bind%u", decorations[v.id].set,
decorations[v.id].binding);
}
SourceVariableMapping sourceVar;
sourceVar.name = sourceName;
sourceVar.offset = 0;
if(ssbo)
{
var.rows = 1;
var.columns = 1;
var.type = VarType::ReadWriteResource;
int32_t idx = patchData.rwInterface.indexOf(v.id);
// on GL we may have buffers which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present
if(idx >= 0)
var.SetBindIndex(ShaderBindIndex(DescriptorCategory::ReadWriteResource, idx, 0U));
else
var.SetBindIndex(ShaderBindIndex());
if(api->GetGraphicsAPI() == GraphicsAPI::Vulkan)
RDCASSERT(idx >= 0);
enablePointerFlags(var, PointerFlags::SSBO);
if(isArray)
enablePointerFlags(var, PointerFlags::GlobalArrayBinding);
sourceVar.type = VarType::ReadWriteResource;
sourceVar.rows = 1;
sourceVar.columns = 1;
sourceVar.variables.push_back(
DebugVariableReference(DebugVariableType::ReadWriteResource, var.name));
global.readWriteResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readWriteResources));
}
else
{
ShaderBindIndex binding;
binding.category = DescriptorCategory::ConstantBlock;
binding.index = patchData.cblockInterface.indexOf(v.id);
// on GL we may have buffers which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present
if(binding.index == ~0U)
binding = ShaderBindIndex();
if(api->GetGraphicsAPI() == GraphicsAPI::Vulkan)
RDCASSERT(binding.index != ~0U);
auto cbufferCallback = [this, &binding](
ShaderVariable &var, const Decorations &curDecorations,
const DataType &type, uint64_t offset, const rdcstr &) {
if(!var.members.empty())
return;
// non-matrix case is simple, just read the size of the variable
if(var.rows == 1)
{
this->apiWrapper->ReadBufferValue(binding, offset, VarByteSize(var),
var.value.u8v.data());
if(type.type == DataType::PointerType)
{
var.SetTypedPointer(var.value.u64v[0], this->apiWrapper->GetShaderID(),
idToPointerType[type.InnerType()]);
const Decorations &dec = decorations[type.id];
if(dec.flags & Decorations::HasArrayStride)
setArrayStride(var, dec.arrayStride);
}
}
else
{
// matrix case is more complicated. Either read column by column or row by row
// depending on majorness
uint32_t matrixStride = curDecorations.matrixStride;
if(!(curDecorations.flags & Decorations::HasMatrixStride))
{
RDCWARN("Matrix without matrix stride - assuming legacy vec4 packed");
matrixStride = 16;
}
if(curDecorations.flags & Decorations::ColMajor)
{
ShaderVariable tmp;
tmp.type = var.type;
uint32_t colSize = VarTypeByteSize(var.type) * var.rows;
for(uint32_t c = 0; c < var.columns; c++)
{
// read the column
this->apiWrapper->ReadBufferValue(binding, offset + c * matrixStride, colSize,
VarElemPointer(tmp, 0));
// now write it into the appropriate elements in the destination ShaderValue
for(uint32_t r = 0; r < var.rows; r++)
copyComp(var, r * var.columns + c, tmp, r);
}
}
else
{
// row major is easier, read row-by-row directly into the output variable
uint32_t rowSize = VarTypeByteSize(var.type) * var.columns;
for(uint32_t r = 0; r < var.rows; r++)
{
// read the row into the destination ShaderValue, which is tightly packed
// with rows
this->apiWrapper->ReadBufferValue(binding, offset + r * matrixStride, rowSize,
VarElemPointer(var, r * var.columns));
}
}
}
};
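// e.g. a column-major float3x3 with a 16-byte matrix stride reads three
// 12-byte columns at offset+0, offset+16 and offset+32, then scatters each
// column's values into the tightly packed row-major destination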
if(isArray)
{
if(arraySize == ~0U)
{
RDCERR("Unsupported runtime array of UBOs");
arraySize = 1;
}
var.members.reserve(arraySize);
for(uint32_t a = 0; a < arraySize; a++)
{
binding.arrayElement = a;
var.members.push_back(ShaderVariable());
var.members.back().name = StringFormat::Fmt("[%u]", a);
WalkVariable<ShaderVariable, true>(decorations[v.id], *innertype, 0U, false,
var.members.back(), rdcstr(), cbufferCallback);
}
}
else
{
WalkVariable<ShaderVariable, true>(decorations[v.id], *innertype, 0U, false, var,
rdcstr(), cbufferCallback);
}
sourceVar.type = VarType::ConstantBlock;
sourceVar.rows = 1;
sourceVar.columns = 1;
sourceVar.variables.push_back(DebugVariableReference(DebugVariableType::Constant, var.name));
global.constantBlocks.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, constantBlocks));
}
ret->sourceVars.push_back(sourceVar);
}
else
{
RDCERR("Unhandled type of uniform: %u", innertype->type);
}
}
else if(v.storage == StorageClass::UniformConstant)
{
if(!patchData.usedIds.contains(v.id))
continue;
// only images/samplers are allowed to be in UniformConstant in Vulkan SPIR-V. In GL SPIR-V
// these can also be values, but we default to this and override below as needed
ShaderVariable var;
var.rows = 1;
var.columns = 1;
var.name = GetRawName(v.id);
rdcstr sourceName = GetHumanName(v.id);
const DataType &type = dataTypes[v.type];
// global variables should all be pointers into opaque storage
RDCASSERT(type.type == DataType::PointerType);
const DataType *innertype = &dataTypes[type.InnerType()];
// if we don't have a good human name, generate a better one using the interface information
// we have
if(sourceName == var.name)
{
rdcstr innerName;
if(innertype->type == DataType::SamplerType)
innerName = "sampler";
else if(innertype->type == DataType::SampledImageType)
innerName = "sampledImage";
else if(innertype->type == DataType::ImageType)
innerName = "image";
else if(innertype->type == DataType::AccelerationStructureType)
innerName = "accelerationStructure";
sourceName = StringFormat::Fmt("_%s_set%u_bind%u", innerName.c_str(), decorations[v.id].set,
decorations[v.id].binding);
}
DebugVariableType debugType = DebugVariableType::ReadOnlyResource;
uint32_t set = 0, bind = 0, location = ~0U;
if(decorations[v.id].flags & Decorations::HasDescriptorSet)
set = decorations[v.id].set;
if(decorations[v.id].flags & Decorations::HasBinding)
bind = decorations[v.id].binding;
if(decorations[v.id].flags & Decorations::HasLocation)
location = decorations[v.id].location;
// don't step into arrays when they're bare uniforms with locations
if(innertype->type == DataType::ArrayType && location == ~0U)
{
enablePointerFlags(var, PointerFlags::GlobalArrayBinding);
innertype = &dataTypes[innertype->InnerType()];
}
bool bareUniform = false;
// GL Resource Arrays
if((api->GetGraphicsAPI() == GraphicsAPI::OpenGL) &&
(innertype->type == DataType::ArrayType && location != ~0U))
{
DataType *elementtype = &dataTypes[innertype->InnerType()];
DataType::Type baseType = elementtype->type;
if((baseType == DataType::SampledImageType) || (baseType == DataType::ImageType) ||
(baseType == DataType::SamplerType))
{
var.type = VarType::Struct;
debugType = DebugVariableType::ReadOnlyResource;
VarType memberType = VarType::ReadOnlyResource;
DescriptorCategory descCat = DescriptorCategory::ReadOnlyResource;
uint32_t texType = DebugAPIWrapper::Float_Texture;
if(baseType == DataType::SamplerType)
{
debugType = DebugVariableType::Sampler;
memberType = VarType::Sampler;
descCat = DescriptorCategory::Sampler;
}
else if(baseType == DataType::SampledImageType || baseType == DataType::ImageType)
{
// store the texture type here, since the image may be copied around and combined with
// a sampler, so accessing the original type might be non-trivial at point of access
Id imgid = elementtype->id;
if(baseType == DataType::SampledImageType)
imgid = sampledImageTypes[imgid].baseId;
RDCASSERT(imageTypes[imgid].dim != Dim::Max);
if(imageTypes[imgid].dim == Dim::Buffer)
texType |= DebugAPIWrapper::Buffer_Texture;
if(imageTypes[imgid].dim == Dim::SubpassData)
texType |= DebugAPIWrapper::Subpass_Texture;
if(imageTypes[imgid].retType.type == Op::TypeInt)
{
if(imageTypes[imgid].retType.signedness)
texType |= DebugAPIWrapper::SInt_Texture;
else
texType |= DebugAPIWrapper::UInt_Texture;
}
if(imageTypes[imgid].sampled == 2 && imageTypes[imgid].dim != Dim::SubpassData)
{
debugType = DebugVariableType::ReadWriteResource;
memberType = VarType::ReadWriteResource;
descCat = DescriptorCategory::ReadWriteResource;
}
}
int32_t idx = -1;
if(memberType == VarType::ReadOnlyResource)
idx = patchData.roInterface.indexOf(v.id);
else if(memberType == VarType::Sampler)
idx = patchData.samplerInterface.indexOf(v.id);
else if(memberType == VarType::ReadWriteResource)
idx = patchData.rwInterface.indexOf(v.id);
uint32_t len = uintComp(GetActiveLane().ids[innertype->length], 0);
for(uint32_t i = 0; i < len; ++i)
{
ShaderVariable member;
member.rows = 1;
member.columns = 1;
member.name = StringFormat::Fmt("[%u]", i);
member.type = memberType;
if((memberType == VarType::ReadOnlyResource) ||
(memberType == VarType::ReadWriteResource))
setTextureType(member, (DebugAPIWrapper::TextureType)texType);
// on GL we may have textures which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present.
// Bind to index "idx+i" because GL resource arrays are expanded in element order
if(idx >= 0)
member.SetBindIndex(ShaderBindIndex(descCat, idx + i, 0));
else
member.SetBindIndex(ShaderBindIndex());
var.members.push_back(member);
// Source mapping per array element because GL resource arrays are expanded
SourceVariableMapping sourceVar;
sourceVar.name = StringFormat::Fmt("%s[%u]", sourceName.c_str(), i);
sourceVar.type = var.members[i].type;
sourceVar.rows = 1;
sourceVar.columns = 1;
sourceVar.offset = 0;
sourceVar.variables.push_back(DebugVariableReference(
debugType, StringFormat::Fmt("%s[%u]", var.name.c_str(), i)));
ret->sourceVars.push_back(sourceVar);
}
if(debugType == DebugVariableType::ReadOnlyResource)
{
global.readOnlyResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readOnlyResources));
}
else if(debugType == DebugVariableType::Sampler)
{
global.samplers.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, samplers));
}
else if(debugType == DebugVariableType::ReadWriteResource)
{
global.readWriteResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readWriteResources));
}
continue;
}
}
if(innertype->type == DataType::SamplerType)
{
var.type = VarType::Sampler;
debugType = DebugVariableType::Sampler;
int32_t idx = patchData.samplerInterface.indexOf(v.id);
// on GL we may have samplers which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present
if(idx >= 0)
var.SetBindIndex(ShaderBindIndex(DescriptorCategory::Sampler, idx, 0U));
else
var.SetBindIndex(ShaderBindIndex());
if(api->GetGraphicsAPI() == GraphicsAPI::Vulkan)
RDCASSERT(idx >= 0);
global.samplers.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, samplers));
}
else if(innertype->type == DataType::SampledImageType || innertype->type == DataType::ImageType)
{
var.type = VarType::ReadOnlyResource;
debugType = DebugVariableType::ReadOnlyResource;
// store the texture type here, since the image may be copied around and combined with a
// sampler, so accessing the original type might be non-trivial at point of access
uint32_t texType = DebugAPIWrapper::Float_Texture;
Id imgid = innertype->id;
if(innertype->type == DataType::SampledImageType)
imgid = sampledImageTypes[imgid].baseId;
RDCASSERT(imageTypes[imgid].dim != Dim::Max);
if(imageTypes[imgid].dim == Dim::Buffer)
texType |= DebugAPIWrapper::Buffer_Texture;
if(imageTypes[imgid].dim == Dim::SubpassData)
texType |= DebugAPIWrapper::Subpass_Texture;
if(imageTypes[imgid].retType.type == Op::TypeInt)
{
if(imageTypes[imgid].retType.signedness)
texType |= DebugAPIWrapper::SInt_Texture;
else
texType |= DebugAPIWrapper::UInt_Texture;
}
setTextureType(var, (DebugAPIWrapper::TextureType)texType);
if(imageTypes[imgid].sampled == 2 && imageTypes[imgid].dim != Dim::SubpassData)
{
var.type = VarType::ReadWriteResource;
debugType = DebugVariableType::ReadWriteResource;
int32_t idx = patchData.rwInterface.indexOf(v.id);
// on GL we may have textures which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present
if(idx >= 0)
var.SetBindIndex(ShaderBindIndex(DescriptorCategory::ReadWriteResource, idx, 0U));
else
var.SetBindIndex(ShaderBindIndex());
if(api->GetGraphicsAPI() == GraphicsAPI::Vulkan)
RDCASSERT(idx >= 0);
global.readWriteResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readWriteResources));
}
else
{
int32_t idx = patchData.roInterface.indexOf(v.id);
// on GL we may have textures which are dead-code eliminated but remain part of the simulated
// code. Because we base our interfaces off the GLSL reflected data it may not be present
if(idx >= 0)
var.SetBindIndex(ShaderBindIndex(DescriptorCategory::ReadOnlyResource, idx, 0U));
else
var.SetBindIndex(ShaderBindIndex());
if(api->GetGraphicsAPI() == GraphicsAPI::Vulkan)
RDCASSERT(idx >= 0);
global.readOnlyResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readOnlyResources));
}
}
else if(innertype->type == DataType::AccelerationStructureType)
{
var.type = VarType::ReadOnlyResource;
debugType = DebugVariableType::ReadOnlyResource;
global.readOnlyResources.push_back(var);
pointerIDs.push_back(GLOBAL_POINTER(v.id, readOnlyResources));
}
else if(innertype->type == DataType::StructType || innertype->type == DataType::ArrayType ||
innertype->type == DataType::MatrixType || innertype->type == DataType::VectorType ||
innertype->type == DataType::ScalarType)
{
// plain variable
bareUniform = true;
// if we haven't already added a virtual uniforms cbuffer, do so now
if(uniformsCBuffer == ~0U)
{
ShaderVariable uniformsVar;
uniformsVar.rows = 1;
uniformsVar.columns = 1;
uniformsVar.type = VarType::ConstantBlock;
SourceVariableMapping sourceVar;
sourceVar.name = uniformsVar.name = "uniforms";
sourceVar.type = VarType::ConstantBlock;
sourceVar.rows = 1;
sourceVar.columns = 1;
sourceVar.offset = 0;
sourceVar.variables.push_back(
DebugVariableReference(DebugVariableType::Constant, uniformsVar.name));
uniformsCBuffer = global.constantBlocks.size();
global.constantBlocks.push_back(uniformsVar);
pointerIDs.push_back(GLOBAL_POINTER(v.id, constantBlocks));
ret->sourceVars.push_back(sourceVar);
}
rdcarray<ShaderVariable> &uniforms = global.constantBlocks[uniformsCBuffer].members;
// record that this variable id needs to be pointed to the n'th member of the virtual
// cbuffer, which we're about to add
bareUniformPointers.push_back({v.id, uniforms.size()});
var = ShaderVariable();
var.name = GetHumanName(v.id);
auto uniformCallback = [this](ShaderVariable &var, const Decorations &curDecorations,
const DataType &type, uint64_t location, const rdcstr &) {
if(var.members.empty())
this->apiWrapper->ReadLocationValue((uint32_t)location, var);
};
WalkVariable<ShaderVariable, true>(decorations[v.id], *innertype, ~0U, false, var, rdcstr(),
uniformCallback);
uniforms.push_back(var);
}
else
{
RDCERR("Unhandled type of uniform: %u", innertype->type);
}
if(!bareUniform)
{
SourceVariableMapping sourceVar;
sourceVar.name = sourceName;
sourceVar.type = var.type;
sourceVar.rows = 1;
sourceVar.columns = 1;
sourceVar.offset = 0;
sourceVar.variables.push_back(DebugVariableReference(debugType, var.name));
ret->sourceVars.push_back(sourceVar);
}
}
else if(v.storage == StorageClass::Private || v.storage == StorageClass::Workgroup)
{
// private variables are allocated as globals. Similar to outputs
ShaderVariable var;
var.name = GetRawName(v.id);
rdcstr sourceName = GetHumanName(v.id);
const DataType &type = dataTypes[v.type];
// global variables should all be pointers into opaque storage
RDCASSERT(type.type == DataType::PointerType);
auto uninitialisedCallback = [](ShaderVariable &var, const Decorations &, const DataType &,
uint64_t, const rdcstr &) {
if(!var.members.empty())
return;
memset(&var.value, 0xcc, sizeof(var.value));
};
WalkVariable<ShaderVariable, true>(decorations[v.id], dataTypes[type.InnerType()], ~0U, false,
var, rdcstr(), uninitialisedCallback);
if(v.initializer != Id())
AssignValue(var, active.ids[v.initializer]);
if(v.storage == StorageClass::Private)
{
active.privates.push_back(var);
pointerIDs.push_back(THREAD_POINTER(v.id, privates));
}
else if(v.storage == StorageClass::Workgroup)
{
active.gsmIndexes.push_back({global.workgroups.count(), active.privates.count()});
active.privates.push_back(var);
global.workgroups.push_back(var);
pointerIDs.push_back(GSM_POINTER(v.id, workgroups, privates));
}
liveGlobals.push_back(v.id);
if(sourceName != var.name && (!m_DebugInfo.valid || m_DebugInfo.globals.contains(v.id)))
{
SourceVariableMapping sourceVar;
sourceVar.name = sourceName;
sourceVar.type = var.type;
sourceVar.rows = RDCMAX(1U, (uint32_t)var.rows);
sourceVar.columns = RDCMAX(1U, (uint32_t)var.columns);
sourceVar.offset = 0;
for(uint32_t x = 0; x < sourceVar.rows * sourceVar.columns; x++)
sourceVar.variables.push_back(
DebugVariableReference(DebugVariableType::Variable, var.name, x));
ret->sourceVars.push_back(sourceVar);
}
}
else
{
RDCERR("Unhandled type of global variable: %s", ToStr(v.storage).c_str());
}
}
std::sort(liveGlobals.begin(), liveGlobals.end());
rdcarray<ThreadIndex> threadIds;
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
bool isActiveLane = (i == activeLaneIndex);
ThreadState &lane = workgroup[i];
lane.workgroupIndex = i;
lane.activeMask.resize(threadsInWorkgroup);
if(!isActiveLane)
{
lane.nextInstruction = active.nextInstruction;
lane.outputs = active.outputs;
lane.privates = active.privates;
lane.ids = active.ids;
}
if(stage == ShaderStage::Pixel)
{
lane.helperInvocation = apiWrapper->GetThreadProperty(i, ThreadProperty::Helper) != 0;
lane.quadLaneIndex = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadLane);
lane.quadId = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadId);
}
if(stage == ShaderStage::Compute)
{
lane.quadLaneIndex = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadLane);
lane.quadId = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadId);
}
lane.subgroupId = apiWrapper->GetThreadProperty(i, ThreadProperty::SubgroupId);
lane.dead = apiWrapper->GetThreadProperty(i, ThreadProperty::Active) == 0;
if(patchData.threadScope & ThreadScope::Subgroup)
lane.elected = apiWrapper->GetThreadProperty(i, ThreadProperty::Elected) != 0;
// now that the globals are allocated and their storage won't move, we can take pointers to them
for(const PointerId &p : pointerIDs)
p.Set(*this, global, lane, isActiveLane);
for(const rdcpair<rdcspv::Id, size_t> &u : bareUniformPointers)
{
lane.ids[u.first] =
MakePointerVariable(u.first, &global.constantBlocks[uniformsCBuffer].members[u.second]);
}
if(isActiveLane)
{
for(const PointerId &p : pointerIDs)
{
// GSM pointers have a global and local index
// Create a GSM global pointer, used for writing back
if((p.globalIndex != UINT_MAX) && (p.threadIndex != UINT_MAX))
{
RDCASSERTEQUAL(lane.gsmPointers.count(p.id), 0);
lane.gsmPointers[p.id] = MakePointerVariable(p.id, &global.workgroups[p.globalIndex]);
}
}
}
// Only add active lanes to control flow
if(!lane.dead)
threadIds.push_back(i);
}
controlFlow.Construct(threadIds);
// find quad neighbours
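// illustrative example: threads {5,6,9,10} sharing quadId 3 with quad lanes {0,1,2,3}
// each end up with quadNeighbours[] = {5,6,9,10}, so any thread in the quad can map a
// quad lane back to the workgroup index of the thread occupying it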
{
rdcarray<uint32_t> processedQuads;
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
uint32_t desiredQuad = workgroup[i].quadId;
// ignore threads not in any quad
if(desiredQuad == 0)
continue;
// quads are almost certainly sorted together, so shortcut by checking the last one
if((!processedQuads.empty() && processedQuads.back() == desiredQuad) ||
processedQuads.contains(desiredQuad))
continue;
processedQuads.push_back(desiredQuad);
// find the threads
uint32_t threads[4] = {
i,
~0U,
~0U,
~0U,
};
for(uint32_t j = i + 1, t = 1; j < threadsInWorkgroup && t < 4; j++)
{
if(workgroup[j].quadId == desiredQuad)
threads[t++] = j;
}
// now swizzle the threads to know each other
for(uint32_t src = 0; src < 4; src++)
{
const uint32_t thread = threads[src];
if(thread >= workgroup.size())
{
RDCERR("Unexpected incomplete quad missing a thread");
continue;
}
const uint32_t lane = workgroup[thread].quadLaneIndex;
if(lane >= 4)
continue;
for(uint32_t dst = 0; dst < 4; dst++)
{
if(threads[dst] == ~0U)
continue;
workgroup[threads[dst]].quadNeighbours[lane] = threads[src];
}
}
}
}
// this contains all the accumulated line number information. Add in our disassembly mapping
ret->instInfo = m_InstInfo;
for(size_t i = 0; i < m_InstInfo.size(); i++)
{
auto it = instructionLines.find(instructionOffsets[m_InstInfo[i].instruction]);
if(it != instructionLines.end())
ret->instInfo[i].lineInfo.disassemblyLine = it->second;
else
ret->instInfo[i].lineInfo.disassemblyLine = 0;
}
if(m_DebugInfo.valid)
FillDebugSourceVars(ret->instInfo);
else
FillDefaultSourceVars(ret->instInfo);
ret->constantBlocks = global.constantBlocks;
ret->readOnlyResources = global.readOnlyResources;
ret->readWriteResources = global.readWriteResources;
ret->samplers = global.samplers;
ret->inputs = active.inputs;
mtSimulation = apiWrapper->SimulateThreaded();
if(threadsInWorkgroup < 4)
mtSimulation = false;
AtomicStore(&atomic_simulationFinished, 0);
if(mtSimulation)
{
if(!Shader_Debug_UseJobSystemJobs())
{
uint32_t countJobs = RDCMIN(threadsInWorkgroup, Threading::JobSystem::GetCountWorkers() / 2U);
for(uint32_t i = 0; i < countJobs; ++i)
Threading::JobSystem::AddJob([this]() { SimulationJobHelper(); });
}
}
return ret;
}
void Debugger::FillCallstack(ThreadState &thread, ShaderDebugState &state) const
{
rdcarray<Id> funcs;
thread.FillCallstack(funcs);
for(Id f : funcs)
{
if(m_DebugInfo.valid)
{
auto it = m_DebugInfo.funcToDebugFunc.find(f);
if(it != m_DebugInfo.funcToDebugFunc.end())
{
state.callstack.push_back(m_DebugInfo.scopes[it->second].name);
continue;
}
}
state.callstack.push_back(GetHumanName(f));
}
}
void Debugger::FillDebugSourceVars(rdcarray<InstructionSourceInfo> &instInfo) const
{
for(InstructionSourceInfo &i : instInfo)
{
size_t offs = instructionOffsets[i.instruction];
const ScopeData *scope = GetScope(offs);
if(!scope)
continue;
// track which mappings we've processed, so if the same variable has mappings in multiple scopes
// we only pick the innermost.
rdcarray<LocalMapping> processed;
rdcarray<Id> sourceVars;
// capture the scopes upwards (from child to parent)
rdcarray<const ScopeData *> scopes;
while(scope)
{
scopes.push_back(scope);
// if we reach a function scope, don't go up any further.
if(scope->type == DebugScope::Function)
break;
scope = scope->parent;
}
// Iterate over the scopes downwards (parent->child)
for(size_t s = 0; s < scopes.size(); ++s)
{
scope = scopes[scopes.size() - 1 - s];
for(size_t m = 0; m < scope->localMappings.size(); m++)
{
const LocalMapping &mapping = scope->localMappings[m];
// if this mapping is past the current instruction, stop here.
if(mapping.instIndex > i.instruction)
break;
// see if this mapping is superseded by a later mapping in this scope for this instruction.
// This is a bit inefficient but simple. The alternative would be to record start and end
// points for each mapping and update the end points, but this way is simple and the cost
// should be limited since it's only per-scope
bool supercede = false;
for(size_t n = m + 1; n < scope->localMappings.size(); n++)
{
const LocalMapping &laterMapping = scope->localMappings[n];
// if this mapping is past the current instruction, stop here.
if(laterMapping.instIndex > i.instruction)
break;
// if this later mapping supersedes the current one and starts later
if(laterMapping.isSourceSupersetOf(mapping) && laterMapping.instIndex > mapping.instIndex)
{
supercede = true;
break;
}
}
// don't add the current mapping if it's going to be superseded by something later
if(supercede)
continue;
processed.push_back(mapping);
Id sourceVar = mapping.sourceVar;
if(!sourceVars.contains(mapping.sourceVar))
sourceVars.push_back(mapping.sourceVar);
}
}
// Converting debug variable mappings to SourceVariableMapping is a two phase algorithm.
// Phase One
// For each source variable, repeatedly apply the debug variable mappings.
// This debug variable usage is tracked in a tree-like structure built using DebugVarNode
// elements.
// As each mapping is applied, the new mapping can fully or partially override the
// existing mapping. When an existing mapping is:
//  - fully overridden: any sub-elements of that mapping are cleared
//    i.e. assigning a vector, array, structure
//  - partially overridden: the existing mapping is expanded into its sub-elements which are
// mapped to the current mapping and then the new mapping is set to its corresponding
// elements i.e. y-component in a vector, member in a structure, a single array element
// The DebugVarNode member "emitSourceVar" determines if the DebugVar mapping should be
// converted to a source variable mapping.
// Phase Two
// The DebugVarNode tree is walked to find the nodes which have "emitSourceVar" set to true and
// then those nodes are converted to SourceVariableMapping
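// As an illustrative example, for a source variable "v" of vec4 type:
//   mapping 1: v   <- _a  (no indexes: full override, any children are cleared)
//   mapping 2: v.y <- _b  (partial: v is expanded into v.x ... v.w all mapped to _a, then
//                          the y child alone is remapped to _b)
// after which v.y is emitted from _b and the remaining components still come from _a.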
struct DebugVarNode
{
rdcarray<DebugVarNode> children;
Id debugVar;
rdcstr name;
rdcstr debugVarSuffix;
VarType type = VarType::Unknown;
uint32_t rows = 0;
uint32_t columns = 0;
uint32_t debugVarComponent = 0;
uint32_t offset = 0;
bool emitSourceVar = false;
};
::std::map<Id, DebugVarNode> roots;
// Phase One: generate the DebugVarNode tree by repeatedly applying debug variable mappings,
// updating existing mappings with later ones
for(size_t sv = 0; sv < sourceVars.size(); ++sv)
{
Id sourceVarId = sourceVars[sv];
const LocalData &l = m_DebugInfo.locals[sourceVarId];
// Convert processed mappings into a usage map
for(size_t m = 0; m < processed.size(); ++m)
{
const LocalMapping &mapping = processed[m];
if(mapping.sourceVar != sourceVarId)
continue;
const TypeData *typeWalk = l.type;
DebugVarNode *usage = &roots[sourceVarId];
if(usage->name.isEmpty())
{
usage->name = l.name;
usage->rows = 1U;
usage->columns = 1U;
}
// if it doesn't have indexes this is simple, set up a 1:1 map
if(mapping.indexes.isEmpty())
{
uint32_t rows = 1;
uint32_t columns = 1;
// skip past any pointer types to get the 'real' type that we'll see
while(typeWalk && typeWalk->baseType != Id() && typeWalk->type == VarType::GPUPointer)
typeWalk = &m_DebugInfo.types[typeWalk->baseType];
const uint32_t arrayDimension = typeWalk->arrayDimensions.size();
if(arrayDimension > 0)
{
// walk down until we get to a scalar type, if we get there. This means arrays of
// basic types will get the right type
while(typeWalk && typeWalk->baseType != Id() && typeWalk->type == VarType::Unknown)
typeWalk = &m_DebugInfo.types[typeWalk->baseType];
usage->type = typeWalk->type;
}
else if(!typeWalk->structMembers.empty())
{
usage->type = typeWalk->type;
}
if(typeWalk->matSize != 0)
{
const TypeData &vec = m_DebugInfo.types[typeWalk->baseType];
const TypeData &scalar = m_DebugInfo.types[vec.baseType];
usage->type = scalar.type;
if(typeWalk->colMajorMat)
{
rows = RDCMAX(1U, vec.vecSize);
columns = RDCMAX(1U, typeWalk->matSize);
}
else
{
columns = RDCMAX(1U, vec.vecSize);
rows = RDCMAX(1U, typeWalk->matSize);
}
}
else if(typeWalk->vecSize != 0)
{
const TypeData &scalar = m_DebugInfo.types[typeWalk->baseType];
usage->type = scalar.type;
columns = RDCMAX(1U, typeWalk->vecSize);
}
else
{
usage->type = typeWalk->type;
}
usage->debugVar = mapping.debugVar;
// Remove any child mappings: this mapping covers everything
usage->children.clear();
usage->emitSourceVar = true;
usage->rows = rows;
usage->columns = columns;
}
else
{
rdcarray<uint32_t> indexes = mapping.indexes;
// walk any aggregate types
while(!indexes.empty())
{
uint32_t idx = ~0U;
const TypeData *childType = NULL;
const uint32_t arrayDimension = typeWalk->arrayDimensions.size();
if(arrayDimension > 0)
{
const rdcarray<uint32_t> &dims = typeWalk->arrayDimensions;
uint32_t numIdxs = (uint32_t)indexes.size();
childType = &m_DebugInfo.types[typeWalk->baseType];
uint32_t childRows = 1U;
uint32_t childColumns = 1U;
VarType elementType = childType->type;
uint32_t elementSize = 1;
if(childType->matSize != 0)
{
const TypeData &vec = m_DebugInfo.types[childType->baseType];
const TypeData &scalar = m_DebugInfo.types[vec.baseType];
elementType = scalar.type;
if(childType->colMajorMat)
{
childRows = RDCMAX(1U, vec.vecSize);
childColumns = RDCMAX(1U, childType->matSize);
}
else
{
childColumns = RDCMAX(1U, vec.vecSize);
childRows = RDCMAX(1U, childType->matSize);
}
}
else if(childType->vecSize != 0)
{
const TypeData &scalar = m_DebugInfo.types[childType->baseType];
uint32_t vecColumns = RDCMAX(1U, childType->vecSize);
elementType = scalar.type;
childRows = 1U;
childColumns = vecColumns;
}
else if(!childType->structMembers.empty())
{
elementSize += childType->memberOffsets[childType->memberOffsets.count() - 1];
}
elementSize *= childRows * childColumns;
const uint32_t countDims = RDCMIN(arrayDimension, numIdxs);
// handle N dimensional arrays
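// e.g. for float arr[4][3] with indexes {2,1}: the first iteration consumes index 2
// against dimension 4, the second consumes index 1 against dimension 3, leaving the
// scalar element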
for(uint32_t d = 0; d < countDims; ++d)
{
idx = indexes[0];
indexes.erase(0);
uint32_t rows = dims[d];
usage->rows = rows;
usage->columns = 1U;
// Expand the node if required
if(usage->children.isEmpty())
{
usage->children.resize(rows);
for(uint32_t x = 0; x < rows; x++)
{
usage->children[x].debugVar = usage->debugVar;
rdcstr suffix = StringFormat::Fmt("[%u]", x);
usage->children[x].debugVarSuffix = usage->debugVarSuffix + suffix;
usage->children[x].name = usage->name + suffix;
usage->children[x].type = elementType;
usage->children[x].rows = childRows;
usage->children[x].columns = childColumns;
usage->children[x].offset = usage->offset + x * elementSize;
}
}
RDCASSERTEQUAL(usage->children.size(), rows);
// if the whole node was displayed: display the sub-elements
if(usage->emitSourceVar)
{
for(uint32_t x = 0; x < rows; x++)
usage->children[x].emitSourceVar = true;
usage->emitSourceVar = false;
}
usage = &usage->children[idx];
usage->type = childType->type;
typeWalk = childType;
}
}
else if(!typeWalk->structMembers.empty())
{
idx = indexes[0];
indexes.erase(0);
childType = &m_DebugInfo.types[typeWalk->structMembers[idx].second];
uint32_t rows = typeWalk->structMembers.size();
usage->rows = rows;
usage->columns = 1U;
// Expand the node if required
if(usage->children.isEmpty())
{
usage->children.resize(rows);
for(uint32_t x = 0; x < rows; x++)
{
rdcstr suffix = StringFormat::Fmt(".%s", typeWalk->structMembers[x].first.c_str());
usage->children[x].debugVar = usage->debugVar;
usage->children[x].debugVarSuffix = usage->debugVarSuffix + suffix;
usage->children[x].name = usage->name + suffix;
usage->children[x].offset = usage->offset + typeWalk->memberOffsets[x];
uint32_t memberRows = 1U;
uint32_t memberColumns = 1U;
const TypeData *memberType = &m_DebugInfo.types[typeWalk->structMembers[x].second];
VarType elementType = memberType->type;
if(memberType->matSize != 0)
{
const TypeData &vec = m_DebugInfo.types[memberType->baseType];
const TypeData &scalar = m_DebugInfo.types[vec.baseType];
elementType = scalar.type;
if(memberType->colMajorMat)
{
memberRows = RDCMAX(1U, vec.vecSize);
memberColumns = RDCMAX(1U, memberType->matSize);
}
else
{
memberColumns = RDCMAX(1U, vec.vecSize);
memberRows = RDCMAX(1U, memberType->matSize);
}
}
else if(memberType->vecSize != 0)
{
const TypeData &scalar = m_DebugInfo.types[memberType->baseType];
uint32_t vecColumns = RDCMAX(1U, memberType->vecSize);
elementType = scalar.type;
memberRows = 1U;
memberColumns = vecColumns;
}
usage->children[x].type = elementType;
usage->children[x].rows = memberRows;
usage->children[x].columns = memberColumns;
}
}
RDCASSERTEQUAL(usage->children.size(), rows);
// if the whole node was displayed: display the sub-elements
if(usage->emitSourceVar)
{
for(uint32_t x = 0; x < rows; x++)
usage->children[x].emitSourceVar = true;
usage->emitSourceVar = false;
}
usage = &usage->children[idx];
usage->type = childType->type;
typeWalk = childType;
}
else
{
break;
}
}
const char swizzle[] = "xyzw";
uint32_t rows = 1U;
uint32_t columns = 1U;
size_t countRemainingIndexes = indexes.size();
if(typeWalk->matSize != 0)
{
const TypeData &vec = m_DebugInfo.types[typeWalk->baseType];
const TypeData &scalar = m_DebugInfo.types[vec.baseType];
usage->type = scalar.type;
if(typeWalk->colMajorMat)
{
rows = RDCMAX(1U, vec.vecSize);
columns = RDCMAX(1U, typeWalk->matSize);
}
else
{
columns = RDCMAX(1U, vec.vecSize);
rows = RDCMAX(1U, typeWalk->matSize);
}
usage->rows = rows;
usage->columns = columns;
if((countRemainingIndexes == 2) || (countRemainingIndexes == 1))
{
if(usage->children.isEmpty())
{
// Matrices are stored as [row][col]
usage->children.resize(rows);
for(uint32_t r = 0; r < rows; ++r)
{
usage->children[r].emitSourceVar = false;
usage->children[r].name = usage->name + StringFormat::Fmt(".row%u", r);
usage->children[r].type = scalar.type;
usage->children[r].debugVar = usage->debugVar;
usage->children[r].debugVarComponent = 0;
usage->children[r].rows = 1U;
usage->children[r].columns = columns;
usage->children[r].offset = usage->offset + r * columns;
usage->children[r].children.resize(columns);
for(uint32_t c = 0; c < columns; ++c)
{
usage->children[r].children[c].emitSourceVar = false;
usage->children[r].children[c].name =
usage->name + StringFormat::Fmt(".row%u.%c", r, swizzle[RDCMIN(c, 3U)]);
usage->children[r].children[c].type = scalar.type;
usage->children[r].children[c].debugVar = usage->debugVar;
usage->children[r].children[c].debugVarComponent = r;
usage->children[r].children[c].rows = 1U;
usage->children[r].children[c].columns = 1U;
usage->children[r].children[c].offset = usage->children[r].offset + c;
}
}
}
RDCASSERTEQUAL(usage->children.size(), rows);
// two remaining indices select a scalar within the matrix
if(countRemainingIndexes == 2)
{
uint32_t row, col;
if(typeWalk->colMajorMat)
{
col = indexes[0];
row = indexes[1];
}
else
{
row = indexes[0];
col = indexes[1];
}
RDCASSERT(row < rows, row, rows);
RDCASSERT(col < columns, col, columns);
RDCASSERTEQUAL(usage->children[row].children.size(), columns);
usage->children[row].children[col].emitSourceVar =
!usage->children[row].emitSourceVar;
usage->children[row].children[col].debugVar = mapping.debugVar;
usage->children[row].children[col].debugVarComponent = 0;
// try to recombine matrix rows to a single source var display
if(!usage->children[row].emitSourceVar)
{
bool collapseVector = true;
for(uint32_t c = 0; c < columns; ++c)
{
collapseVector = usage->children[row].children[c].emitSourceVar;
if(!collapseVector)
break;
}
if(collapseVector)
{
usage->children[row].emitSourceVar = true;
for(uint32_t c = 0; c < columns; ++c)
usage->children[row].children[c].emitSourceVar = false;
}
}
}
else
{
if(typeWalk->colMajorMat)
{
uint32_t col = indexes[0];
RDCASSERT(col < columns, col, columns);
// one remaining index selects a column within the matrix.
// source vars are displayed as row-major, need <rows> mappings
for(uint32_t r = 0; r < rows; ++r)
{
RDCASSERTEQUAL(usage->children[r].children.size(), columns);
usage->children[r].children[col].emitSourceVar =
!usage->children[r].emitSourceVar;
usage->children[r].children[col].debugVar = mapping.debugVar;
usage->children[r].children[col].debugVarComponent = r;
}
}
else
{
uint32_t row = indexes[0];
RDCASSERT(row < rows, row, rows);
RDCASSERTEQUAL(usage->children.size(), rows);
RDCASSERTEQUAL(usage->children[row].children.size(), columns);
// one remaining index selects a row within the matrix.
// source vars are displayed as row-major, need <columns> mappings
for(uint32_t c = 0; c < columns; ++c)
{
usage->children[row].children[c].emitSourceVar =
!usage->children[row].emitSourceVar;
usage->children[row].children[c].debugVar = mapping.debugVar;
usage->children[row].children[c].debugVarComponent = c;
}
}
}
// try to recombine matrix rows to a single source var display
for(uint32_t r = 0; r < rows; ++r)
{
if(!usage->children[r].emitSourceVar)
{
bool collapseVector = true;
RDCASSERTEQUAL(usage->children[r].children.size(), columns);
for(uint32_t c = 0; c < columns; ++c)
{
collapseVector = usage->children[r].children[c].emitSourceVar;
if(!collapseVector)
break;
}
if(collapseVector)
{
usage->children[r].emitSourceVar = true;
for(uint32_t c = 0; c < columns; ++c)
usage->children[r].children[c].emitSourceVar = false;
}
}
}
usage->emitSourceVar = false;
}
else
{
RDCASSERTEQUAL(countRemainingIndexes, 0);
// Remove mappings: this mapping covers everything
usage->debugVar = mapping.debugVar;
usage->children.clear();
usage->emitSourceVar = true;
usage->debugVarSuffix.clear();
}
}
else if(typeWalk->vecSize != 0)
{
const TypeData &scalar = m_DebugInfo.types[typeWalk->baseType];
columns = RDCMAX(1U, typeWalk->vecSize);
usage->type = scalar.type;
usage->rows = 1U;
usage->columns = columns;
// remaining index selects a scalar within the vector
if(countRemainingIndexes == 1)
{
if(usage->children.isEmpty())
{
usage->children.resize(columns);
for(uint32_t x = 0; x < columns; ++x)
{
usage->children[x].emitSourceVar = usage->emitSourceVar;
usage->children[x].name =
usage->name + StringFormat::Fmt(".%c", swizzle[RDCMIN(x, 3U)]);
usage->children[x].type = scalar.type;
usage->children[x].debugVar = usage->debugVar;
usage->children[x].debugVarComponent = x;
usage->children[x].rows = 1U;
usage->children[x].columns = 1U;
usage->children[x].offset = usage->offset + x;
}
usage->emitSourceVar = false;
}
uint32_t col = indexes[0];
RDCASSERT(col < columns, col, columns);
RDCASSERTEQUAL(usage->children.size(), columns);
usage->children[col].debugVar = mapping.debugVar;
usage->children[col].debugVarComponent = 0;
usage->children[col].emitSourceVar = true;
// try to recombine vector to a single source var display
bool collapseVector = true;
for(uint32_t x = 0; x < columns; ++x)
{
collapseVector = usage->children[x].emitSourceVar;
if(!collapseVector)
break;
}
if(collapseVector)
{
usage->emitSourceVar = true;
for(uint32_t x = 0; x < columns; ++x)
usage->children[x].emitSourceVar = false;
}
}
else
{
RDCASSERTEQUAL(countRemainingIndexes, 0);
// Remove mappings: this mapping covers everything
usage->debugVar = mapping.debugVar;
usage->children.clear();
usage->emitSourceVar = true;
usage->debugVarSuffix.clear();
}
}
else
{
// walk down until we get to a scalar type, if we get there. This means arrays of
// basic types will get the right type
while(typeWalk && typeWalk->baseType != Id() && typeWalk->type == VarType::Unknown)
typeWalk = &m_DebugInfo.types[typeWalk->baseType];
usage->type = typeWalk->type;
usage->debugVar = mapping.debugVar;
usage->debugVarComponent = 0;
usage->rows = 1U;
usage->columns = 1U;
usage->emitSourceVar = true;
usage->children.clear();
usage->debugVarSuffix.clear();
}
}
}
}
// Phase Two: walk the DebugVarNode tree and convert "emitSourceVar = true" nodes to a SourceVariableMapping
for(size_t sv = 0; sv < sourceVars.size(); ++sv)
{
Id sourceVarId = sourceVars[sv];
DebugVarNode *usage = &roots[sourceVarId];
rdcarray<const DebugVarNode *> nodesToProcess;
rdcarray<const DebugVarNode *> sourceVarNodes;
nodesToProcess.push_back(usage);
while(!nodesToProcess.isEmpty())
{
const DebugVarNode *n = nodesToProcess.back();
nodesToProcess.pop_back();
if(n->emitSourceVar)
{
sourceVarNodes.push_back(n);
}
else
{
for(size_t x = 0; x < n->children.size(); ++x)
{
const DebugVarNode *child = &n->children[x];
nodesToProcess.push_back(child);
}
}
}
for(size_t x = 0; x < sourceVarNodes.size(); ++x)
{
const DebugVarNode *n = sourceVarNodes[x];
SourceVariableMapping sourceVar;
sourceVar.name = n->name;
sourceVar.type = n->type;
sourceVar.rows = n->rows;
sourceVar.columns = n->columns;
sourceVar.signatureIndex = -1;
sourceVar.offset = n->offset;
sourceVar.variables.clear();
// unknown is treated as a struct
if(sourceVar.type == VarType::Unknown)
sourceVar.type = VarType::Struct;
if(n->children.empty())
{
ConstIter it = GetID(n->debugVar);
if(it.opcode() == Op::Undef)
{
sourceVar.rows = sourceVar.columns = 1;
sourceVar.undefinedValue = true;
sourceVar.variables.push_back(DebugVariableReference(
DebugVariableType::Variable, GetRawName(n->debugVar) + n->debugVarSuffix, 0));
}
else
{
RDCASSERTNOTEQUAL(n->rows * n->columns, 0);
for(uint32_t c = 0; c < n->rows * n->columns; ++c)
{
sourceVar.variables.push_back(DebugVariableReference(
DebugVariableType::Variable, GetRawName(n->debugVar) + n->debugVarSuffix, c));
}
}
}
else
{
RDCASSERTEQUAL(n->rows * n->columns, (uint32_t)n->children.count());
for(int32_t c = 0; c < n->children.count(); ++c)
sourceVar.variables.push_back(DebugVariableReference(
DebugVariableType::Variable,
GetRawName(n->children[c].debugVar) + n->children[c].debugVarSuffix,
n->children[c].debugVarComponent));
}
i.sourceVars.push_back(sourceVar);
}
}
}
}
void Debugger::FillDefaultSourceVars(rdcarray<InstructionSourceInfo> &instInfo) const
{
rdcarray<SourceVariableMapping> sourceVars;
rdcarray<Id> debugVars;
for(InstructionSourceInfo &i : instInfo)
{
// the source vars for this instruction are whatever we have currently, because when we're
// looking up the source vars for instruction X we are effectively talking about the state just
// before X executes, not just after.
i.sourceVars = sourceVars;
// now update the sourcevars for after this instruction executed
size_t offs = instructionOffsets[i.instruction];
ConstIter it(m_SPIRV, offs);
OpDecoder opdata(it);
Id id = opdata.result;
// stores can bring their pointer into being, if it's the first write.
if(opdata.op == Op::Store)
id = OpStore(it).pointer;
// if this is the offset where the id's live range begins, try to add the source name for it if
// one exists.
if(id != Id() && idLiveRange[id].first == offs)
{
rdcstr name;
auto dyn = dynamicNames.find(id);
if(dyn != dynamicNames.end())
name = dyn->second;
else
name = strings[id];
if(!name.empty())
{
SourceVariableMapping sourceVar;
const DataType *type = &GetTypeForId(id);
while(type->type == DataType::PointerType || type->type == DataType::ArrayType)
type = &GetType(type->InnerType());
sourceVar.name = name;
sourceVar.offset = 0;
if(type->type == DataType::MatrixType || type->type == DataType::VectorType ||
type->type == DataType::ScalarType)
sourceVar.type = type->scalar().Type();
else if(type->type == DataType::StructType)
sourceVar.type = VarType::Struct;
else if(type->type == DataType::ImageType || type->type == DataType::SampledImageType ||
type->type == DataType::SamplerType)
sourceVar.type = VarType::ReadOnlyResource;
sourceVar.rows = RDCMAX(1U, (uint32_t)type->matrix().count);
sourceVar.columns = RDCMAX(1U, (uint32_t)type->vector().count);
rdcstr rawName = GetRawName(id);
for(uint32_t x = 0; x < sourceVar.rows * sourceVar.columns; x++)
sourceVar.variables.push_back(
DebugVariableReference(DebugVariableType::Variable, rawName, x));
sourceVars.push_back(sourceVar);
debugVars.push_back(id);
}
}
// see which vars have expired
for(size_t d = 0; d < debugVars.size();)
{
if(offs > idLiveRange[debugVars[d]].second)
{
sourceVars.erase(d);
debugVars.erase(d);
continue;
}
d++;
}
// all variables/IDs are function-local
if(opdata.op == Op::FunctionEnd)
{
sourceVars.clear();
debugVars.clear();
}
}
}
rdcarray<ShaderDebugState> Debugger::ContinueDebug()
{
ThreadState &active = GetActiveLane();
rdcarray<ShaderDebugState> ret;
shaderChangesReturn = NULL;
// initialise the first ShaderDebugState if we haven't stepped yet
if(steps == 0)
{
ShaderDebugState initial;
uint32_t startPoint = INVALID_EXECUTION_POINT;
// we should be sitting at the entry point function prologue, step forward into the first block
// and past any function-local variable declarations
for(size_t lane = 0; lane < workgroup.size(); lane++)
{
ThreadState &thread = workgroup[lane];
if(lane == activeLaneIndex)
{
thread.EnterEntryPoint(true);
FillCallstack(thread, initial);
initial.nextInstruction = thread.nextInstruction;
const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
initial.flags = pendingDebugState.flags;
initial.changes.append(pendingDebugState.changes);
startPoint = initial.nextInstruction;
}
else
{
thread.EnterEntryPoint(false);
}
}
// globals won't be filled out by entering the entry point, ensure their change is registered.
ShaderVariable val;
DeviceOpResult opResult;
for(const Id &v : liveGlobals)
{
opResult = GetPointerValue(active.ids[v], val);
RDCASSERTEQUAL(opResult, DeviceOpResult::Succeeded);
initial.changes.push_back({ShaderVariable(), val});
}
if(m_DebugInfo.valid)
{
// debug info can refer to constants for source variable values. Add an initial change for any
// that are so referenced
for(const Id &v : m_DebugInfo.constants)
{
opResult = GetPointerValue(active.ids[v], val);
RDCASSERTEQUAL(opResult, DeviceOpResult::Succeeded);
initial.changes.push_back({ShaderVariable(), val});
}
}
ret.push_back(std::move(initial));
// Set the initial execution point for the threads in the root tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = controlFlow.GetTangles();
RDCASSERTEQUAL(tangles.size(), 1);
RDCASSERTNOTEQUAL(startPoint, INVALID_EXECUTION_POINT);
for(Tangle &tangle : tangles)
{
RDCASSERT(tangle.IsAliveActive());
for(uint32_t threadIdx = 0; threadIdx < workgroup.size(); ++threadIdx)
{
if(!workgroup[threadIdx].Finished())
threadExecutionStates[threadIdx].push_back(startPoint);
}
}
controlFlow.UpdateState(threadExecutionStates);
steps++;
}
// if we've finished, return an empty set to signify that
if(active.Finished())
{
AtomicStore(&atomic_simulationFinished, 1);
Threading::JobSystem::SyncAllJobs();
return ret;
}
bool allStepsCompleted = true;
shaderChangesReturn = &ret;
// continue stepping until we have 1000000 target steps completed in a chunk. This may involve
// doing more steps if our target thread is inactive
for(int stepEnd = steps + 1000000; steps < stepEnd;)
{
global.clock++;
allStepsCompleted = true;
if(active.Finished() && !active.IsSimulationStepActive())
break;
// Execute the threads in each active tangle
ThreadExecutionStates threadExecutionStates;
TangleGroup &tangles = controlFlow.GetTangles();
bool anyActiveThreads = false;
for(const Tangle &tangle : tangles)
{
if(!tangle.IsAliveActive())
continue;
rdcarray<bool> activeMask;
// one bool per workgroup thread
activeMask.resize(workgroup.size());
// calculate the current active thread mask from the threads in the tangle
for(size_t i = 0; i < workgroup.size(); i++)
activeMask[i] = false;
const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
for(const ThreadReference &ref : threadRefs)
{
uint32_t lane = ref.id;
RDCASSERT(lane < workgroup.size(), lane, workgroup.size());
ThreadState &thread = workgroup[lane];
RDCASSERT(!thread.Finished());
activeMask[lane] = true;
anyActiveThreads = true;
}
// step all threads in the tangle
for(const ThreadReference &ref : threadRefs)
{
const uint32_t threadId = ref.id;
const uint32_t lane = threadId;
ThreadState &thread = workgroup[lane];
if(thread.nextInstruction >= instructionOffsets.size())
{
if(lane == activeLaneIndex)
ret.emplace_back();
continue;
}
RDCASSERTEQUAL(thread.activeMask.size(), activeMask.size());
memcpy(thread.activeMask.data(), activeMask.data(), activeMask.size() * sizeof(bool));
QueueJob(lane);
}
}
do
{
ProcessQueuedDebugMessages();
ProcessQueuedDeviceThreadSteps();
// Convert the simulation threads queued operations into pending operations i.e. GPU commands
ProcessQueuedOps();
// Sync any pending GPU operations and set the results to the pending threads
SyncPendingLanes();
allStepsCompleted = true;
for(const Tangle &tangle : tangles)
{
if(!tangle.IsAliveActive())
continue;
bool tangleStepsCompleted = true;
const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
for(const ThreadReference &ref : threadRefs)
{
const uint32_t threadId = ref.id;
const uint32_t lane = threadId;
ThreadState &thread = workgroup[lane];
if(thread.IsSimulationStepActive())
{
tangleStepsCompleted = false;
break;
}
}
if(!tangleStepsCompleted)
{
allStepsCompleted = false;
break;
}
}
} while(!allStepsCompleted);
for(Tangle &tangle : tangles)
{
if(!tangle.IsAliveActive())
continue;
const rdcarray<ThreadReference> &threadRefs = tangle.GetThreadRefs();
#if ENABLED(RDOC_DEVEL)
for(const ThreadReference &ref : threadRefs)
{
const uint32_t threadId = ref.id;
const uint32_t lane = threadId;
ThreadState &thread = workgroup[lane];
RDCASSERT(!thread.IsSimulationStepActive());
}
#endif // #if ENABLED(RDOC_DEVEL)
ExecutionPoint newConvergeInstruction = INVALID_EXECUTION_POINT;
ExecutionPoint newFunctionReturnPoint = INVALID_EXECUTION_POINT;
uint32_t countActiveThreads = 0;
uint32_t countDivergedThreads = 0;
uint32_t countIdenticalConvergePointThreads = 0;
uint32_t countFunctionReturnThreads = 0;
// Update the control flow state
for(const ThreadReference &ref : threadRefs)
{
const uint32_t threadId = ref.id;
const uint32_t lane = threadId;
ThreadState &thread = workgroup[lane];
if(thread.nextInstruction >= instructionOffsets.size())
{
tangle.SetThreadDead(threadId);
continue;
}
bool wasActive = !thread.Finished();
threadExecutionStates[threadId] = thread.GetEnteredPoints();
uint32_t threadConvergeInstruction = thread.GetConvergenceInstruction();
tangle.SetThreadMergePoint(threadId, threadConvergeInstruction);
// the thread activated a new convergence point
if(threadConvergeInstruction != INVALID_EXECUTION_POINT)
{
wasActive = true;
if(newConvergeInstruction == INVALID_EXECUTION_POINT)
{
newConvergeInstruction = threadConvergeInstruction;
RDCASSERTNOTEQUAL(newConvergeInstruction, INVALID_EXECUTION_POINT);
}
if(newConvergeInstruction == threadConvergeInstruction)
++countIdenticalConvergePointThreads;
}
uint32_t threadFunctionReturnPoint = thread.GetFunctionReturnPoint();
// the thread activated a new function return point
if(threadFunctionReturnPoint != INVALID_EXECUTION_POINT)
{
wasActive = true;
if(newFunctionReturnPoint == INVALID_EXECUTION_POINT)
{
newFunctionReturnPoint = threadFunctionReturnPoint;
RDCASSERTNOTEQUAL(newFunctionReturnPoint, INVALID_EXECUTION_POINT);
}
else
{
// All the threads in the tangle should set the same function return point
RDCASSERTEQUAL(threadFunctionReturnPoint, newFunctionReturnPoint);
}
++countFunctionReturnThreads;
}
if(thread.IsDiverged())
{
wasActive = true;
++countDivergedThreads;
}
if(thread.Finished())
tangle.SetThreadDead(threadId);
countActiveThreads += wasActive ? 1 : 0;
}
for(const ThreadReference &ref : threadRefs)
{
const uint32_t threadId = ref.id;
const uint32_t lane = threadId;
workgroup[lane].currentInstruction = workgroup[lane].nextInstruction;
}
// If the tangle has a common merge point set it here (this will clear the thread merge point)
// otherwise the convergence point will come from the threads during control flow divergence processing
if(countIdenticalConvergePointThreads == countActiveThreads)
tangle.AddMergePoint(newConvergeInstruction);
if(countFunctionReturnThreads)
{
// all the active threads should have a function return point if any have one
RDCASSERTEQUAL(countFunctionReturnThreads, countActiveThreads);
tangle.AddFunctionReturnPoint(newFunctionReturnPoint);
}
if(countDivergedThreads)
{
// all the active threads should have diverged if any diverges
RDCASSERTEQUAL(countDivergedThreads, countActiveThreads);
tangle.SetDiverged(true);
}
}
if(!anyActiveThreads)
{
active.dead = true;
controlFlow.UpdateState(threadExecutionStates);
RDCERR("No active threads in any tangle, killing active thread to terminate the debugger");
}
controlFlow.UpdateState(threadExecutionStates);
}
RDCASSERT(allStepsCompleted);
shaderChangesReturn = NULL;
return ret;
}
ShaderVariable Debugger::MakeTypedPointer(uint64_t value, const DataType &type) const
{
rdcspv::Id typeId = type.InnerType();
ShaderVariable var;
var.rows = var.columns = 1;
var.type = VarType::GPUPointer;
var.SetTypedPointer(value, apiWrapper ? apiWrapper->GetShaderID() : ResourceId(),
idToPointerType[typeId]);
const Decorations &dec = decorations[type.id];
if(dec.flags & Decorations::HasArrayStride)
{
uint32_t arrayStride = dec.arrayStride;
setArrayStride(var, arrayStride);
}
return var;
}
ShaderVariable Debugger::MakePointerVariable(Id id, const ShaderVariable *v, uint8_t scalar0,
uint8_t scalar1) const
{
ShaderVariable var;
var.rows = var.columns = 1;
var.type = VarType::GPUPointer;
var.name = GetRawName(id);
var.SetTypedPointer(0, ResourceId(), OpaquePointerTypeID);
setPointer(var, v);
setScalars(var, scalar0, scalar1);
setBaseId(var, id);
return var;
}
ShaderVariable Debugger::MakeCompositePointer(const ShaderVariable &base, Id id,
rdcarray<uint32_t> &indices) const
{
const ShaderVariable *leaf = &base;
bool physicalPointer = IsPhysicalPointer(base);
bool isArray = false;
if(!physicalPointer)
{
// if the base is a plain value, we just start walking down the chain. If the base is a pointer
// though, we want to step down the chain in the underlying storage, so dereference first.
if(base.type == VarType::GPUPointer)
leaf = getPointer(base);
}
// if this is an arrayed opaque binding, the first index is a 'virtual' array index into the
// binding.
// We only take this if this is the FIRST dereference from the global pointer - once an
// access chain has set a buffer type ID (e.g. if the SPIR-V stores an intermediate pointer
// like structType *_1234 and chains again from it), the array index must not be consumed a
// second time.
if((leaf->type == VarType::ReadWriteResource || leaf->type == VarType::ReadOnlyResource ||
leaf->type == VarType::Sampler) &&
checkPointerFlags(*leaf, PointerFlags::GlobalArrayBinding) &&
getBufferTypeId(base) == rdcspv::Id())
{
isArray = true;
}
if((leaf->type == VarType::ReadWriteResource && checkPointerFlags(*leaf, PointerFlags::SSBO)) ||
physicalPointer)
{
ShaderVariable ret;
uint64_t byteOffset = 0;
const DataType *type = NULL;
if(physicalPointer)
{
// work purely with the pointer itself. All we're going to do effectively is move the address
// and set the sub-type pointed to so that we know how to dereference it later
ret = *leaf;
// if this hasn't been dereferenced yet we should have a valid pointer type ID for a physical
// pointer, which we can then use to look up the pointed-to type (and there will be no buffer
// type ID below). If not, we rely on the buffer type ID.
if(!checkPointerFlags(ret, PointerFlags::DereferencedPhysical))
{
rdcspv::Id typeId = pointerTypeToId[ret.GetPointer().pointerTypeID];
RDCASSERT(typeId != rdcspv::Id());
type = &dataTypes[typeId];
}
}
else
{
ret = MakePointerVariable(id, leaf);
byteOffset = getByteOffset(base);
type = &dataTypes[idTypes[id]];
RDCASSERT(type->type == DataType::PointerType);
type = &dataTypes[type->InnerType()];
}
setMatrixStride(ret, getMatrixStride(base));
setPointerFlags(ret, getPointerFlags(base));
rdcspv::Id typeId = getBufferTypeId(base);
if(typeId != rdcspv::Id())
type = &dataTypes[typeId];
// first walk any aggregate types
size_t i = 0;
// if it's an array, consume the array index first
if(isArray)
{
setBindArrayIndex(ret, indices[i++]);
type = &dataTypes[type->InnerType()];
}
else
{
setBindArrayIndex(ret, getBindArrayIndex(base));
}
Decorations curDecorations = decorations[type->id];
uint32_t arrayStride = 0;
while(i < indices.size() &&
(type->type == DataType::ArrayType || type->type == DataType::StructType))
{
if(type->type == DataType::ArrayType)
{
// look up the array stride
const Decorations &dec = decorations[type->id];
RDCASSERT(dec.flags & Decorations::HasArrayStride);
// offset increases by index * arrayStride
arrayStride = dec.arrayStride;
byteOffset += indices[i] * arrayStride;
// new type is the inner type
type = &dataTypes[type->InnerType()];
}
else
{
// otherwise it's a struct member
const DataType::Child &child = type->children[indices[i]];
// offset increases by member offset
RDCASSERT(child.decorations.flags & Decorations::HasOffset);
byteOffset += child.decorations.offset;
// new type is the child type
type = &dataTypes[child.type];
curDecorations = child.decorations;
}
i++;
}
size_t remaining = indices.size() - i;
if(remaining == 2)
{
// pointer to a scalar in a matrix. indices[i] is column, indices[i + 1] is row
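// e.g. for a column-major matrix with matrixStride 16 and 32-bit scalars, indices {2,1}
// (column 2, row 1) give a byte offset of 16*2 + 1*4 = 36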
RDCASSERT(curDecorations.flags & Decorations::HasMatrixStride);
// type is the resulting scalar (first inner does matrix -> column type, second does column
// type -> scalar type)
type = &dataTypes[dataTypes[type->InnerType()].InnerType()];
if(curDecorations.flags & Decorations::RowMajor)
{
byteOffset +=
curDecorations.matrixStride * indices[i + 1] + indices[i] * (type->scalar().width / 8);
}
else
{
byteOffset +=
curDecorations.matrixStride * indices[i] + indices[i + 1] * (type->scalar().width / 8);
}
}
else if(remaining == 1)
{
if(type->type == DataType::VectorType)
{
// pointer to a scalar in a vector.
// type is the resulting scalar (inner does vector type -> scalar type)
type = &dataTypes[type->InnerType()];
byteOffset += indices[i] * (type->scalar().width / 8);
}
else
{
// pointer to a column in a matrix
RDCASSERT(curDecorations.flags & Decorations::HasMatrixStride);
// type is the resulting vector
type = &dataTypes[type->InnerType()];
if(curDecorations.flags & Decorations::RowMajor)
{
byteOffset += indices[i] * (type->scalar().width / 8);
}
else
{
byteOffset += curDecorations.matrixStride * indices[i];
}
}
}
if(curDecorations.flags & Decorations::HasMatrixStride)
setMatrixStride(ret, curDecorations.matrixStride);
if(curDecorations.flags & Decorations::RowMajor)
enablePointerFlags(ret, PointerFlags::RowMajorMatrix);
else if(curDecorations.flags & Decorations::ColMajor)
disablePointerFlags(ret, PointerFlags::RowMajorMatrix);
setBufferTypeId(ret, type->id);
setArrayStride(ret, arrayStride);
if(physicalPointer)
{
PointerVal ptrval = ret.GetPointer();
// we use the opaque type ID to ensure we don't accidentally leak the wrong type ID.
// we check where the pointer is dereferenced to use the physical address instead of the inner
// binding
ret.SetTypedPointer(ptrval.pointer + byteOffset, ptrval.shader, OpaquePointerTypeID);
}
else
{
setByteOffset(ret, byteOffset);
}
// this flag is only used for physical pointers, to indicate that it's been dereferenced and
// the pointer type should be fetched from our ID above and it returned as a plain value, rather
// than showing the pointer 'natively'. This is because we may not have a pointer type to
// reference if e.g. the only pointer type registered is struct foo { } and we've dereferenced
// into inner struct bar { }
//
// effectively physical pointers currently decay into opaque pointers after any access chain
// (but opaque ones that still use an address, rather than pointing at a global inner value
// as other opaque pointers do)
if(physicalPointer)
enablePointerFlags(ret, PointerFlags::DereferencedPhysical);
return ret;
}
// first walk any struct member/array indices
size_t i = 0;
if(isArray)
i++;
while(i < indices.size() && !leaf->members.empty())
{
uint32_t idx = indices[i++];
if(idx >= leaf->members.size())
{
AddDebugMessage(MessageCategory::Execution, MessageSeverity::High,
MessageSource::RuntimeWarning,
StringFormat::Fmt("Index %u invalid at leaf %s. Clamping to %zu", idx,
leaf->name.c_str(), leaf->members.size() - 1));
idx = uint32_t(leaf->members.size() - 1);
}
leaf = &leaf->members[idx];
}
// apply any remaining scalar selectors
uint8_t scalar0 = 0xff, scalar1 = 0xff;
size_t remaining = indices.size() - i;
if(remaining > 2)
{
AddDebugMessage(
MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning,
StringFormat::Fmt("Too many indices left (%zu) at leaf %s. Ignoring all but last two",
remaining, leaf->name.c_str()));
i = indices.size() - 2;
}
if(remaining == 2)
{
scalar0 = indices[i] & 0xff;
scalar1 = indices[i + 1] & 0xff;
}
else if(remaining == 1)
{
scalar0 = indices[i] & 0xff;
}
ShaderVariable ret = MakePointerVariable(id, leaf, scalar0, scalar1);
if(isArray)
setBindArrayIndex(ret, indices[0]);
return ret;
}
uint64_t Debugger::GetPointerByteOffset(const ShaderVariable &ptr) const
{
return getByteOffset(ptr);
}
DebugAPIWrapper::TextureType Debugger::GetTextureType(const ShaderVariable &img) const
{
return getTextureType(img);
}
DeviceOpResult Debugger::GetPointerValue(const ShaderVariable &ptr, ShaderVariable &ret) const
{
// opaque pointers display as their inner value
if(IsOpaquePointer(ptr))
{
const ShaderVariable *inner = getPointer(ptr);
ret = *inner;
ret.name = ptr.name;
// inherit any array index from the pointer
ShaderBindIndex bind = ret.GetBindIndex();
bind.arrayElement = getBindArrayIndex(ptr);
ret.SetBindIndex(bind);
return DeviceOpResult::Succeeded;
}
// physical pointers which haven't been dereferenced are returned as-is, they're ready for display
else if(IsPhysicalPointer(ptr) && !checkPointerFlags(ptr, PointerFlags::DereferencedPhysical))
{
ret = ptr;
return DeviceOpResult::Succeeded;
}
// every other kind of pointer displays as its contents
return ReadFromPointer(ptr, ret);
}
DeviceOpResult Debugger::ReadFromPointer(const ShaderVariable &ptr, ShaderVariable &ret) const
{
if(ptr.type != VarType::GPUPointer)
{
ret = ptr;
return DeviceOpResult::Succeeded;
}
if(isUndefPointer(ptr))
{
ret = ShaderVariable(ptr.name, 0, 0, 0, 0);
memset(&ret.value, 0xcc, sizeof(ret.value));
return DeviceOpResult::Succeeded;
}
// values for setting up pointer reads, either from a physical pointer or from an opaque pointer
rdcspv::Id typeId;
Decorations parentDecorations;
uint64_t baseAddress;
ShaderBindIndex bind;
uint64_t byteOffset = 0;
std::function<void(uint64_t offset, uint64_t size, void *dst)> pointerReadCallback;
if(IsPhysicalPointer(ptr))
{
baseAddress = ptr.GetPointer().pointer;
if(!IsDeviceThread() && !apiWrapper->IsBufferCached(baseAddress))
return DeviceOpResult::NeedsDevice;
if(checkPointerFlags(ptr, PointerFlags::DereferencedPhysical))
typeId = getBufferTypeId(ptr);
else
typeId = pointerTypeToId[ptr.GetPointer().pointerTypeID];
RDCASSERT(typeId != rdcspv::Id());
parentDecorations = decorations[typeId];
uint32_t varMatrixStride = getMatrixStride(ptr);
if(varMatrixStride != 0)
{
if(checkPointerFlags(ptr, PointerFlags::RowMajorMatrix))
parentDecorations.flags = Decorations::RowMajor;
else
parentDecorations.flags = Decorations::ColMajor;
parentDecorations.flags =
Decorations::Flags(parentDecorations.flags | Decorations::HasMatrixStride);
parentDecorations.matrixStride = varMatrixStride;
}
pointerReadCallback = [this, baseAddress](uint64_t offset, uint64_t size, void *dst) {
apiWrapper->ReadAddress(baseAddress + offset, size, dst);
};
}
else
{
const ShaderVariable *inner = getPointer(ptr);
if(inner == NULL)
{
ret = ShaderVariable(ptr.name, 0, 0, 0, 0);
return DeviceOpResult::Succeeded;
}
if(inner->type == VarType::ReadWriteResource && checkPointerFlags(*inner, PointerFlags::SSBO))
{
typeId = getBufferTypeId(ptr);
byteOffset = getByteOffset(ptr);
bind = inner->GetBindIndex();
bind.arrayElement = getBindArrayIndex(ptr);
if(!IsDeviceThread() && !apiWrapper->IsBufferCached(bind))
return DeviceOpResult::NeedsDevice;
uint32_t varMatrixStride = getMatrixStride(ptr);
if(varMatrixStride != 0)
{
if(checkPointerFlags(ptr, PointerFlags::RowMajorMatrix))
parentDecorations.flags = Decorations::RowMajor;
else
parentDecorations.flags = Decorations::ColMajor;
parentDecorations.flags =
Decorations::Flags(parentDecorations.flags | Decorations::HasMatrixStride);
parentDecorations.matrixStride = varMatrixStride;
}
pointerReadCallback = [this, bind](uint64_t offset, uint64_t size, void *dst) {
apiWrapper->ReadBufferValue(bind, offset, size, dst);
};
}
}
if(pointerReadCallback)
{
auto readCallback = [this, pointerReadCallback](ShaderVariable &var, const Decorations &dec,
const DataType &type, uint64_t offset,
const rdcstr &) {
// ignore any callbacks we get on the way up for structs/arrays; we don't need them, we only
// read or write at primitive level
if(!var.members.empty())
return;
bool rowMajor = (dec.flags & Decorations::RowMajor) != 0;
uint32_t matrixStride = dec.matrixStride;
if(type.type == DataType::MatrixType)
{
RDCASSERT(matrixStride != 0);
if(rowMajor)
{
for(uint8_t r = 0; r < var.rows; r++)
{
pointerReadCallback(offset + r * matrixStride, VarTypeByteSize(var.type) * var.columns,
VarElemPointer(var, r * var.columns));
}
}
else
{
ShaderVariable tmp;
tmp.type = var.type;
// read column-wise
for(uint8_t c = 0; c < var.columns; c++)
{
pointerReadCallback(offset + c * matrixStride, VarTypeByteSize(var.type) * var.rows,
VarElemPointer(tmp, c * var.rows));
}
// transpose into our row major storage
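// e.g. a 2x3 matrix read as columns {c0r0,c0r1, c1r0,c1r1, c2r0,c2r1} is rearranged so
// that component r*columns+c holds element (r,c)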
for(uint8_t r = 0; r < var.rows; r++)
for(uint8_t c = 0; c < var.columns; c++)
copyComp(var, r * var.columns + c, tmp, c * var.rows + r);
}
}
else if(type.type == DataType::VectorType)
{
if(!rowMajor)
{
// we can read a vector at a time if the matrix is column major
pointerReadCallback(offset, VarTypeByteSize(var.type) * var.columns,
VarElemPointer(var, 0));
}
else
{
for(uint8_t c = 0; c < var.columns; c++)
{
pointerReadCallback(offset + c * matrixStride, VarTypeByteSize(var.type),
VarElemPointer(var, c));
}
}
}
else if(type.type == DataType::ScalarType || type.type == DataType::PointerType)
{
pointerReadCallback(offset, VarTypeByteSize(var.type), VarElemPointer(var, 0));
if(type.type == DataType::PointerType)
{
auto it = idToPointerType.find(type.InnerType());
if(it != idToPointerType.end())
{
var.SetTypedPointer(var.value.u64v[0], this->apiWrapper->GetShaderID(), it->second);
}
else
{
var.SetTypedPointer(var.value.u64v[0], ResourceId(), OpaquePointerTypeID);
enablePointerFlags(var, PointerFlags::DereferencedPhysical);
setMatrixStride(var, matrixStride);
setBufferTypeId(var, type.InnerType());
}
}
}
};
WalkVariable<ShaderVariable, true>(parentDecorations, dataTypes[typeId], byteOffset, false, ret,
rdcstr(), readCallback);
ret.name = ptr.name;
return DeviceOpResult::Succeeded;
}
// this is the case of 'reading' from a pointer where the data is entirely contained within the
// inner pointed variable. Either opaque sampler/image etc which is just the binding, or a
// cbuffer pointer which was already evaluated
const ShaderVariable *inner = getPointer(ptr);
ret = *inner;
ret.name = ptr.name;
if(inner->type == VarType::ReadOnlyResource || inner->type == VarType::ReadWriteResource ||
inner->type == VarType::Sampler)
{
bind = ret.GetBindIndex();
bind.arrayElement = getBindArrayIndex(ptr);
ret.SetBindIndex(bind);
}
// we don't support pointers to scalars since our 'unit' of pointer is a ShaderVariable, so check
// if we have scalar indices to apply:
uint8_t scalar0 = 0, scalar1 = 0;
rdctie(scalar0, scalar1) = getScalars(ptr);
ShaderVariable tmp = ret;
if(ret.rows > 1)
{
// matrix case
ClampScalars(ret, scalar0, scalar1);
if(scalar0 != 0xff && scalar1 != 0xff)
{
// two indices - selecting a scalar. scalar0 is the first index in the chain so it chooses
// column
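// e.g. scalar0=2, scalar1=1 on a 4x4 matrix picks row 1, column 2: component 1*4 + 2 = 6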
copyComp(ret, 0, tmp, scalar1 * ret.columns + scalar0);
// it's a scalar now, even if it was a matrix before
ret.rows = ret.columns = 1;
}
else if(scalar0 != 0xff)
{
// one index, selecting a column
for(uint32_t row = 0; row < ret.rows; row++)
copyComp(ret, row, tmp, row * ret.columns + scalar0);
// it's a vector now, even if it was a matrix before.
// since we have the convention of row vectors in RenderDoc, adjust the size too
ret.columns = ret.rows;
ret.rows = 1;
}
}
else
{
ClampScalars(ret, scalar0);
// vector case, selecting a scalar (if anything)
if(scalar0 != 0xff)
{
copyComp(ret, 0, tmp, scalar0);
// it's a scalar now, even if it was a vector before
ret.columns = 1;
}
}
return DeviceOpResult::Succeeded;
}
Id Debugger::GetPointerBaseId(const ShaderVariable &ptr) const
{
RDCASSERT(ptr.type == VarType::GPUPointer);
// we stored the base ID so that it's always available regardless of access chains
return getBaseId(ptr);
}
uint32_t Debugger::GetPointerArrayStride(const ShaderVariable &ptr) const
{
RDCASSERT(ptr.type == VarType::GPUPointer);
return getArrayStride(ptr);
}
bool Debugger::IsOpaquePointer(const ShaderVariable &ptr) const
{
if(ptr.type != VarType::GPUPointer)
return false;
if(IsPhysicalPointer(ptr))
return false;
const ShaderVariable *inner = getPointer(ptr);
return inner->type == VarType::ReadOnlyResource || inner->type == VarType::Sampler ||
inner->type == VarType::ReadWriteResource;
}
bool Debugger::IsPhysicalPointer(const ShaderVariable &ptr) const
{
if(ptr.type == VarType::GPUPointer)
{
// non-dereferenced physical pointer
if(ptr.GetPointer().pointerTypeID != OpaquePointerTypeID)
return true;
// dereferenced physical pointer
if(checkPointerFlags(ptr, PointerFlags::DereferencedPhysical))
return true;
}
return false;
}
bool Debugger::ArePointersAndEqual(const ShaderVariable &a, const ShaderVariable &b) const
{
// we can do a pointer comparison by checking the values, since we store all pointer-related
// data in there
if(a.type == VarType::GPUPointer && b.type == VarType::GPUPointer)
return memcmp(&a.value, &b.value, sizeof(ShaderValue)) == 0;
return false;
}
DeviceOpResult Debugger::WriteThroughPointer(ShaderVariable &ptr, const ShaderVariable &val) const
{
// values for setting up pointer reads, either from a physical pointer or from an opaque pointer
rdcspv::Id typeId;
Decorations parentDecorations;
uint64_t baseAddress;
ShaderBindIndex bind;
uint64_t byteOffset = 0;
std::function<void(uint64_t offset, uint64_t size, const void *src)> pointerWriteCallback;
if(IsPhysicalPointer(ptr))
{
baseAddress = ptr.GetPointer().pointer;
if(!IsDeviceThread() && !apiWrapper->IsBufferCached(baseAddress))
return DeviceOpResult::NeedsDevice;
if(checkPointerFlags(ptr, PointerFlags::DereferencedPhysical))
typeId = getBufferTypeId(ptr);
else
typeId = pointerTypeToId[ptr.GetPointer().pointerTypeID];
RDCASSERT(typeId != rdcspv::Id());
parentDecorations = decorations[typeId];
uint32_t varMatrixStride = getMatrixStride(ptr);
if(varMatrixStride != 0)
{
if(checkPointerFlags(ptr, PointerFlags::RowMajorMatrix))
parentDecorations.flags = Decorations::RowMajor;
else
parentDecorations.flags = Decorations::ColMajor;
parentDecorations.flags =
Decorations::Flags(parentDecorations.flags | Decorations::HasMatrixStride);
parentDecorations.matrixStride = varMatrixStride;
}
pointerWriteCallback = [this, baseAddress](uint64_t offset, uint64_t size, const void *src) {
apiWrapper->WriteAddress(baseAddress + offset, size, src);
};
}
else
{
const ShaderVariable *inner = getPointer(ptr);
if(inner->type == VarType::ReadWriteResource && checkPointerFlags(*inner, PointerFlags::SSBO))
{
typeId = getBufferTypeId(ptr);
byteOffset = getByteOffset(ptr);
bind = inner->GetBindIndex();
bind.arrayElement = getBindArrayIndex(ptr);
if(!IsDeviceThread() && !apiWrapper->IsBufferCached(bind))
return DeviceOpResult::NeedsDevice;
uint32_t varMatrixStride = getMatrixStride(ptr);
if(varMatrixStride != 0)
{
if(checkPointerFlags(ptr, PointerFlags::RowMajorMatrix))
parentDecorations.flags = Decorations::RowMajor;
else
parentDecorations.flags = Decorations::ColMajor;
parentDecorations.flags =
Decorations::Flags(parentDecorations.flags | Decorations::HasMatrixStride);
parentDecorations.matrixStride = varMatrixStride;
}
pointerWriteCallback = [this, bind](uint64_t offset, uint64_t size, const void *src) {
apiWrapper->WriteBufferValue(bind, offset, size, src);
};
}
}
if(pointerWriteCallback)
{
if(!IsDeviceThread())
return DeviceOpResult::NeedsDevice;
auto writeCallback = [pointerWriteCallback](const ShaderVariable &var, const Decorations &dec,
const DataType &type, uint64_t offset,
const rdcstr &) {
// ignore any callbacks we get on the way up for structs/arrays; we don't need them, we only
// read or write at primitive level
if(!var.members.empty())
return;
bool rowMajor = (dec.flags & Decorations::RowMajor) != 0;
uint32_t matrixStride = dec.matrixStride;
if(type.type == DataType::MatrixType)
{
RDCASSERT(matrixStride != 0);
if(rowMajor)
{
for(uint8_t r = 0; r < var.rows; r++)
{
pointerWriteCallback(offset + r * matrixStride, VarTypeByteSize(var.type) * var.columns,
VarElemPointer(var, r * var.columns));
}
}
else
{
ShaderVariable tmp;
tmp.type = var.type;
// transpose from our row major storage
for(uint8_t r = 0; r < var.rows; r++)
for(uint8_t c = 0; c < var.columns; c++)
copyComp(tmp, c * var.rows + r, var, r * var.columns + c);
// write column-wise
for(uint8_t c = 0; c < var.columns; c++)
{
pointerWriteCallback(offset + c * matrixStride, VarTypeByteSize(var.type) * var.rows,
VarElemPointer(tmp, c * var.rows));
}
}
}
else if(type.type == DataType::VectorType)
{
if(!rowMajor)
{
// we can write a vector at a time if the matrix is column major
pointerWriteCallback(offset, VarTypeByteSize(var.type) * var.columns,
VarElemPointer(var, 0));
}
else
{
for(uint8_t c = 0; c < var.columns; c++)
pointerWriteCallback(offset + c * matrixStride, VarTypeByteSize(var.type),
VarElemPointer(var, c));
}
}
else if(type.type == DataType::ScalarType || type.type == DataType::PointerType)
{
pointerWriteCallback(offset, VarTypeByteSize(var.type), VarElemPointer(var, 0));
}
};
WalkVariable<const ShaderVariable, false>(parentDecorations, dataTypes[typeId], byteOffset,
false, val, rdcstr(), writeCallback);
return DeviceOpResult::Succeeded;
}
ShaderVariable *storage = getPointer(ptr);
// we don't support pointers to scalars since our 'unit' of pointer is a ShaderVariable, so check
// if we have scalar indices to apply:
uint8_t scalar0 = 0, scalar1 = 0;
rdctie(scalar0, scalar1) = getScalars(ptr);
// in the common case we don't have scalar selectors. In this case just assign the value
if(scalar0 == 0xff && scalar1 == 0xff)
{
AssignValue(*storage, val);
}
else
{
// otherwise we need to store only the selected part of this pointer. We assume by SPIR-V
// validity rules that the incoming value matches the pointed value
if(storage->rows > 1)
{
// matrix case
ClampScalars(*storage, scalar0, scalar1);
if(scalar0 != 0xff && scalar1 != 0xff)
{
// two indices - selecting a scalar. scalar0 is the first index in the chain so it chooses
// column
copyComp(*storage, scalar1 * storage->columns + scalar0, val, 0);
}
else if(scalar0 != 0xff)
{
// one index, selecting a column
for(uint32_t row = 0; row < storage->rows; row++)
copyComp(*storage, row * storage->columns + scalar0, val, row);
}
}
else
{
ClampScalars(*storage, scalar0);
// vector case, selecting a scalar
copyComp(*storage, scalar0, val, 0);
}
}
return DeviceOpResult::Succeeded;
}
rdcstr Debugger::GetHumanName(Id id) const
{
{
SCOPED_READLOCK(dynamicNamesLock);
// see if we have a dynamic name assigned (to disambiguate), if so use that
auto it = dynamicNames.find(id);
if(it != dynamicNames.end())
return it->second;
}
// otherwise try the string first
rdcstr name = strings[id];
// if we don't have a string name, we can be sure the id is unambiguous
if(name.empty())
return GetRawName(id);
rdcstr basename = name;
// otherwise check to see if it's been used before. If so give it a new name
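// e.g. three separate ids all named "temp" become "temp", "temp@2" and "temp@3"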
{
SCOPED_READLOCK(dynamicNamesLock);
int alias = 2;
while(usedNames.find(name) != usedNames.end())
{
name = basename + "@" + ToStr(alias);
alias++;
}
}
{
SCOPED_WRITELOCK(dynamicNamesLock);
usedNames.insert(name);
dynamicNames[id] = name;
}
return name;
}
void Debugger::AllocateVariable(Id id, Id typeId, ShaderVariable &outVar) const
{
// allocs should always be pointers
RDCASSERT(dataTypes[typeId].type == DataType::PointerType);
auto initCallback = [](ShaderVariable &var, const Decorations &, const DataType &, uint64_t,
const rdcstr &) {
// ignore any callbacks we get on the way up for structs/arrays; we don't need them, we only
// read or write at primitive level
if(!var.members.empty())
return;
// make it obvious when uninitialised values are used
memset(&var.value, 0xcc, sizeof(var.value));
};
WalkVariable<ShaderVariable, true>(Decorations(), dataTypes[dataTypes[typeId].InnerType()], ~0U,
false, outVar, rdcstr(), initCallback);
}
template <typename ShaderVarType, bool allocate>
uint32_t Debugger::WalkVariable(
const Decorations &curDecorations, const DataType &type, uint64_t offsetOrLocation,
bool locationUniform, ShaderVarType &var, const rdcstr &accessSuffix,
std::function<void(ShaderVarType &, const Decorations &, const DataType &, uint64_t, const rdcstr &)>
callback) const
{
// if we're walking a const variable we just want to walk it without modification. So outVar
// is NULL. Otherwise outVar points to the variable itself so we modify it before iterating
ShaderVariable *outVar = allocate ? pointerIfMutable(var) : NULL;
// the Location decoration should either be on the variable itself (in which case we hit this
// first thing), or on the first member of a struct. i.e. once we have a location already and
// we're auto-assigning from there we shouldn't encounter another location decoration somewhere
// further down the struct chain. This also prevents us from using the same location for every
// element in an array, since we have the same set of decorations on the array as on the members
const bool hasLocation = (curDecorations.flags & Decorations::HasLocation) != 0 || locationUniform;
if(hasLocation && offsetOrLocation == ~0U)
offsetOrLocation = curDecorations.location;
uint32_t numLocations = 0;
switch(type.type)
{
case DataType::ScalarType:
{
if(outVar)
{
outVar->type = type.scalar().Type();
outVar->rows = 1;
outVar->columns = 1;
}
numLocations = 1;
break;
}
case DataType::VectorType:
{
if(outVar)
{
outVar->type = type.scalar().Type();
outVar->rows = 1U;
outVar->columns = RDCMAX(1U, type.vector().count) & 0xff;
}
numLocations = 1U;
break;
}
case DataType::MatrixType:
{
if(outVar)
{
outVar->type = type.scalar().Type();
outVar->columns = RDCMAX(1U, type.matrix().count) & 0xff;
outVar->rows = RDCMAX(1U, type.vector().count) & 0xff;
}
numLocations = var.rows;
break;
}
case DataType::StructType:
{
for(int32_t i = 0; i < type.children.count(); i++)
{
if(outVar)
{
outVar->members.push_back(ShaderVariable());
if(!type.children[i].name.empty())
outVar->members.back().name = type.children[i].name;
else
outVar->members.back().name = StringFormat::Fmt("_child%d", i);
}
rdcstr childAccess = accessSuffix + "." + var.members.back().name;
const Decorations &childDecorations = type.children[i].decorations;
uint64_t childOffsetOrLocation = offsetOrLocation;
// if the struct is concrete, it must have an offset. Otherwise it's opaque and we're using
// locations
if(hasLocation)
childOffsetOrLocation += numLocations;
else if(childDecorations.flags & Decorations::HasOffset)
childOffsetOrLocation += childDecorations.offset;
uint32_t childLocations = WalkVariable<ShaderVarType, allocate>(
childDecorations, dataTypes[type.children[i].type], childOffsetOrLocation, hasLocation,
var.members[i], childAccess, callback);
numLocations += childLocations;
}
break;
}
case DataType::ArrayType:
{
// array stride is decorated on the type, not the member itself
const Decorations &typeDecorations = decorations[type.id];
uint32_t childOffset = 0;
uint32_t len = uintComp(GetActiveLane().ids[type.length], 0);
for(uint32_t i = 0; i < len; i++)
{
if(outVar)
{
outVar->members.push_back(ShaderVariable());
outVar->members.back().name = StringFormat::Fmt("[%u]", i);
}
rdcstr childAccess = accessSuffix + var.members[i].name;
uint32_t childLocations = WalkVariable<ShaderVarType, allocate>(
curDecorations, dataTypes[type.InnerType()], offsetOrLocation + childOffset,
hasLocation, var.members[i], childAccess, callback);
numLocations += childLocations;
// as above - either the type is concrete and has an array stride, or else we're using
// locations
if(hasLocation)
childOffset = numLocations;
else if(typeDecorations.flags & Decorations::HasArrayStride)
childOffset += decorations[type.id].arrayStride;
}
break;
}
case DataType::PointerType:
{
RDCASSERT(dataTypes[type.id].pointerType.storage == StorageClass::PhysicalStorageBuffer);
if(outVar)
{
outVar->type = VarType::GPUPointer;
outVar->rows = 1;
outVar->columns = 1;
}
numLocations = 1;
break;
}
case DataType::ImageType:
case DataType::SamplerType:
case DataType::SampledImageType:
case DataType::RayQueryType:
case DataType::AccelerationStructureType:
case DataType::UnknownType:
{
RDCERR("Unexpected variable type %d", type.type);
return numLocations;
}
}
if(callback)
callback(var, curDecorations, type, offsetOrLocation, accessSuffix);
// for auto-assigning locations, we return the number of locations
return numLocations;
}
bool Debugger::IsDebugExtInstSet(Id id) const
{
return knownExtSet[ExtSet_ShaderDbg] == id;
}
bool Debugger::InDebugScope(uint32_t inst) const
{
return m_DebugInfo.lineScope.find(instructionOffsets[inst]) != m_DebugInfo.lineScope.end();
}
const ScopeData *Debugger::GetScope(size_t offset) const
{
auto it = m_DebugInfo.lineScope.find(offset);
if(it == m_DebugInfo.lineScope.end())
return NULL;
return it->second;
}
void Debugger::PreParse(uint32_t maxId)
{
Processor::PreParse(maxId);
strings.resize(idTypes.size());
idLiveRange.resize(idTypes.size());
m_InstInfo.reserve(idTypes.size());
}
void Debugger::PostParse()
{
Processor::PostParse();
for(std::function<void()> &f : m_DebugInfo.deferredMembers)
f();
m_DebugInfo.deferredMembers.clear();
// declare pointerTypes for all declared physical pointer types. This will match the reflection
for(auto it = dataTypes.begin(); it != dataTypes.end(); ++it)
{
if(it->second.type == DataType::PointerType &&
it->second.pointerType.storage == rdcspv::StorageClass::PhysicalStorageBuffer)
{
idToPointerType.insert(std::make_pair(it->second.InnerType(), (uint16_t)idToPointerType.size()));
}
}
pointerTypeToId.resize(idToPointerType.size());
for(auto it = idToPointerType.begin(); it != idToPointerType.end(); ++it)
pointerTypeToId[it->second] = it->first;
for(const MemberName &mem : memberNames)
dataTypes[mem.id].children[mem.member].name = mem.name;
// global IDs never hit a death point
for(const Variable &v : globals)
idLiveRange[v.id].second = ~0U;
if(m_DebugInfo.valid)
{
for(auto it = m_DebugInfo.scopes.begin(); it != m_DebugInfo.scopes.end(); ++it)
{
ScopeData *scope = &it->second;
// keep every ID referenced by a local alive until the scope ends. We do this even if a source
// variable maps to multiple debug variables and technically the earlier ones could be left to
// die when superseded by the later ones. This is simple and only means a little bloating of
// debug variables in the UI (which generally won't be viewed directly anyway)
for(LocalMapping &m : scope->localMappings)
{
Id id = m.debugVar;
if(id == Id())
continue;
idLiveRange[id].second = RDCMAX(scope->end + 1, idLiveRange[id].second);
}
// every scope's parent lasts at least as long as it
while(scope->parent)
{
scope->parent->end = RDCMAX(scope->parent->end, scope->end);
scope = scope->parent;
}
}
}
memberNames.clear();
}
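// Fill in one member of a composite debug type. This assumes the NonSemantic.Shader.DebugInfo.100
// operand layout for DebugTypeMember (Name, Type, Source, Line, Column, Offset, Size, Flags),
// where the byte offset is operand 5. DebugFunction members contribute only a name and type, and
// inheritance entries are given a placeholder name.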
void Debugger::SetDebugTypeMember(const OpShaderDbg &member, TypeData &resultType, size_t memberIndex)
{
rdcstr memberName;
Id memberType;
uint32_t memberOffset = 0;
switch(member.inst)
{
case ShaderDbg::TypeMember:
memberName = strings[member.arg<Id>(0)];
memberType = member.arg<Id>(1);
memberOffset = EvaluateConstant(member.arg<Id>(5), {}).value.u32v[0];
break;
case ShaderDbg::Function:
memberName = strings[member.arg<Id>(0)];
memberType = member.arg<Id>(1);
break;
case ShaderDbg::TypeInheritance: memberName = "Inheritance"; break;
default: RDCERR("Unhandled DebugTypeComposite entry %u", member.inst);
}
resultType.structMembers[memberIndex] = {memberName, memberType};
resultType.memberOffsets[memberIndex] = memberOffset;
}
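// Register a single parsed instruction. Among other things this maintains each id's live range
// as [first write offset, last use offset + 1] so the simulation can retire dead ids. As an
// illustrative example, if %5 is written at offset 100 and last read at offset 140, its live
// range becomes {100, 141} and the watch list drops it on the first step past offset 140.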
void Debugger::RegisterOp(Iter it)
{
Processor::RegisterOp(it);
OpDecoder opdata(it);
// we add +1 so that we don't remove the ID on its last use, but on the next instruction. Since
// blocks always end with a terminator that doesn't consume IDs we're interested in (variables),
// we'll always have one extra instruction to step to
OpDecoder::ForEachID(it, [this, &it](Id id, bool result) {
if(result)
idLiveRange[id].first = it.offs();
idLiveRange[id].second = RDCMAX(it.offs() + 1, idLiveRange[id].second);
});
bool leaveScope = false;
bool executable = curFunction != NULL;
const uint32_t curInstIndex = (uint32_t)instructionOffsets.size();
if(opdata.op == Op::ExtInst || opdata.op == Op::ExtInstWithForwardRefsKHR)
{
OpExtInst extinst(it);
if(knownExtSet[ExtSet_GLSL450] == extinst.set)
{
// all parameters to GLSL.std.450 are Ids, so extend their live ranges appropriately
for(const uint32_t param : extinst.params)
{
Id id = Id::fromWord(param);
idLiveRange[id].second = RDCMAX(it.offs() + 1, idLiveRange[id].second);
}
}
else if(knownExtSet[ExtSet_Printf] == extinst.set)
{
// all parameters to NonSemantic.DebugPrintf are Ids, so extend their live ranges appropriately
for(const uint32_t param : extinst.params)
{
Id id = Id::fromWord(param);
idLiveRange[id].second = RDCMAX(it.offs() + 1, idLiveRange[id].second);
}
}
else if(knownExtSet[ExtSet_ShaderDbg] == extinst.set)
{
// the types are identical just with different accessors
OpShaderDbg &dbg = (OpShaderDbg &)extinst;
if(dbg.inst != ShaderDbg::Value)
executable = false;
switch(dbg.inst)
{
case ShaderDbg::Source:
{
int32_t fileIndex = (int32_t)m_DebugInfo.sources.size();
m_DebugInfo.sources[dbg.result] = fileIndex;
m_DebugInfo.filenames[dbg.result] = strings[dbg.arg<Id>(0)];
break;
}
case ShaderDbg::CompilationUnit:
{
m_DebugInfo.scopes[dbg.result] = {
DebugScope::CompilationUnit,
NULL,
1,
1,
m_DebugInfo.sources[dbg.arg<Id>(2)],
0,
m_DebugInfo.filenames[dbg.arg<Id>(2)],
};
break;
}
case ShaderDbg::FunctionDefinition:
{
m_DebugInfo.funcToDebugFunc[dbg.arg<Id>(1)] = dbg.arg<Id>(0);
break;
}
case ShaderDbg::Function:
{
rdcstr name = strings[dbg.arg<Id>(0)];
// ignore arg 1 type
// don't use arg 2 source - assume the parent is in the same file so it's redundant
uint32_t line = EvaluateConstant(dbg.arg<Id>(3), {}).value.u32v[0];
uint32_t column = EvaluateConstant(dbg.arg<Id>(4), {}).value.u32v[0];
ScopeData *parent = &m_DebugInfo.scopes[dbg.arg<Id>(5)];
// ignore arg 6 linkage name
// ignore arg 7 flags
// ignore arg 8 scope line
// ignore arg 9 (optional) declaration
m_DebugInfo.scopes[dbg.result] = {
DebugScope::Function, parent, line, column, parent->fileIndex, 0, name,
};
break;
}
case ShaderDbg::TypeBasic:
{
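// these encoding values follow the DWARF-style attribute encodings used by
// NonSemantic.Shader.DebugInfo.100: 2 = Boolean, 3 = Float, 4 = Signed, 5 = SignedChar,
// 6 = Unsigned, 7 = UnsignedChar. The size operand is specified in bits.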
uint32_t bitSize = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
uint32_t encoding = EvaluateConstant(dbg.arg<Id>(2), {}).value.u32v[0];
switch(encoding)
{
case 2: m_DebugInfo.types[dbg.result].type = VarType::Bool; break;
case 3:
if(bitSize == 64)
m_DebugInfo.types[dbg.result].type = VarType::Double;
else if(bitSize == 32)
m_DebugInfo.types[dbg.result].type = VarType::Float;
else if(bitSize == 16)
m_DebugInfo.types[dbg.result].type = VarType::Half;
break;
case 4:
if(bitSize == 64)
m_DebugInfo.types[dbg.result].type = VarType::SLong;
else if(bitSize == 32)
m_DebugInfo.types[dbg.result].type = VarType::SInt;
else if(bitSize == 16)
m_DebugInfo.types[dbg.result].type = VarType::SShort;
else if(bitSize == 8)
m_DebugInfo.types[dbg.result].type = VarType::SByte;
break;
case 5: m_DebugInfo.types[dbg.result].type = VarType::SByte; break;
case 6:
if(bitSize == 64)
m_DebugInfo.types[dbg.result].type = VarType::ULong;
else if(bitSize == 32)
m_DebugInfo.types[dbg.result].type = VarType::UInt;
else if(bitSize == 16)
m_DebugInfo.types[dbg.result].type = VarType::UShort;
else if(bitSize == 8)
m_DebugInfo.types[dbg.result].type = VarType::UByte;
break;
case 7: m_DebugInfo.types[dbg.result].type = VarType::UByte; break;
}
break;
}
case ShaderDbg::TypePointer:
{
m_DebugInfo.types[dbg.result].baseType = dbg.arg<Id>(0);
m_DebugInfo.types[dbg.result].type = VarType::GPUPointer;
break;
}
case ShaderDbg::TypeVector:
{
m_DebugInfo.types[dbg.result].baseType = dbg.arg<Id>(0);
m_DebugInfo.types[dbg.result].vecSize = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
break;
}
case ShaderDbg::TypeMatrix:
{
m_DebugInfo.types[dbg.result].baseType = dbg.arg<Id>(0);
m_DebugInfo.types[dbg.result].matSize = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
m_DebugInfo.types[dbg.result].colMajorMat =
EvaluateConstant(dbg.arg<Id>(2), {}).value.u32v[0] != 0;
break;
}
case ShaderDbg::TypeArray:
{
m_DebugInfo.types[dbg.result].baseType = dbg.arg<Id>(0);
size_t countDims = dbg.params.size();
m_DebugInfo.types[dbg.result].arrayDimensions.resize(countDims - 1);
for(uint32_t i = 1; i < countDims; ++i)
{
size_t idx = i - 1;
m_DebugInfo.types[dbg.result].arrayDimensions[idx] =
EvaluateConstant(dbg.arg<Id>(i), {}).value.u32v[0];
}
break;
}
case ShaderDbg::TypeFunction:
{
m_DebugInfo.types[dbg.result].type = VarType::Unknown;
m_DebugInfo.types[dbg.result].baseType = Id();
m_DebugInfo.types[dbg.result].matSize = 0;
m_DebugInfo.types[dbg.result].vecSize = 0;
break;
}
case ShaderDbg::TypeTemplate:
{
m_DebugInfo.types[dbg.result] = m_DebugInfo.types[dbg.arg<Id>(0)];
break;
}
case ShaderDbg::TypeMember:
{
// nothing to do here - members are picked up by SetDebugTypeMember when the enclosing
// DebugTypeComposite is processed
break;
}
case ShaderDbg::TypeComposite:
{
rdcstr name = strings[dbg.arg<Id>(0)];
uint32_t tag = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
const rdcstr tagString[3] = {
"class ",
"struct ",
"union ",
};
// don't use arg 2 source - assume the parent is in the same file so it's redundant
uint32_t line = EvaluateConstant(dbg.arg<Id>(3), {}).value.u32v[0];
uint32_t column = EvaluateConstant(dbg.arg<Id>(4), {}).value.u32v[0];
ScopeData *parent = &m_DebugInfo.scopes[dbg.arg<Id>(5)];
// ignore arg 6 linkage name
// ignore arg 7 size
// ignore arg 8 flags
TypeData &resultType = m_DebugInfo.types[dbg.result];
for(uint32_t i = 9; i < dbg.params.size(); i++)
{
resultType.structMembers.push_back({});
resultType.memberOffsets.push_back(0);
size_t memberIndex = resultType.structMembers.size() - 1;
Id memberId = dbg.arg<Id>(i);
ConstIter memberIt = GetID(memberId);
if(!memberIt)
{
m_DebugInfo.deferredMembers.push_back(
[this, resultId = dbg.result, memberIndex, memberId]() {
SetDebugTypeMember(OpShaderDbg(GetID(memberId)), m_DebugInfo.types[resultId],
memberIndex);
});
continue;
}
SetDebugTypeMember(OpShaderDbg(memberIt), resultType, memberIndex);
}
name = tagString[tag % 3] + name;
m_DebugInfo.scopes[dbg.result] = {
DebugScope::Composite, parent, line, column, parent->fileIndex, 0, name,
};
break;
}
case ShaderDbg::LexicalBlock:
{
// don't use arg 0 source - assume the parent is in the same file so it's redundant
uint32_t line = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
uint32_t column = EvaluateConstant(dbg.arg<Id>(2), {}).value.u32v[0];
ScopeData *parent = &m_DebugInfo.scopes[dbg.arg<Id>(3)];
rdcstr name;
if(dbg.params.count() >= 5)
{
name = strings[dbg.arg<Id>(4)];
if(name.isEmpty())
name = "anonymous_scope";
}
else
{
name = parent->name + ":" + ToStr(line);
}
m_DebugInfo.scopes[dbg.result] = {
DebugScope::Block, parent, line, column, parent->fileIndex, 0, name,
};
break;
}
case ShaderDbg::Scope:
{
if(m_DebugInfo.curScope)
m_DebugInfo.curScope->end = it.offs();
m_DebugInfo.curScope = &m_DebugInfo.scopes[dbg.arg<Id>(0)];
// pick up any pending mappings for this function if we just entered into a new function.
// See the comment below in Value for this workaround
for(size_t i = 0; i < m_DebugInfo.pendingMappings.size();)
{
rdcpair<const ScopeData *, LocalMapping> &cur = m_DebugInfo.pendingMappings[i];
if(m_DebugInfo.curScope->HasAncestor(cur.first))
{
m_DebugInfo.curScope->localMappings.push_back(std::move(cur.second));
// the array isn't sorted so we can just swap the last one into this spot to avoid
// moving everything. Don't advance i - we need to re-check the swapped-in element
std::swap(cur, m_DebugInfo.pendingMappings.back());
m_DebugInfo.pendingMappings.pop_back();
continue;
}
i++;
}
if(dbg.params.size() >= 2)
m_DebugInfo.curInline = &m_DebugInfo.inlined[dbg.arg<Id>(1)];
else
m_DebugInfo.curInline = NULL;
break;
}
case ShaderDbg::NoScope:
{
// don't want to set curScope to NULL until after this instruction. That way flood-fill of
// scopes in PostParse() can find this instruction in a scope.
leaveScope = true;
break;
}
case ShaderDbg::GlobalVariable:
{
// copy the name string to the variable string only if it's empty. If it has a name already, we
// prefer that. If the variable is DebugInfoNone then we don't care about its name.
if(strings[dbg.arg<Id>(7)].empty())
strings[dbg.arg<Id>(7)] = strings[dbg.arg<Id>(0)];
OpVariable var(GetID(dbg.arg<Id>(7)));
if(var.storageClass == StorageClass::Private ||
var.storageClass == StorageClass::Workgroup || var.storageClass == StorageClass::Output)
{
m_DebugInfo.globals.push_back(var.result);
}
break;
}
case ShaderDbg::LocalVariable:
{
m_DebugInfo.locals[dbg.result] = {
strings[dbg.arg<Id>(0)],
&m_DebugInfo.scopes[dbg.arg<Id>(5)],
&m_DebugInfo.types[dbg.arg<Id>(1)],
};
m_DebugInfo.scopes[dbg.arg<Id>(5)].locals.push_back(dbg.result);
break;
}
case ShaderDbg::Declare:
case ShaderDbg::Value:
{
Id sourceVarId = dbg.arg<Id>(0);
Id debugVarId = dbg.arg<Id>(1);
// check the function this variable is scoped inside of at declaration time.
const ScopeData *varDeclScope = m_DebugInfo.locals[sourceVarId].scope;
// bit of a hack - only process declares/values for variables inside a scope that is
// within that function. If we see a declare/value in another function we defer it hoping
// that we will encounter a scope later that's valid for it.
const bool insideValidScope = m_DebugInfo.curScope->HasAncestor(varDeclScope);
LocalMapping mapping = {curInstIndex, sourceVarId, debugVarId,
dbg.inst == ShaderDbg::Declare};
if(constants.find(debugVarId) != constants.end() &&
!m_DebugInfo.constants.contains(debugVarId))
m_DebugInfo.constants.push_back(debugVarId);
mapping.indexes.resize(dbg.params.size() - 3);
for(uint32_t i = 0; i < mapping.indexes.size(); i++)
mapping.indexes[i] = EvaluateConstant(dbg.arg<Id>(i + 3), {}).value.u32v[0];
{
// don't support expressions, only allow for a single 'deref' which is used for
// variables to 'deref' into the pointed value
OpShaderDbg expr(GetID(dbg.arg<Id>(2)));
for(uint32_t i = 0; i < expr.params.size(); i++)
{
OpShaderDbg op(GetID(expr.arg<Id>(i)));
if(op.params.size() > 1 || EvaluateConstant(op.arg<Id>(0), {}).value.u32v[0] != 0)
{
RDCERR("Only deref expressions supported");
}
}
}
if(insideValidScope)
{
m_DebugInfo.curScope->localMappings.push_back(mapping);
}
else
{
// remove any pending mapping that this one supersedes - we have no meaningful way to know
// which one to use, so we keep the later one.
m_DebugInfo.pendingMappings.removeIf(
[&mapping](const rdcpair<const ScopeData *, LocalMapping> &m) {
return mapping.isSourceSupersetOf(m.second);
});
m_DebugInfo.pendingMappings.push_back({varDeclScope, mapping});
}
break;
}
case ShaderDbg::InlinedAt:
{
// ignore arg 0 the line number
ScopeData *scope = &m_DebugInfo.scopes[dbg.arg<Id>(1)];
if(dbg.params.count() >= 3)
m_DebugInfo.inlined[dbg.result] = {scope, &m_DebugInfo.inlined[dbg.arg<Id>(2)]};
else
m_DebugInfo.inlined[dbg.result] = {scope, NULL};
break;
}
case ShaderDbg::InlinedVariable:
{
// TODO handle inlined variables
break;
}
case ShaderDbg::Line:
{
m_CurLineCol.lineStart = EvaluateConstant(dbg.arg<Id>(1), {}).value.u32v[0];
m_CurLineCol.lineEnd = EvaluateConstant(dbg.arg<Id>(2), {}).value.u32v[0];
if(Shader_Debug_SPIRVUseDebugColumnInformation())
{
m_CurLineCol.colStart = EvaluateConstant(dbg.arg<Id>(3), {}).value.u32v[0];
m_CurLineCol.colEnd = EvaluateConstant(dbg.arg<Id>(4), {}).value.u32v[0];
}
// look up the file index registered for this DebugSource id
m_CurLineCol.fileIndex = m_DebugInfo.sources[dbg.arg<Id>(0)];
break;
}
case ShaderDbg::NoLine:
{
m_CurLineCol = LineColumnInfo();
break;
}
default: break;
}
}
}
else if(opdata.op == Op::ExtInstImport)
{
OpExtInstImport extimport(it);
if(extimport.result == knownExtSet[ExtSet_ShaderDbg])
{
m_DebugInfo.valid = true;
}
}
else if((opdata.op == Op::AccessChain) || (opdata.op == Op::InBoundsAccessChain))
{
OpAccessChain chain(it);
// Base pointers never retire
idLiveRange[chain.base].second = ~0U;
}
else if((opdata.op == Op::PtrAccessChain) || (opdata.op == Op::InBoundsPtrAccessChain))
{
OpPtrAccessChain chain(it);
// Base pointers never retire
idLiveRange[chain.base].second = ~0U;
}
if(opdata.op == Op::Source)
{
OpSource source(it);
if(!source.source.empty())
{
m_Files[source.file] = m_Files.size();
}
}
else if(opdata.op == Op::Line)
{
OpLine line(it);
if(m_DebugInfo.valid)
{
// ignore any OpLine when we have proper debug info
}
else
{
m_CurLineCol.lineStart = line.line;
m_CurLineCol.lineEnd = line.line;
m_CurLineCol.colStart = line.column;
m_CurLineCol.fileIndex = (int32_t)m_Files[line.file];
}
}
else if(opdata.op == Op::NoLine)
{
if(!m_DebugInfo.valid)
m_CurLineCol = LineColumnInfo();
}
else if(executable)
{
// for debug info, only apply line info if we're in a scope - otherwise the line info may not
// apply to this instruction. This means OpPhis will never be line-mapped
if(m_DebugInfo.valid)
{
if(m_DebugInfo.curScope)
m_InstInfo.push_back({curInstIndex, m_CurLineCol});
else
m_InstInfo.push_back({curInstIndex, LineColumnInfo()});
}
else
{
m_InstInfo.push_back({curInstIndex, m_CurLineCol});
}
}
if(m_DebugInfo.valid)
{
m_DebugInfo.lineScope[it.offs()] = m_DebugInfo.curScope;
m_DebugInfo.lineInline[it.offs()] = m_DebugInfo.curInline;
}
// if we're explicitly leaving the scope because of a DebugNoScope, or if we're leaving due to the
// end of a block then set scope to NULL now.
if(leaveScope || it.opcode() == Op::Kill || it.opcode() == Op::Unreachable ||
it.opcode() == Op::Branch || it.opcode() == Op::BranchConditional ||
it.opcode() == Op::Switch || it.opcode() == Op::Return || it.opcode() == Op::ReturnValue)
{
if(m_DebugInfo.curScope)
m_DebugInfo.curScope->end = it.offs();
m_DebugInfo.curScope = NULL;
m_DebugInfo.curInline = NULL;
}
if(opdata.op == Op::String)
{
OpString string(it);
strings[string.result] = string.string;
}
else if(opdata.op == Op::Name)
{
OpName name(it);
// technically you could name a string - in that case we ignore the name
if(strings[name.target].empty())
strings[name.target] = name.name;
}
else if(opdata.op == Op::MemberName)
{
OpMemberName memberName(it);
memberNames.push_back({memberName.type, memberName.member, memberName.name});
}
else if(opdata.op == Op::EntryPoint)
{
OpEntryPoint entryPoint(it);
entryLookup[ShaderEntryPoint(entryPoint.name, MakeShaderStage(entryPoint.executionModel))] =
entryPoint.entryPoint;
}
else if(opdata.op == Op::Function)
{
OpFunction func(it);
curFunction = &functions[func.result];
curFunction->begin = it.offs();
}
else if(opdata.op == Op::FunctionParameter)
{
OpFunctionParameter param(it);
curFunction->parameters.push_back(param.result);
}
else if(opdata.op == Op::Variable)
{
OpVariable var(it);
if(var.storageClass == StorageClass::Function && curFunction)
curFunction->variables.push_back(var.result);
// variables are always pointers
Id varType = dataTypes[var.resultType].InnerType();
// if we don't have a name for this variable but it's a pointer to a struct that is named then
// give the variable a name based on the type. This is a common pattern in GLSL for global
// blocks, and since the variable is how we access commonly we should give it a recognisable
// name.
//
// Don't do this if we have debug info, rely on it purely to give us the right data
if(strings[var.result].empty() && dataTypes[varType].type == DataType::StructType &&
!strings[varType].empty() && !m_DebugInfo.valid)
{
strings[var.result] = strings[varType] + "_var";
}
}
else if(opdata.op == Op::Label)
{
OpLabel lab(it);
labelInstruction[lab.result] = instructionOffsets.count();
}
// everything else inside a function becomes an instruction, including the OpFunction and
// OpFunctionEnd. We won't actually execute these instructions
instructionOffsets.push_back(it.offs());
if(opdata.op == Op::FunctionEnd)
{
// allow function parameters and variables to live indefinitely
for(const Id &id : curFunction->parameters)
idLiveRange[id].second = ~0U;
for(const Id &id : curFunction->variables)
idLiveRange[id].second = ~0U;
curFunction = NULL;
}
}
// Can be called from any thread
void Debugger::QueueGpuMathOp(uint32_t lane)
{
ThreadState &thread = workgroup[lane];
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
SPIRV_DEBUG_RDCASSERT(!queuedGpuMathOps[lane]);
queuedGpuMathOps[lane] = true;
}
// Can be called from any thread
void Debugger::QueueGpuSampleGatherOp(uint32_t lane)
{
ThreadState &thread = workgroup[lane];
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
SPIRV_DEBUG_RDCASSERT(!queuedGpuSampleGatherOps[lane]);
queuedGpuSampleGatherOps[lane] = true;
}
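// Illustrative flow for the queued GPU op machinery: a lane that reaches an op we can't simulate
// on the CPU (e.g. a texture sample) flags it via QueueGpuMathOp/QueueGpuSampleGatherOp and
// stops stepping. The debugger thread then submits those ops to the API in batches
// (ProcessQueuedGpuMathOps/ProcessQueuedGpuSampleGatherOps), fetches the results
// (SyncPendingGpuOps), and SyncPendingLanes finally marks each waiting lane's result as ready
// and re-queues its simulation job.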
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedOps()
{
CHECK_DEBUGGER_THREAD();
ProcessQueuedGpuMathOps();
ProcessQueuedGpuSampleGatherOps();
SyncPendingGpuOps();
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::SyncPendingLanes()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < pendingLanes.size(); ++lane)
{
if(pendingLanes[lane])
{
pendingLanes[lane] = false;
ThreadState &thread = workgroup[lane];
thread.SetPendingResultReady();
QueueJob(lane);
}
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedGpuMathOps()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < queuedGpuMathOps.size(); ++lane)
{
if(queuedGpuMathOps[lane])
{
if(!apiWrapper->QueuedOpsHasSpace())
SyncPendingGpuOps();
queuedGpuMathOps[lane] = false;
const GpuMathOperation &mathOp = workgroup[lane].GetQueuedGpuMathOp();
uint32_t workgroupIndex = mathOp.workgroupIndex;
if(apiWrapper->QueueCalculateMathOp(mathOp.op, mathOp.paramVars))
{
pendingGpuMathsOpsResults.push_back(mathOp.result);
}
else
{
ShaderVariable &result = *mathOp.result;
memset(&result.value, 0, sizeof(result.value));
}
SPIRV_DEBUG_RDCASSERT(!pendingLanes[workgroupIndex]);
pendingLanes[workgroupIndex] = true;
}
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedGpuSampleGatherOps()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < queuedGpuSampleGatherOps.size(); ++lane)
{
if(queuedGpuSampleGatherOps[lane])
{
if(!apiWrapper->QueuedOpsHasSpace())
SyncPendingGpuOps();
queuedGpuSampleGatherOps[lane] = false;
const GpuSampleGatherOperation &sampleGatherOp = workgroup[lane].GetQueuedGpuSampleGatherOp();
uint32_t workgroupIndex = sampleGatherOp.workgroupIndex;
ThreadState &thread = workgroup[workgroupIndex];
ShaderVariable &result = *sampleGatherOp.result;
bool hasResult = false;
if(!(apiWrapper->QueueSampleGather(
thread, sampleGatherOp.opcode, sampleGatherOp.texType, sampleGatherOp.imageBind,
sampleGatherOp.samplerBind, sampleGatherOp.uv, sampleGatherOp.ddxCalc,
sampleGatherOp.ddyCalc, sampleGatherOp.compare, sampleGatherOp.gatherChannel,
sampleGatherOp.operands, result, hasResult)))
{
// sample failed. Pretend we got 0 columns back
set0001(result);
hasResult = true;
}
if(!hasResult)
pendingGpuSampleGatherOpsResults.push_back(sampleGatherOp.result);
SPIRV_DEBUG_RDCASSERT(!pendingLanes[workgroupIndex]);
pendingLanes[workgroupIndex] = true;
}
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::SyncPendingGpuOps()
{
CHECK_DEBUGGER_THREAD();
if(pendingGpuMathsOpsResults.empty() && pendingGpuSampleGatherOpsResults.empty())
return;
if(!(apiWrapper->GetQueuedResults(pendingGpuMathsOpsResults, pendingGpuSampleGatherOpsResults)))
{
RDCERR("GetQueuedResults failed");
return;
}
pendingGpuMathsOpsResults.clear();
pendingGpuSampleGatherOpsResults.clear();
}
// Must be called from the replay manager thread (the debugger thread)
DebugAPIWrapper *Debugger::GetAPIWrapper() const
{
CHECK_DEBUGGER_THREAD();
return apiWrapper;
}
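// Worker loop for multi-threaded simulation when individual job system jobs aren't used: each
// helper repeatedly scans the workgroup and claims queued lanes with an atomic compare-exchange
// (queuedJobs[lane] 1 -> 0), which guarantees a lane is only ever stepped by one helper at a time.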
void Debugger::SimulationJobHelper()
{
while(AtomicLoad(&atomic_simulationFinished) == 0)
{
for(uint32_t lane = 0; lane < workgroup.size(); ++lane)
{
if(Atomic::CmpExch32(&queuedJobs[lane], 1, 0) == 1)
{
StepThread(lane, StepThreadMode::RUN_MULTIPLE_STEPS);
}
}
}
}
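// Step a single lane. Broadly, the StepThreadMode values trade latency for batching:
// RUN_SINGLE_STEP takes one step (the single-threaded path), RUN_MULTIPLE_STEPS keeps stepping
// until the lane needs an external result, and QUEUE_MULTIPLE_STEPS takes one step then
// re-queues the lane as a job so other lanes can interleave.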
// Called from any thread
void Debugger::StepThread(uint32_t lane, StepThreadMode stepMode)
{
ThreadState &thread = workgroup[lane];
bool isActiveThread = lane == activeLaneIndex;
bool simulateStep = true;
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
int curActiveSteps = isActiveThread ? steps : 0;
while(simulateStep)
{
simulateStep = false;
{
thread.ClearPendingDebugState();
if(isActiveThread)
activeDebugState.stepIndex = curActiveSteps;
InternalStepThread(lane);
thread.ClearPendingDebugState();
}
if(thread.StepNeedsGpuSampleGatherOp())
break;
else if(thread.StepNeedsGpuMathOp())
break;
else if(thread.StepNeedsDeviceThread())
break;
if(isActiveThread)
curActiveSteps++;
if(stepMode == StepThreadMode::RUN_SINGLE_STEP)
break;
simulateStep = thread.CanRunAnotherStep();
if(simulateStep)
{
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
thread.SetStepQueued();
}
if(stepMode == StepThreadMode::QUEUE_MULTIPLE_STEPS)
break;
}
// Update the number of simulation steps
if(isActiveThread)
steps = curActiveSteps;
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
// The queueing has to be when the thread is not being simulated
if(thread.StepNeedsGpuSampleGatherOp())
{
SPIRV_DEBUG_RDCASSERT(!simulateStep);
QueueGpuSampleGatherOp(lane);
return;
}
if(thread.StepNeedsGpuMathOp())
{
SPIRV_DEBUG_RDCASSERT(!simulateStep);
QueueGpuMathOp(lane);
return;
}
if(thread.StepNeedsDeviceThread())
{
SPIRV_DEBUG_RDCASSERT(!simulateStep);
QueueDeviceThreadStep(lane);
return;
}
if(simulateStep)
{
SPIRV_DEBUG_RDCASSERTEQUAL(stepMode, StepThreadMode::QUEUE_MULTIPLE_STEPS);
QueueJob(lane);
return;
}
SPIRV_DEBUG_RDCASSERT(!thread.IsPendingResultPending());
thread.SetSimulationStepCompleted();
}
// Called from any thread
void Debugger::InternalStepThread(uint32_t lane)
{
ThreadState &thread = workgroup[lane];
if(lane == activeLaneIndex)
{
size_t instOffs = instructionOffsets[thread.nextInstruction];
// see if we're retiring any IDs at this state
if(retireIDs)
{
{
SPIRV_DEBUG_RDCASSERT(activeDebugState.callstack.empty());
SPIRV_DEBUG_RDCASSERT(activeDebugState.changes.empty());
SPIRV_DEBUG_RDCASSERT(activeDebugState.flags == ShaderEvents::NoEvent);
SPIRV_DEBUG_RDCASSERT(activeDebugState.nextInstruction == 0);
}
for(size_t l = 0; l < thread.live.size();)
{
Id id = thread.live[l];
if(idLiveRange[id].second < instOffs)
{
thread.live.erase(l);
ShaderVariableChange change;
DeviceOpResult opResult = GetPointerValue(thread.ids[id], change.before);
// The variable was live and written to, it should be cached
SPIRV_DEBUG_RDCASSERTEQUAL(opResult, DeviceOpResult::Succeeded);
activeDebugState.changes.push_back(change);
continue;
}
l++;
}
retireIDs = false;
}
uint32_t funcRet = ~0U;
size_t prevStackSize = thread.callstack.size();
if(!thread.callstack.empty())
funcRet = thread.callstack.back()->funcCallInstruction;
thread.StepNext(true, activeDebugState.stepIndex, workgroup);
if(thread.StepNeedsGpuSampleGatherOp())
return;
if(thread.StepNeedsGpuMathOp())
return;
if(thread.StepNeedsDeviceThread())
return;
if(!thread.IsPendingResultPending())
{
const ShaderDebugState &pendingDebugState = thread.GetPendingDebugState();
activeDebugState.nextInstruction = pendingDebugState.nextInstruction;
activeDebugState.flags = pendingDebugState.flags;
activeDebugState.changes.append(pendingDebugState.changes);
thread.ClearPendingDebugState();
if(thread.callstack.size() > prevStackSize)
instOffs = instructionOffsets[GetInstructionForFunction(thread.callstack.back()->function)];
else if(thread.callstack.size() < prevStackSize && funcRet != ~0U)
instOffs = instructionOffsets[funcRet];
FillCallstack(thread, activeDebugState);
if(m_DebugInfo.valid)
{
size_t endOffs = instructionOffsets[thread.nextInstruction - 1];
// append any inlined functions to the top of the stack
InlineData *inlined = m_DebugInfo.lineInline[endOffs];
size_t insertPoint = activeDebugState.callstack.size();
// start with the current scope, it refers to the *inlined* function
if(inlined)
{
const ScopeData *scope = GetScope(endOffs);
// find the function parent of the current scope
while(scope && scope->parent && scope->type == DebugScope::Block)
scope = scope->parent;
activeDebugState.callstack.insert(insertPoint, scope->name);
}
// if this instruction has no scope, don't give it a callstack
if(GetScope(endOffs) == NULL)
{
activeDebugState.callstack.clear();
}
// move to the next inline up on our inline stack. If we reach an actual function
// call, this parent will be NULL as there was no more inlining - the final scope will
// refer to the real function which is already on our stack
while(inlined && inlined->parent)
{
const ScopeData *scope = inlined->scope;
// find the function parent of the current scope
while(scope && scope->parent && scope->type == DebugScope::Block)
scope = scope->parent;
activeDebugState.callstack.insert(insertPoint, scope->name);
inlined = inlined->parent;
}
}
shaderChangesReturn->push_back(activeDebugState);
{
activeDebugState.callstack.clear();
activeDebugState.changes.clear();
activeDebugState.flags = ShaderEvents::NoEvent;
activeDebugState.stepIndex = 0;
activeDebugState.nextInstruction = 0;
retireIDs = true;
}
}
}
else
{
thread.StepNext(false, ~0U, workgroup);
if(thread.StepNeedsGpuSampleGatherOp())
return;
if(thread.StepNeedsGpuMathOp())
return;
if(thread.StepNeedsDeviceThread())
return;
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::QueueJob(uint32_t lane)
{
CHECK_DEBUGGER_THREAD();
ThreadState &thread = workgroup[lane];
thread.SetStepQueued();
if(mtSimulation)
{
if(Shader_Debug_UseJobSystemJobs())
{
Threading::JobSystem::AddJob(
[this, lane]() { StepThread(lane, StepThreadMode::RUN_MULTIPLE_STEPS); });
}
else
{
RDCASSERT(Atomic::CmpExch32(&queuedJobs[lane], 0, 1) == 0);
}
}
else
{
StepThread(lane, StepThreadMode::RUN_SINGLE_STEP);
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedDebugMessages()
{
rdcarray<DebugMessage> msgs;
{
SCOPED_LOCK(queuedDebugMessagesLock);
queuedDebugMessages.swap(msgs);
}
for(const DebugMessage &dbgMsg : msgs)
apiWrapper->AddDebugMessage(dbgMsg.cat, dbgMsg.sev, dbgMsg.src, dbgMsg.desc);
}
// Called from any thread
void Debugger::AddDebugMessage(MessageCategory c, MessageSeverity sv, MessageSource src, rdcstr d) const
{
SCOPED_LOCK(queuedDebugMessagesLock);
queuedDebugMessages.push_back({c, sv, src, d});
}
// Can be called from any thread
void Debugger::QueueDeviceThreadStep(uint32_t lane)
{
ThreadState &thread = workgroup[lane];
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
thread.SetStepQueued();
SPIRV_DEBUG_RDCASSERT(!queuedDeviceThreadSteps[lane]);
queuedDeviceThreadSteps[lane] = true;
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedDeviceThreadSteps()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < queuedDeviceThreadSteps.size(); ++lane)
{
if(queuedDeviceThreadSteps[lane])
{
queuedDeviceThreadSteps[lane] = false;
ThreadState &thread = workgroup[lane];
thread.SetPendingResultUnknown();
SPIRV_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
StepThread(lane, StepThreadMode::QUEUE_MULTIPLE_STEPS);
}
}
}
void Debugger::FillInputValue(ShaderVariable &var, ShaderBuiltin builtin, uint32_t threadIndex) const
{
apiWrapper->FillInputValue(var, builtin, threadIndex, 0, 0);
}
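// The resource accessors below share a common pattern: off the device thread they can only be
// serviced from cached data, so if the resource isn't cached they return NeedsDevice and the
// step is re-queued to run on the device thread (see QueueDeviceThreadStep).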
DeviceOpResult Debugger::ReadTexel(const ShaderBindIndex &imageBind, const ShaderVariable &coord,
uint32_t sample, ShaderVariable &output) const
{
if(!IsDeviceThread())
{
if(!apiWrapper->IsImageCached(imageBind))
return DeviceOpResult::NeedsDevice;
}
return apiWrapper->ReadTexel(imageBind, coord, sample, output);
}
DeviceOpResult Debugger::WriteTexel(const ShaderBindIndex &imageBind, const ShaderVariable &coord,
uint32_t sample, const ShaderVariable &input) const
{
if(!IsDeviceThread())
{
if(!apiWrapper->IsImageCached(imageBind))
return DeviceOpResult::NeedsDevice;
}
return apiWrapper->WriteTexel(imageBind, coord, sample, input);
}
DeviceOpResult Debugger::GetBufferLength(const ShaderBindIndex &bind, uint64_t &bufferLen) const
{
if(!IsDeviceThread())
{
if(!apiWrapper->IsImageCached(bind))
return DeviceOpResult::NeedsDevice;
}
bufferLen = apiWrapper->GetBufferLength(bind);
return DeviceOpResult::Succeeded;
}
}; // namespace rdcspv
#if ENABLED(ENABLE_UNIT_TESTS)
#include "catch/catch.hpp"
TEST_CASE("Check SPIRV Id naming", "[tostr]")
{
SECTION("Test GetRawName")
{
CHECK(rdcspv::GetRawName(rdcspv::Id::fromWord(1234)) == "_1234");
CHECK(rdcspv::GetRawName(rdcspv::Id::fromWord(12345)) == "_12345");
CHECK(rdcspv::GetRawName(rdcspv::Id::fromWord(999)) == "_999");
CHECK(rdcspv::GetRawName(rdcspv::Id::fromWord(0xffffffff)) == "_4294967295");
CHECK(rdcspv::GetRawName(rdcspv::Id()) == "_0");
};
SECTION("Test ParseRawName")
{
CHECK(rdcspv::ParseRawName("_1234") == rdcspv::Id::fromWord(1234));
CHECK(rdcspv::ParseRawName("_12345") == rdcspv::Id::fromWord(12345));
CHECK(rdcspv::ParseRawName("_999") == rdcspv::Id::fromWord(999));
CHECK(rdcspv::ParseRawName("_4294967295") == rdcspv::Id::fromWord(0xffffffff));
CHECK(rdcspv::ParseRawName("_0") == rdcspv::Id());
CHECK(rdcspv::ParseRawName("1234") == rdcspv::Id());
CHECK(rdcspv::ParseRawName("999") == rdcspv::Id());
CHECK(rdcspv::ParseRawName("1") == rdcspv::Id());
CHECK(rdcspv::ParseRawName("-1234") == rdcspv::Id());
CHECK(rdcspv::ParseRawName("asdf") == rdcspv::Id());
};
}
#endif