diff --git a/renderdoc/driver/shaders/dxbc/dxbc_debug.h b/renderdoc/driver/shaders/dxbc/dxbc_debug.h
index 820aed28f..5dfded91e 100644
--- a/renderdoc/driver/shaders/dxbc/dxbc_debug.h
+++ b/renderdoc/driver/shaders/dxbc/dxbc_debug.h
@@ -291,7 +291,7 @@ public:
 
   bool Finished() const;
 
-  void StepNext(ShaderDebugState *prevState, DebugAPIWrapper *apiWrapper,
+  void StepNext(ShaderDebugState *state, DebugAPIWrapper *apiWrapper,
                 const rdcarray<ThreadState> &prevWorkgroup);
 
 private:
diff --git a/renderdoc/driver/shaders/spirv/CMakeLists.txt b/renderdoc/driver/shaders/spirv/CMakeLists.txt
index 58a2e237f..fdd6d67d9 100644
--- a/renderdoc/driver/shaders/spirv/CMakeLists.txt
+++ b/renderdoc/driver/shaders/spirv/CMakeLists.txt
@@ -99,6 +99,7 @@ set(sources
     spirv_compile.cpp
     spirv_compile.h
     spirv_debug_setup.cpp
+    spirv_debug.cpp
     spirv_debug.h
     spirv_reflect.cpp
     spirv_reflect.h
diff --git a/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj b/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj
index 3c53f1c0a..c2d6cb0e4 100644
--- a/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj
+++ b/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj
@@ -156,6 +156,7 @@
       <PrecompiledHeaderFile>precompiled.h</PrecompiledHeaderFile>
       <ForcedIncludeFiles>precompiled.h</ForcedIncludeFiles>
     </ClCompile>
+    <ClCompile Include="spirv_debug.cpp" />
     <ClCompile Include="spirv_debug_setup.cpp" />
     <ClCompile Include="spirv_disassemble.cpp">
       <WarningLevel>Level4</WarningLevel>
diff --git a/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj.filters b/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj.filters
index d4fa2546a..a399a686e 100644
--- a/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj.filters
+++ b/renderdoc/driver/shaders/spirv/renderdoc_spirv.vcxproj.filters
@@ -144,6 +144,7 @@
     <ClCompile Include="glslang_compile.cpp" />
     <ClCompile Include="spirv_processor.cpp" />
     <ClCompile Include="spirv_debug_setup.cpp" />
+    <ClCompile Include="spirv_debug.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\..\3rdparty\glslang\OGLCompilersDLL\InitializeDll.h">
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.cpp b/renderdoc/driver/shaders/spirv/spirv_debug.cpp
new file mode 100644
index 000000000..67e30705f
--- /dev/null
+++ b/renderdoc/driver/shaders/spirv/spirv_debug.cpp
@@ -0,0 +1,193 @@
+/******************************************************************************
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Baldur Karlsson
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ ******************************************************************************/
+
+#include "spirv_debug.h"
+#include "common/formatting.h"
+#include "spirv_op_helpers.h"
+
+namespace rdcspv
+{
+ThreadState::ThreadState(int workgroupIdx, Debugger &debug, const GlobalState &globalState)
+    : global(globalState), debugger(debug)    // same order as the members are declared
+{
+  workgroupIndex = workgroupIdx;    // this thread's slot within its workgroup
+  nextInstruction = 0;
+  done = false;    // the callstack starts empty, so Finished() is still true until a frame exists
+}
+
+ThreadState::~ThreadState()    // the callstack owns its frames, so free them here
+{
+  for(size_t f = 0; f < callstack.size(); f++)
+    delete callstack[f];
+  callstack.clear();
+}
+
+bool ThreadState::Finished() const    // true when no frame is left or the thread is flagged done
+{
+  return callstack.empty() || done;
+}
+
+void ThreadState::FillCallstack(ShaderDebugState &state)    // appends one name per stack frame
+{
+  for(size_t f = 0; f < callstack.size(); f++)
+    state.callstack.push_back(debugger.GetHumanName(callstack[f]->function));
+}
+
+void ThreadState::EnterFunction(ShaderDebugState *state, const rdcarray<Id> &arguments)
+{
+  // nextInstruction must be at the OpFunction being entered. state is currently unused
+  Iter it = debugger.GetIterForInstruction(nextInstruction);
+  RDCASSERT(OpDecoder(it).op == Op::Function);
+
+  // push a new frame for the function - StepNext frees it on return, else the destructor does
+  OpFunction func(it);
+  StackFrame *frame = new StackFrame();
+  frame->function = func.result;
+  callstack.push_back(frame);
+  it++;
+
+  size_t arg = 0;
+  while(OpDecoder(it).op == Op::FunctionParameter)
+  {
+    OpFunctionParameter param(it);
+    // arg is an index into arguments, so it's only usable while strictly below the size
+    if(arg < arguments.size())
+    {
+      // TODO fill in function parameter
+    }
+    else
+    {
+      RDCERR("Not enough function parameters!");
+    }
+
+    arg++;
+    it++;
+  }
+
+  // next should be the start of the first function block
+  RDCASSERT(OpDecoder(it).op == Op::Label);
+  it++;
+
+  // handle any variable declarations
+  while(OpDecoder(it).op == Op::Variable)
+  {
+    OpVariable decl(it);
+
+    // TODO declare variable
+
+    it++;
+  }
+
+  // next instruction is the first actual instruction we'll execute
+  nextInstruction = debugger.GetInstructionForIter(it);
+}
+
+void ThreadState::StepNext(ShaderDebugState *state,
+                           const rdcarray<rdcarray<ShaderVariable>> &prevWorkgroup)
+{
+  // nextInstruction is advanced up-front; prevWorkgroup isn't read by any op handled so far
+  Iter it = debugger.GetIterForInstruction(nextInstruction);
+  nextInstruction++;
+  OpDecoder opdata(it);
+
+  // skip OpLine/OpNoLine
+  while(opdata.op == Op::Line || opdata.op == Op::NoLine)
+  {
+    it++;
+    nextInstruction++;
+    opdata = OpDecoder(it);
+  }
+
+  switch(opdata.op)
+  {
+    case Op::FunctionCall:
+    {
+      OpFunctionCall call(it);
+
+      // we hit this twice. The first time we don't have a return value so we jump into the
+      // function. The second time we do have a return value so we process it and continue
+      if(returnValue.name.empty())
+      {
+        uint32_t returnInstruction = nextInstruction - 1;    // index of this OpFunctionCall
+        nextInstruction = debugger.GetInstructionForFunction(call.function);
+
+        EnterFunction(state, call.arguments);
+
+        RDCASSERT(callstack.back()->function == call.function);
+        callstack.back()->funcCallInstruction = returnInstruction;
+      }
+      else
+      {
+        // process ret;
+        returnValue.name.clear();
+      }
+      break;
+    }
+
+    case Op::Return:
+    case Op::ReturnValue:
+    {
+      StackFrame *exitingFrame = callstack.back();
+      callstack.pop_back();
+
+      if(callstack.empty())
+      {
+        // if there's no callstack there's no return address, jump to the function end
+
+        it++;    // see what the next instruction is
+        // keep going until it's the end of the function
+
+        while(OpDecoder(it).op != Op::FunctionEnd)
+        {
+          nextInstruction++;
+          it++;
+        }
+      }
+      else
+      {
+        if(opdata.op == Op::ReturnValue)
+        {
+          OpReturnValue ret(it);
+          // TODO: returnValue = MakeValue(ret.value);
+        }
+        // name the return value for plain OpReturn too - the OpFunctionCall we resume at
+        // re-enters the function whenever the name is empty, which would never terminate
+        returnValue.name = "<return value>";
+        nextInstruction = exitingFrame->funcCallInstruction;
+      }
+
+      delete exitingFrame;
+
+      break;
+    }
+
+    default: RDCWARN("Unhandled SPIR-V operation %s", ToStr(opdata.op).c_str()); break;
+  }
+
+  // set the state's next instruction (if we have one) to ours, bounded by how many
+  // instructions there are
+  if(state)
+    state->nextInstruction = RDCMIN(nextInstruction, debugger.GetNumInstructions() - 1);
+}
+};    // namespace rdcspv
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.h b/renderdoc/driver/shaders/spirv/spirv_debug.h
index a8bc0169d..044a37039 100644
--- a/renderdoc/driver/shaders/spirv/spirv_debug.h
+++ b/renderdoc/driver/shaders/spirv/spirv_debug.h
@@ -51,15 +51,36 @@ public:
   rdcarray<ShaderVariable> constantBlocks;
 };
 
-class ThreadState
+struct StackFrame    // one entry in a thread's callstack, pushed each time a function is entered
 {
-public:
-  ThreadState(int workgroupIdx, GlobalState &globalState);
+  StackFrame() = default;
+  Id function;    // result Id of the OpFunction this frame belongs to
+  uint32_t funcCallInstruction = ~0U;    // index of the OpFunctionCall to resume at, ~0U until set
+
+private:
+  // disallow copying to ensure the locals we allocate never move around
+  StackFrame(const StackFrame &o) = delete;
+  StackFrame &operator=(const StackFrame &o) = delete;
+};
+
+class Debugger;
+
+struct ThreadState
+{
+  ThreadState(int workgroupIdx, Debugger &debug, const GlobalState &globalState);
+  ~ThreadState();
+
+  void EnterFunction(ShaderDebugState *state, const rdcarray<Id> &arguments);
+  void StepNext(ShaderDebugState *state, const rdcarray<rdcarray<ShaderVariable>> &prevWorkgroup);
+
+  void FillCallstack(ShaderDebugState &state);
+
+  bool Finished() const;
 
-  bool Finished() const { return done; }
   uint32_t nextInstruction;
 
-  GlobalState &global;
+  const GlobalState &global;
+  Debugger &debugger;
 
   // thread-local inputs/outputs. This array does not change over the course of debugging
   rdcarray<ShaderVariable> inputs, outputs;
@@ -75,10 +96,12 @@ public:
   // changes (and vice-versa - a change via any of those pointers must update all other pointers).
   SparseIdMap<rdcarray<Id>> pointersForId;
 
+  ShaderVariable returnValue;
+  rdcarray<StackFrame *> callstack;
+
   // the list of IDs that are currently valid and live
   rdcarray<Id> live;
 
-private:
   // index in the pixel quad
   int workgroupIndex;
   bool done;
@@ -97,12 +120,14 @@ public:
 
   rdcarray<ShaderDebugState> ContinueDebug();
 
-  GlobalState GetGlobal() { return global; }
-  ThreadState &GetActiveLane() { return workgroup[activeLaneIndex]; }
-private:
-  virtual void PreParse(uint32_t maxId);
-  virtual void PostParse();
-  virtual void RegisterOp(Iter it);
+  Iter GetIterForInstruction(uint32_t inst);
+  uint32_t GetInstructionForIter(Iter it);
+  uint32_t GetInstructionForFunction(Id id);
+  const DataType &GetType(Id typeId);
+  rdcstr GetRawName(Id id) const;
+  rdcstr GetHumanName(Id id);
+  void AllocateVariable(Id id, Id typeId, DebugVariableType sourceVarType, const rdcstr &sourceName,
+                        ShaderVariable &outVar);
 
   ShaderVariable EvaluatePointerVariable(const ShaderVariable &v) const;
   ShaderVariable MakePointerVariable(Id id, const ShaderVariable *v, uint32_t scalar0 = ~0U,
@@ -111,6 +136,14 @@ private:
   void WriteThroughPointer(const ShaderVariable &ptr, const ShaderVariable &val);
   ShaderVariable MakeCompositePointer(const ShaderVariable &base, Id id, rdcarray<uint32_t> &indices);
 
+  uint32_t GetNumInstructions() { return (uint32_t)instructionOffsets.size(); }
+  GlobalState GetGlobal() { return global; }
+  ThreadState &GetActiveLane() { return workgroup[activeLaneIndex]; }
+private:
+  virtual void PreParse(uint32_t maxId);
+  virtual void PostParse();
+  virtual void RegisterOp(Iter it);
+
   void AllocateVariable(const Decorations &varDecorations, const Decorations &curDecorations,
                         DebugVariableType sourceVarType, const rdcstr &sourceName, uint32_t offset,
                         const DataType &inType, ShaderVariable &outVar);
@@ -154,11 +187,9 @@ private:
 
   rdcarray<size_t> instructionOffsets;
 
-  rdcstr GetRawName(Id id) const;
-  rdcstr GetHumanName(Id id);
-
   std::set<rdcstr> usedNames;
   std::map<Id, rdcstr> dynamicNames;
+  void CalcActiveMask(rdcarray<bool> &activeMask);
 };
 
 // this does a 'safe' value assignment, by doing parallel depth-first iteration of both variables
diff --git a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
index 8e7de6a9a..7f7c0f265 100644
--- a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
+++ b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp
@@ -44,13 +44,6 @@ void AssignValue(ShaderVariable &dst, const ShaderVariable &src)
     AssignValue(dst.members[i], src.members[i]);
 }
 
-ThreadState::ThreadState(int workgroupIdx, GlobalState &globalState) : global(globalState)
-{
-  workgroupIndex = workgroupIdx;
-  nextInstruction = 0;
-  done = false;
-}
-
 Debugger::Debugger()
 {
 }
@@ -65,6 +58,26 @@ void Debugger::Parse(const rdcarray<uint32_t> &spirvWords)
   Processor::Parse(spirvWords);
 }
 
+Iter Debugger::GetIterForInstruction(uint32_t inst)    // iterator at instruction number inst
+{
+  return Iter(m_SPIRV, instructionOffsets[inst]);    // inst is not range-checked here
+}
+
+uint32_t Debugger::GetInstructionForIter(Iter it)    // instruction number recorded at it's offset
+{
+  return instructionOffsets.indexOf(it.offs());    // NOTE(review): confirm the not-found result
+}
+
+uint32_t Debugger::GetInstructionForFunction(Id id)    // instruction number where function id begins
+{
+  return instructionOffsets.indexOf(functions[id].begin);    // begin is matched as an offset
+}
+
+const rdcspv::DataType &Debugger::GetType(Id typeId)    // the dataTypes entry stored for typeId
+{
+  return dataTypes[typeId];
+}
+
 ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const ShaderStage stage,
                                        const rdcstr &entryPoint,
                                        const rdcarray<SpecConstant> &specInfo,
@@ -87,7 +100,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const Shader
 
   int workgroupSize = stage == ShaderStage::Pixel ? 4 : 1;
   for(int i = 0; i < workgroupSize; i++)
-    workgroup.push_back(ThreadState(i, global));
+    workgroup.push_back(ThreadState(i, *this, global));
 
   ThreadState &active = GetActiveLane();
 
@@ -235,37 +248,12 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
 
   rdcarray<ShaderDebugState> ret;
 
-  // if we've finished, return an empty set to signify that
-  if(active.Finished())
-    return ret;
-
-  // initialise a blank set of shader variable changes in the first ShaderDebugState
+  // initialise the first ShaderDebugState if we haven't stepped yet
   if(steps == 0)
   {
     // we should be sitting at the entry point function prologue, step forward into the first block
     // and past any function-local variable declarations
-    {
-      Iter it(m_SPIRV, instructionOffsets[active.nextInstruction]);
-
-      RDCASSERT(OpDecoder(it).op == Op::Function);
-      it++;
-      // vulkan doesn't allow entry points with parameters, next should be a label
-      RDCASSERT(OpDecoder(it).op == Op::Label);
-      it++;
-
-      // handle any variable declarations
-      while(OpDecoder(it).op == Op::Variable)
-      {
-        OpVariable decl(it);
-
-        // TODO declare variable
-
-        it++;
-      }
-
-      // next instruction is the first actual instruction we'll execute
-      active.nextInstruction = instructionOffsets.indexOf(it.offs());
-    }
+    active.EnterFunction(NULL, {});
 
     ShaderDebugState initial;
 
@@ -276,11 +264,72 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
 
     initial.sourceVars = sourceVars;
 
+    initial.stepIndex = steps;
+
+    active.FillCallstack(initial);
+
     ret.push_back(initial);
 
     steps++;
   }
 
+  // if we've finished, return an empty set to signify that
+  if(active.Finished())
+    return ret;
+
+  rdcarray<rdcarray<ShaderVariable>> oldworkgroup;
+
+  oldworkgroup.resize(workgroup.size());
+
+  rdcarray<bool> activeMask;
+
+  // do 100 in a chunk
+  for(int cycleCounter = 0; cycleCounter < 100; cycleCounter++)
+  {
+    if(active.Finished())
+      break;
+
+    // set up the old workgroup so that cross-workgroup/cross-quad operations (e.g. DDX/DDY) get
+    // consistent results even when we step the quad out of order. Otherwise if an operation reads
+    // and writes from the same register we'd trash data needed for other workgroup elements.
+    for(size_t i = 0; i < oldworkgroup.size(); i++)
+      oldworkgroup[i] = workgroup[i].ids;
+
+    // calculate the current mask of which threads are active
+    CalcActiveMask(activeMask);
+
+    // step all active members of the workgroup
+    for(size_t i = 0; i < workgroup.size(); i++)
+    {
+      if(activeMask[i])
+      {
+        if(workgroup[i].nextInstruction >= instructionOffsets.size())
+        {
+          if(i == activeLaneIndex)
+            ret.push_back(ShaderDebugState());
+
+          continue;
+        }
+
+        if(i == activeLaneIndex)
+        {
+          ShaderDebugState state;
+          workgroup[i].StepNext(&state, oldworkgroup);
+          state.stepIndex = steps;
+          state.sourceVars = sourceVars;
+          workgroup[i].FillCallstack(state);
+          ret.push_back(state);
+        }
+        else
+        {
+          workgroup[i].StepNext(NULL, oldworkgroup);
+        }
+      }
+    }
+
+    steps++;
+  }
+
   return ret;
 }
 
@@ -503,6 +552,32 @@ rdcstr Debugger::GetHumanName(Id id)
   return name;
 }
 
+void Debugger::CalcActiveMask(rdcarray<bool> &activeMask)
+{
+  // the mask holds exactly one flag for each thread in the workgroup
+  const size_t numThreads = workgroup.size();
+  activeMask.resize(numThreads);
+
+  // every thread begins active, any that are running diverged would be switched off below
+  for(size_t lane = 0; lane < numThreads; lane++)
+    activeMask[lane] = true;
+  // workgroups only converge automatically in pixel shaders, compute needs explicit sync
+  if(stage == ShaderStage::Pixel)
+  {
+    // TODO handle diverging control flow
+  }
+}
+
+void Debugger::AllocateVariable(Id id, Id typeId, DebugVariableType sourceVarType,
+                                const rdcstr &sourceName, ShaderVariable &outVar)
+{
+  // typeId must be a pointer type - the variable is allocated using the pointed-to type
+  const DataType &ptrType = dataTypes[typeId];
+  RDCASSERT(ptrType.type == DataType::PointerType);
+  AllocateVariable(decorations[id], decorations[id], sourceVarType, sourceName, 0,
+                   dataTypes[ptrType.InnerType()], outVar);
+}
+
 void Debugger::AllocateVariable(const Decorations &varDecorations, const Decorations &curDecorations,
                                 DebugVariableType sourceVarType, const rdcstr &sourceName,
                                 uint32_t offset, const DataType &inType, ShaderVariable &outVar)