From 8d3f40b0a9eb1454e00774c51924e802bee08501 Mon Sep 17 00:00:00 2001 From: baldurk Date: Fri, 31 Jan 2025 14:20:11 +0000 Subject: [PATCH] Prepare SPIR-V debugger for larger workgroup sizes * The workgroup size is passed in at creation time, and we handle the potential for multiple quads by identifying quads via quad ID --- .../driver/shaders/spirv/spirv_debug.cpp | 65 +++++++++------- renderdoc/driver/shaders/spirv/spirv_debug.h | 19 +++-- .../shaders/spirv/spirv_debug_setup.cpp | 65 ++++++++++++++-- renderdoc/driver/vulkan/vk_shaderdebug.cpp | 75 ++++++++++++------- 4 files changed, 159 insertions(+), 65 deletions(-) diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.cpp b/renderdoc/driver/shaders/spirv/spirv_debug.cpp index 2a470e31d..3d4f71751 100644 --- a/renderdoc/driver/shaders/spirv/spirv_debug.cpp +++ b/renderdoc/driver/shaders/spirv/spirv_debug.cpp @@ -77,13 +77,9 @@ inline uint64_t CountOnes(uint64_t value) namespace rdcspv { -ThreadState::ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState) +ThreadState::ThreadState(Debugger &debug, const GlobalState &globalState) : debugger(debug), global(globalState) { - workgroupIndex = workgroupIdx; - nextInstruction = 0; - helperInvocation = false; - killed = false; } ThreadState::~ThreadState() @@ -95,7 +91,7 @@ ThreadState::~ThreadState() bool ThreadState::Finished() const { - return killed || callstack.empty(); + return dead || callstack.empty(); } void ThreadState::FillCallstack(rdcarray &funcs) @@ -415,18 +411,33 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De { const ThreadState *a = NULL, *b = NULL; + if(quadNeighbours[0] == ~0U || quadNeighbours[1] == ~0U || quadNeighbours[2] == ~0U || + quadNeighbours[3] == ~0U) + { + debugger.GetAPIWrapper()->AddDebugMessage( + MessageCategory::Execution, MessageSeverity::High, MessageSource::RuntimeWarning, + StringFormat::Fmt("Derivative calculation within non-quad on input %s", 
debugger.GetHumanName(val).c_str())); + return ShaderVariable("", 0.0f, 0.0f, 0.0f, 0.0f); + } + + RDCASSERT(quadNeighbours[0] < workgroup.size(), quadNeighbours[0], workgroup.size()); + RDCASSERT(quadNeighbours[1] < workgroup.size(), quadNeighbours[1], workgroup.size()); + RDCASSERT(quadNeighbours[2] < workgroup.size(), quadNeighbours[2], workgroup.size()); + RDCASSERT(quadNeighbours[3] < workgroup.size(), quadNeighbours[3], workgroup.size()); + const bool xdirection = (dir == DDX); if(type == Coarse) { // coarse derivatives are identical across the quad, based on the top-left. - a = &workgroup[0]; - b = &workgroup[xdirection ? 1 : 2]; + a = &workgroup[quadNeighbours[0]]; + b = &workgroup[quadNeighbours[xdirection ? 1 : 2]]; } else { // we need to figure out the exact pair to use - int x = workgroupIndex & 1; - int y = workgroupIndex / 2; + int x = quadLaneIndex & 1; + int y = quadLaneIndex / 2; if(x == 0) { @@ -435,13 +446,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De // top-left if(xdirection) { - a = &workgroup[0]; - b = &workgroup[1]; + a = &workgroup[quadNeighbours[0]]; + b = &workgroup[quadNeighbours[1]]; } else { - a = &workgroup[0]; - b = &workgroup[2]; + a = &workgroup[quadNeighbours[0]]; + b = &workgroup[quadNeighbours[2]]; } } else @@ -449,13 +460,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De // bottom-left if(xdirection) { - a = &workgroup[2]; - b = &workgroup[3]; + a = &workgroup[quadNeighbours[2]]; + b = &workgroup[quadNeighbours[3]]; } else { - a = &workgroup[0]; - b = &workgroup[2]; + a = &workgroup[quadNeighbours[0]]; + b = &workgroup[quadNeighbours[2]]; } } } @@ -466,13 +477,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De // top-right if(xdirection) { - a = &workgroup[0]; - b = &workgroup[1]; + a = &workgroup[quadNeighbours[0]]; + b = &workgroup[quadNeighbours[1]]; } else { - a = &workgroup[1]; - b = &workgroup[3]; + a = 
&workgroup[quadNeighbours[1]]; + b = &workgroup[quadNeighbours[3]]; } } else @@ -480,13 +491,13 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De // bottom-right if(xdirection) { - a = &workgroup[2]; - b = &workgroup[3]; + a = &workgroup[quadNeighbours[2]]; + b = &workgroup[quadNeighbours[3]]; } else { - a = &workgroup[1]; - b = &workgroup[3]; + a = &workgroup[quadNeighbours[1]]; + b = &workgroup[quadNeighbours[3]]; } } } @@ -3068,7 +3079,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray case Op::TerminateInvocation: case Op::Kill: { - killed = true; + dead = true; // destroy all stack frames for(StackFrame *exitingFrame : callstack) diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.h b/renderdoc/driver/shaders/spirv/spirv_debug.h index 7b00b463e..37884306f 100644 --- a/renderdoc/driver/shaders/spirv/spirv_debug.h +++ b/renderdoc/driver/shaders/spirv/spirv_debug.h @@ -175,7 +175,7 @@ class Debugger; struct ThreadState { - ThreadState(uint32_t workgroupIdx, Debugger &debug, const GlobalState &globalState); + ThreadState(Debugger &debug, const GlobalState &globalState); ~ThreadState(); void EnterEntryPoint(ShaderDebugState *state); @@ -231,10 +231,16 @@ struct ThreadState std::map lastWrite; - // index in the pixel quad - uint32_t workgroupIndex; - bool helperInvocation; - bool killed; + // quad ID (arbitrary, just used to find neighbours for derivatives) + uint32_t quadId = 0; + // index in the pixel quad (relative to the active lane) + uint32_t quadLaneIndex = ~0U; + // the lane indices of our quad neighbours + uint32_t quadNeighbours[4] = {~0U, ~0U, ~0U, ~0U}; + // index in the workgroup + uint32_t workgroupIndex = 0; + bool helperInvocation = false; + bool dead = true; const ShaderVariable &GetSrc(Id id) const; void WritePointerValue(Id pointer, const ShaderVariable &val); @@ -368,7 +374,8 @@ public: ShaderDebugTrace *BeginDebug(DebugAPIWrapper *apiWrapper, const ShaderStage stage, const rdcstr 
&entryPoint, const rdcarray &specInfo, const std::map &instructionLines, - const SPIRVPatchData &patchData, uint32_t activeIndex); + const SPIRVPatchData &patchData, uint32_t activeIndex, + uint32_t workgroupSize); rdcarray ContinueDebug(); diff --git a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp index 7e61d63e1..c556e0ebd 100644 --- a/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp +++ b/renderdoc/driver/shaders/spirv/spirv_debug_setup.cpp @@ -838,7 +838,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s const rdcstr &entryPoint, const rdcarray &specInfo, const std::map &instructionLines, - const SPIRVPatchData &patchData, uint32_t activeIndex) + const SPIRVPatchData &patchData, uint32_t activeIndex, + uint32_t workgroupSize) { Id entryId = entryLookup[ShaderEntryPoint(entryPoint, shaderStage)]; @@ -895,9 +896,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s stage = shaderStage; apiWrapper = api; - uint32_t workgroupSize = shaderStage == ShaderStage::Pixel ? 
4 : 1; for(uint32_t i = 0; i < workgroupSize; i++) - workgroup.push_back(ThreadState(i, *this, global)); + workgroup.push_back(ThreadState(*this, global)); ThreadState &active = GetActiveLane(); @@ -1489,6 +1489,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s for(uint32_t i = 0; i < workgroupSize; i++) { ThreadState &lane = workgroup[i]; + lane.workgroupIndex = i; if(i != activeLaneIndex) { lane.nextInstruction = active.nextInstruction; @@ -1499,16 +1500,68 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *api, const ShaderStage s if(stage == ShaderStage::Pixel) { - ShaderVariable var(rdcstr(), 0U, 0U, 0U, 0U); - apiWrapper->FillInputValue(var, ShaderBuiltin::IsHelper, i, 0, 0); - lane.helperInvocation = var.value.u32v[0] != 0; + lane.helperInvocation = apiWrapper->GetThreadProperty(i, ThreadProperty::Helper) != 0; + lane.quadLaneIndex = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadLane); + lane.quadId = apiWrapper->GetThreadProperty(i, ThreadProperty::QuadId); } + lane.dead = apiWrapper->GetThreadProperty(i, ThreadProperty::Active) == 0; + // now that the globals are allocated and their storage won't move, we can take pointers to them for(const PointerId &p : pointerIDs) p.Set(*this, global, lane); } + // find quad neighbours + { + rdcarray processedQuads; + for(uint32_t i = 0; i < workgroupSize; i++) + { + uint32_t desiredQuad = workgroup[i].quadId; + + // ignore threads not in any quad + if(desiredQuad == 0) + continue; + + // quads are almost certainly sorted together, so shortcut by checking the last one + if((!processedQuads.empty() && processedQuads.back() == desiredQuad) || + processedQuads.contains(desiredQuad)) + continue; + + processedQuads.push_back(desiredQuad); + + // find the threads + uint32_t threads[4] = { + i, + ~0U, + ~0U, + ~0U, + }; + for(uint32_t j = i + 1, t = 1; j < workgroupSize && t < 4; j++) + { + if(workgroup[j].quadId == desiredQuad) + threads[t++] = j; + } + + // now swizzle the 
threads to know each other + for(uint32_t src = 0; src < 4; src++) + { + uint32_t lane = workgroup[threads[src]].quadLaneIndex; + + if(lane >= 4) + continue; + + for(uint32_t dst = 0; dst < 4; dst++) + { + if(threads[dst] == ~0U) + continue; + + workgroup[threads[dst]].quadNeighbours[lane] = threads[src]; + } + } + } + } + // this contains all the accumulated line number information. Add in our disassembly mapping ret->instInfo = m_InstInfo; for(size_t i = 0; i < m_InstInfo.size(); i++) diff --git a/renderdoc/driver/vulkan/vk_shaderdebug.cpp b/renderdoc/driver/vulkan/vk_shaderdebug.cpp index c6878871a..eb2b4f927 100644 --- a/renderdoc/driver/vulkan/vk_shaderdebug.cpp +++ b/renderdoc/driver/vulkan/vk_shaderdebug.cpp @@ -2968,8 +2968,8 @@ struct VertexLaneData struct PixelLaneData { - Vec4f fragCoord; // per-lane coord - uint32_t helper; // per-lane helper bit + Vec4f fragCoord; // per-lane coord + uint32_t isHelper; // per-lane helper bit uint32_t padding[3]; }; @@ -3302,7 +3302,7 @@ static void CreateInputFetcher(rdcarray &spv, helper.base = helper.loadOps.add(rdcspv::OpSelect(uint32Type, editor.MakeId(), helper.base, getUIntConst(1), getUIntConst(0))); fixedValues.push_back(helper); - structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, helper)}); + structMembers.push_back({uint32Type, "__rd_helper", offsetof(PixelLaneData, isHelper)}); offset += sizeof(PixelLaneData); } @@ -3881,6 +3881,9 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u apiWrapper->location_inputs.resize(numThreads); apiWrapper->thread_builtins.resize(numThreads); + apiWrapper->thread_props.resize(numThreads); + + apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1; std::unordered_map &global_builtins = apiWrapper->global_builtins; global_builtins[ShaderBuiltin::BaseInstance] = @@ -4056,7 +4059,7 @@ ShaderDebugTrace *VulkanReplay::DebugVertex(uint32_t eventId, uint32_t vertid, u rdcspv::Debugger *debugger = new 
rdcspv::Debugger; debugger->Parse(shader.spirv.GetSPIRV()); ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Vertex, entryPoint, spec, - shadRefl.instructionLines, shadRefl.patchData, 0); + shadRefl.instructionLines, shadRefl.patchData, 0, 1); apiWrapper->ResetReplay(); return ret; @@ -4589,27 +4592,40 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_ { byte *value = LaneData + t * structStride; + apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1; + // read PixelLaneData - PixelLaneData *pixelData = (PixelLaneData *)value; + { + PixelLaneData *pixelData = (PixelLaneData *)value; + + { + ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position]; + + var.rows = 1; + var.columns = 4; + var.type = VarType::Float; + + memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f)); + } + + { + ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper]; + + var.rows = 1; + var.columns = 1; + var.type = VarType::Bool; + + memcpy(var.value.u8v.data(), &pixelData->isHelper, sizeof(uint32_t)); + } + + if(numThreads == 4) + apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Active] = 1; + apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::Helper] = + t != winner->laneIndex ? 
1 : 0; + apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadId] = 1000; + apiWrapper->thread_props[t][(size_t)rdcspv::ThreadProperty::QuadLane] = t; + } value += sizeof(PixelLaneData); - { - ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::Position]; - - var.rows = 1; - var.columns = 4; - var.type = VarType::Float; - - memcpy(var.value.u8v.data(), &pixelData->fragCoord, sizeof(Vec4f)); - } - { - ShaderVariable &var = apiWrapper->thread_builtins[t][ShaderBuiltin::IsHelper]; - - var.rows = 1; - var.columns = 1; - var.type = VarType::Bool; - - memcpy(var.value.u8v.data(), &pixelData->helper, sizeof(uint32_t)); - } for(size_t i = 0; i < shadRefl.refl->inputSignature.size(); i++) { @@ -4640,7 +4656,8 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_ } ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec, - shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex); + shadRefl.instructionLines, shadRefl.patchData, winner->laneIndex, + numThreads); apiWrapper->ResetReplay(); } else @@ -4712,6 +4729,9 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId, static const uint32_t numThreads = 1; apiWrapper->thread_builtins.resize(numThreads); + apiWrapper->thread_props.resize(numThreads); + + apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1; std::unordered_map &global_builtins = apiWrapper->global_builtins; global_builtins[ShaderBuiltin::DispatchSize] = @@ -4736,7 +4756,7 @@ ShaderDebugTrace *VulkanReplay::DebugThread(uint32_t eventId, rdcspv::Debugger *debugger = new rdcspv::Debugger; debugger->Parse(shader.spirv.GetSPIRV()); ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Compute, entryPoint, spec, - shadRefl.instructionLines, shadRefl.patchData, 0); + shadRefl.instructionLines, shadRefl.patchData, 0, 1); apiWrapper->ResetReplay(); return ret; @@ -4800,6 +4820,9 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t 
eventId, static const uint32_t numThreads = 1; apiWrapper->thread_builtins.resize(numThreads); + apiWrapper->thread_props.resize(numThreads); + + apiWrapper->thread_props[0][(size_t)rdcspv::ThreadProperty::Active] = 1; std::unordered_map &global_builtins = apiWrapper->global_builtins; global_builtins[ShaderBuiltin::DispatchSize] = @@ -4824,7 +4847,7 @@ ShaderDebugTrace *VulkanReplay::DebugMeshThread(uint32_t eventId, rdcspv::Debugger *debugger = new rdcspv::Debugger; debugger->Parse(shader.spirv.GetSPIRV()); ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Mesh, entryPoint, spec, - shadRefl.instructionLines, shadRefl.patchData, 0); + shadRefl.instructionLines, shadRefl.patchData, 0, 1); apiWrapper->ResetReplay(); return ret;