From a3f3f1a51058c2dea3401b8d671b696980045109 Mon Sep 17 00:00:00 2001 From: Jake Turner Date: Fri, 12 Dec 2025 11:50:41 +1300 Subject: [PATCH] Support for DXIL SM6.6 Derivatives in Compute Shaders linear layout (4x1x1) : for 1D workgroup (Nx1x1) otherwise quad layout (2x2x1) HLSL ddx() ddx_coarse() ddy() ddy_coarse() ddx_fine() ddy_fine() CalculateLevelOfDetail() CalculateLevelOfDetailUnclamped() Sample() SampleBias() SampleCmp() DXIL DXOp::DerivCoarseX DXOp::DerivCoarseY DXOp::DerivFineX DXOp::DerivFineY DXOp::CalculateLOD DXOp::Sample DXOp::SampleBias --- renderdoc/driver/shaders/dxil/dxil_debug.cpp | 34 +++++++++++++----- renderdoc/driver/shaders/dxil/dxil_debug.h | 12 ++++++- .../driver/shaders/dxil/dxil_disassemble.cpp | 36 +++++++++++++++++++ 3 files changed, 72 insertions(+), 10 deletions(-) diff --git a/renderdoc/driver/shaders/dxil/dxil_debug.cpp b/renderdoc/driver/shaders/dxil/dxil_debug.cpp index c477b7c89..9c58ac71f 100644 --- a/renderdoc/driver/shaders/dxil/dxil_debug.cpp +++ b/renderdoc/driver/shaders/dxil/dxil_debug.cpp @@ -1523,12 +1523,13 @@ void MemoryTracking::ConvertGlobalAllocToLocal(Id allocId) // Must be called from the replay manager thread (the debugger thread) ThreadState::ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId, - uint32_t laneIndex, uint32_t numThreads) + uint32_t laneIndex, uint32_t numThreads, ShaderFeatures shaderFeatures) : m_Debugger(debugger), m_GlobalState(globalState), m_Program(debugger.GetProgram()), m_MaxSSAId(maxSSAId), - m_WorkgroupIndex(laneIndex) + m_WorkgroupIndex(laneIndex), + m_Features(shaderFeatures) { THREADSTATE_CHECK_DEBUGGER_THREAD(); m_ShaderType = m_Program.GetShaderType(); @@ -3099,9 +3100,12 @@ bool ThreadState::ExecuteInstruction(const rdcarray &workgroup) case DXOp::DerivFineX: case DXOp::DerivFineY: { - if(m_ShaderType != DXBC::ShaderType::Pixel || workgroup.size() < 4) + if(!(m_Features & ShaderFeatures::Derivatives) || (workgroup.size() < 4) || + m_QuadNeighbours.contains(~0U)) { - RDCERR("Undefined results using derivative instruction outside of a pixel shader."); + RDCERR( + "Undefined results using derivative instruction in shader without support for " + "derivatives"); } else { @@ -6880,12 +6884,16 @@ bool ThreadState::PerformGPUResourceOp(const rdcarray &workgroup, O ShaderVariable ddx; ShaderVariable ddy; - // Sample, SampleBias, CalculateLOD need DDX, DDY - if((dxOpCode == DXOp::Sample) || (dxOpCode == DXOp::SampleBias) || (dxOpCode == DXOp::CalculateLOD)) + // Sample, SampleBias, SampleCmp, CalculateLOD need DDX, DDY + if((dxOpCode == DXOp::Sample) || (dxOpCode == DXOp::SampleBias) || + (dxOpCode == DXOp::SampleCmp) || (dxOpCode == DXOp::CalculateLOD)) { - if(m_ShaderType != DXBC::ShaderType::Pixel || m_QuadNeighbours.contains(~0U)) + if(!(m_Features & ShaderFeatures::Derivatives) || (workgroup.size() < 4) || + m_QuadNeighbours.contains(~0U)) { - RDCERR("Undefined results using derivative instruction outside of a pixel shader."); + RDCERR( + "Undefined results using derivative instruction in shader without support for " + "derivatives"); } else { @@ -9129,6 +9137,13 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve ShaderDebugTrace *ret = new ShaderDebugTrace; ret->stage = shaderStage; + ShaderFeatures shaderFeatures = ShaderFeatures::None; + bool isSM66Plus = (m_Program->GetMajorVersion() > 6) || + ((m_Program->GetMajorVersion() == 6) && (m_Program->GetMinorVersion() >= 6)); + + if((shaderStage == ShaderStage::Fragment) || ((shaderStage == ShaderStage::Compute) && isSM66Plus)) + shaderFeatures |= ShaderFeatures::Derivatives; + // Get the global state from the API wrapper m_GlobalState.builtins = apiWrapper->GetBuiltins(); m_GlobalState.subgroupSize = apiWrapper->GetSubgroupSize(); @@ -9137,7 +9152,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve for(uint32_t i = 0; i < threadsInWorkgroup; i++) { - m_Workgroup.push_back(ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup)); + m_Workgroup.push_back( + ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup, shaderFeatures)); m_QueuedDeviceThreadSteps[i] = false; m_QueuedGpuMathOps[i] = false; m_QueuedGpuSampleGatherOps[i] = false; diff --git a/renderdoc/driver/shaders/dxil/dxil_debug.h b/renderdoc/driver/shaders/dxil/dxil_debug.h index 963e99c0a..ffb33e4c8 100644 --- a/renderdoc/driver/shaders/dxil/dxil_debug.h +++ b/renderdoc/driver/shaders/dxil/dxil_debug.h @@ -435,10 +435,18 @@ struct GpuSampleGatherOperation ShaderVariable *result = NULL; }; +enum class ShaderFeatures : uint32_t +{ + None = 0, + Derivatives = 1 << 0, +}; + +BITMASK_OPERATORS(ShaderFeatures); + struct ThreadState { ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId, - uint32_t laneIndex, uint32_t numThreads); + uint32_t laneIndex, uint32_t numThreads, ShaderFeatures shaderFeatures); ~ThreadState(); void EnterEntryPoint(const DXIL::Function *function, bool hasDebugState); @@ -719,6 +727,8 @@ private: rdcarray m_ActiveMask; + ShaderFeatures m_Features; + ShaderDebugState m_PendingDebugState; ShaderVariable m_PendingResultData; GpuMathOperation m_QueuedGpuMathOp; diff --git a/renderdoc/driver/shaders/dxil/dxil_disassemble.cpp b/renderdoc/driver/shaders/dxil/dxil_disassemble.cpp index 0808d8c47..1e870a086 100644 --- a/renderdoc/driver/shaders/dxil/dxil_disassemble.cpp +++ b/renderdoc/driver/shaders/dxil/dxil_disassemble.cpp @@ -1151,12 +1151,48 @@ void Program::Parse(const DXBC::Reflection *reflection) for(Function *f : m_Functions) { + if(f->family != FunctionFamily::DXOp) + continue; if(f->name == "dx.op.barrier") m_Threadscope |= DXBC::ThreadScope::Workgroup; if(f->name.beginsWith("dx.op.quadReadLaneAt.") || f->name.beginsWith("dx.op.quadOp.") || f->name.beginsWith("dx.op.quadVote.")) m_Threadscope |= DXBC::ThreadScope::Quad; } + // Compute shaders using derivatives require quad scope + // DXOp::DerivCoarseX + // DXOp::DerivCoarseY + // DXOp::DerivFineX + // DXOp::DerivFineY + // DXOp::CalculateLOD + // DXOp::Sample + // DXOp::SampleBias + // DXOp::SampleCmp + for(Function *f : m_Functions) + { + if(f->external) + continue; + for(size_t funcIdx = 0; funcIdx < f->instructions.size(); funcIdx++) + { + const Instruction *inst = f->instructions[funcIdx]; + if(inst->op != Operation::Call) + continue; + const Function *callFunc = inst->getFuncCall(); + if(callFunc->family != FunctionFamily::DXOp) + continue; + + DXOp dxOpCode = DXOp::NumOpCodes; + RDCASSERT(getival(inst->args[0], dxOpCode)); + RDCASSERT(dxOpCode < DXOp::NumOpCodes, dxOpCode, DXOp::NumOpCodes); + if((dxOpCode == DXOp::DerivCoarseX) || (dxOpCode == DXOp::DerivCoarseY) || + (dxOpCode == DXOp::DerivFineX) || (dxOpCode == DXOp::DerivFineY) || + (dxOpCode == DXOp::CalculateLOD) || (dxOpCode == DXOp::Sample) || + (dxOpCode == DXOp::SampleBias) || (dxOpCode == DXOp::SampleCmp)) + { + m_Threadscope |= DXBC::ThreadScope::Quad; + } + } + } } m_Parsed = true;