Support for DXIL SM6.6 Derivatives in Compute Shaders

Quads use a linear layout (4x1x1) for 1D workgroups (Nx1x1); otherwise they use a quad layout (2x2x1).

HLSL
ddx()
ddx_coarse()
ddy()
ddy_coarse()
ddx_fine()
ddy_fine()

CalculateLevelOfDetail()
CalculateLevelOfDetailUnclamped()
Sample()
SampleBias()
SampleCmp()

DXIL
DXOp::DerivCoarseX
DXOp::DerivCoarseY
DXOp::DerivFineX
DXOp::DerivFineY
DXOp::CalculateLOD
DXOp::Sample
DXOp::SampleBias
DXOp::SampleCmp
This commit is contained in:
Jake Turner
2025-12-12 11:50:41 +13:00
parent ab7016e142
commit a3f3f1a510
3 changed files with 72 additions and 10 deletions
+25 -9
View File
@@ -1523,12 +1523,13 @@ void MemoryTracking::ConvertGlobalAllocToLocal(Id allocId)
// Must be called from the replay manager thread (the debugger thread)
ThreadState::ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId,
uint32_t laneIndex, uint32_t numThreads)
uint32_t laneIndex, uint32_t numThreads, ShaderFeatures shaderFeatures)
: m_Debugger(debugger),
m_GlobalState(globalState),
m_Program(debugger.GetProgram()),
m_MaxSSAId(maxSSAId),
m_WorkgroupIndex(laneIndex)
m_WorkgroupIndex(laneIndex),
m_Features(shaderFeatures)
{
THREADSTATE_CHECK_DEBUGGER_THREAD();
m_ShaderType = m_Program.GetShaderType();
@@ -3099,9 +3100,12 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
case DXOp::DerivFineX:
case DXOp::DerivFineY:
{
if(m_ShaderType != DXBC::ShaderType::Pixel || workgroup.size() < 4)
if(!(m_Features & ShaderFeatures::Derivatives) || (workgroup.size() < 4) ||
m_QuadNeighbours.contains(~0U))
{
RDCERR("Undefined results using derivative instruction outside of a pixel shader.");
RDCERR(
"Undefined results using derivative instruction in shader without support for "
"derivatives");
}
else
{
@@ -6880,12 +6884,16 @@ bool ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, O
ShaderVariable ddx;
ShaderVariable ddy;
// Sample, SampleBias, CalculateLOD need DDX, DDY
if((dxOpCode == DXOp::Sample) || (dxOpCode == DXOp::SampleBias) || (dxOpCode == DXOp::CalculateLOD))
// Sample, SampleBias, SampleCmp, CalculateLOD need DDX, DDY
if((dxOpCode == DXOp::Sample) || (dxOpCode == DXOp::SampleBias) ||
(dxOpCode == DXOp::SampleCmp) || (dxOpCode == DXOp::CalculateLOD))
{
if(m_ShaderType != DXBC::ShaderType::Pixel || m_QuadNeighbours.contains(~0U))
if(!(m_Features & ShaderFeatures::Derivatives) || (workgroup.size() < 4) ||
m_QuadNeighbours.contains(~0U))
{
RDCERR("Undefined results using derivative instruction outside of a pixel shader.");
RDCERR(
"Undefined results using derivative instruction in shader without support for "
"derivatives");
}
else
{
@@ -9129,6 +9137,13 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve
ShaderDebugTrace *ret = new ShaderDebugTrace;
ret->stage = shaderStage;
ShaderFeatures shaderFeatures = ShaderFeatures::None;
bool isSM66Plus = (m_Program->GetMajorVersion() > 6) ||
((m_Program->GetMajorVersion() == 6) && (m_Program->GetMinorVersion() >= 6));
if((shaderStage == ShaderStage::Fragment) || ((shaderStage == ShaderStage::Compute) && isSM66Plus))
shaderFeatures |= ShaderFeatures::Derivatives;
// Get the global state from the API wrapper
m_GlobalState.builtins = apiWrapper->GetBuiltins();
m_GlobalState.subgroupSize = apiWrapper->GetSubgroupSize();
@@ -9137,7 +9152,8 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve
for(uint32_t i = 0; i < threadsInWorkgroup; i++)
{
m_Workgroup.push_back(ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup));
m_Workgroup.push_back(
ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup, shaderFeatures));
m_QueuedDeviceThreadSteps[i] = false;
m_QueuedGpuMathOps[i] = false;
m_QueuedGpuSampleGatherOps[i] = false;
+11 -1
View File
@@ -435,10 +435,18 @@ struct GpuSampleGatherOperation
ShaderVariable *result = NULL;
};
// Bit flags describing optional capabilities of the shader being debugged.
// Each enumerator occupies its own bit so that flags can be combined and tested as a mask.
enum class ShaderFeatures : uint32_t
{
  // No optional capabilities.
  None = 0U,
  // The shader supports derivative instructions; when this bit is not set the
  // debugger reports derivative results as undefined.
  Derivatives = 1U << 0,
};
BITMASK_OPERATORS(ShaderFeatures);
struct ThreadState
{
ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId,
uint32_t laneIndex, uint32_t numThreads);
uint32_t laneIndex, uint32_t numThreads, ShaderFeatures shaderFeatures);
~ThreadState();
void EnterEntryPoint(const DXIL::Function *function, bool hasDebugState);
@@ -719,6 +727,8 @@ private:
rdcarray<bool> m_ActiveMask;
ShaderFeatures m_Features;
ShaderDebugState m_PendingDebugState;
ShaderVariable m_PendingResultData;
GpuMathOperation m_QueuedGpuMathOp;
@@ -1151,12 +1151,48 @@ void Program::Parse(const DXBC::Reflection *reflection)
for(Function *f : m_Functions)
{
if(f->family != FunctionFamily::DXOp)
continue;
if(f->name == "dx.op.barrier")
m_Threadscope |= DXBC::ThreadScope::Workgroup;
if(f->name.beginsWith("dx.op.quadReadLaneAt.") || f->name.beginsWith("dx.op.quadOp.") ||
f->name.beginsWith("dx.op.quadVote."))
m_Threadscope |= DXBC::ThreadScope::Quad;
}
// Compute shaders using derivatives require quad scope
// DXOp::DerivCoarseX
// DXOp::DerivCoarseY
// DXOp::DerivFineX
// DXOp::DerivFineY
// DXOp::CalculateLOD
// DXOp::Sample
// DXOp::SampleBias
// DXOp::SampleCmp
for(Function *f : m_Functions)
{
if(f->external)
continue;
for(size_t funcIdx = 0; funcIdx < f->instructions.size(); funcIdx++)
{
const Instruction *inst = f->instructions[funcIdx];
if(inst->op != Operation::Call)
continue;
const Function *callFunc = inst->getFuncCall();
if(callFunc->family != FunctionFamily::DXOp)
continue;
DXOp dxOpCode = DXOp::NumOpCodes;
RDCASSERT(getival<DXOp>(inst->args[0], dxOpCode));
RDCASSERT(dxOpCode < DXOp::NumOpCodes, dxOpCode, DXOp::NumOpCodes);
if((dxOpCode == DXOp::DerivCoarseX) || (dxOpCode == DXOp::DerivCoarseY) ||
(dxOpCode == DXOp::DerivFineX) || (dxOpCode == DXOp::DerivFineY) ||
(dxOpCode == DXOp::CalculateLOD) || (dxOpCode == DXOp::Sample) ||
(dxOpCode == DXOp::SampleBias) || (dxOpCode == DXOp::SampleCmp))
{
m_Threadscope |= DXBC::ThreadScope::Quad;
}
}
}
}
m_Parsed = true;