DXIL debugger support for queued GPU operations (run on Device Thread)

This commit is contained in:
Jake Turner
2025-09-29 17:07:40 +01:00
parent 516dec8e17
commit f72c1a3769
3 changed files with 378 additions and 79 deletions
+254 -65
View File
@@ -2306,6 +2306,22 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
case DXOp::TextureGatherCmp:
case DXOp::CalculateLOD:
{
if(IsPendingResultReady())
{
const ShaderVariable &data = GetPendingResult();
ConvertSampleGatherReturn(dxOpCode, inst, data, result);
if(dxOpCode == DXOp::CalculateLOD)
{
// clamped is in arg 6
ShaderVariable arg;
RDCASSERT(GetShaderVariable(inst.args[6], opCode, dxOpCode, arg, false));
// CalculateSampleGather returns {CalculateLevelOfDetail(), CalculateLevelOfDetailUnclamped()}
if(arg.value.u32v[0] == 0)
result.value.u32v[0] = data.value.u32v[1];
}
eventFlags |= ShaderEvents::SampleLoadGather;
break;
}
Id handleId = GetArgumentId(1);
bool annotatedHandle;
ShaderVariable handleVar;
@@ -2314,8 +2330,10 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
break;
MarkResourceAccess(handleVar);
PerformGPUResourceOp(workgroup, opCode, dxOpCode, resRefInfo, inst, result);
eventFlags |= ShaderEvents::SampleLoadGather;
if(!PerformGPUResourceOp(workgroup, opCode, dxOpCode, resRefInfo, inst, result))
break;
DXIL_DEBUG_RDCASSERT(IsPendingResultPending());
break;
}
case DXOp::TextureLoad:
@@ -2337,16 +2355,28 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
ResourceReferenceInfo resRefInfo = GetResource(handleId, annotatedHandle, handleVar);
if(!resRefInfo.Valid())
break;
MarkResourceAccess(handleVar);
ResourceClass resClass = resRefInfo.resClass;
// SRV TextureLoad is done on the GPU
if((dxOpCode == DXOp::TextureLoad) && (resClass == ResourceClass::SRV))
{
PerformGPUResourceOp(workgroup, opCode, dxOpCode, resRefInfo, inst, result);
eventFlags |= ShaderEvents::SampleLoadGather;
if(IsPendingResultReady())
{
const ShaderVariable &data = GetPendingResult();
ConvertSampleGatherReturn(dxOpCode, inst, data, result);
MarkResourceAccess(handleVar);
eventFlags |= ShaderEvents::SampleLoadGather;
}
else
{
if(PerformGPUResourceOp(workgroup, opCode, dxOpCode, resRefInfo, inst, result))
{
DXIL_DEBUG_RDCASSERT(IsPendingResultPending());
}
}
break;
}
MarkResourceAccess(handleVar);
const bool load = (dxOpCode == DXOp::TextureLoad) || (dxOpCode == DXOp::BufferLoad) ||
(dxOpCode == DXOp::RawBufferLoad);
@@ -3019,9 +3049,16 @@ bool ThreadState::ExecuteInstruction(const rdcarray<ThreadState> &workgroup)
case DXOp::Sqrt:
case DXOp::Rsqrt:
{
ShaderVariable arg;
RDCASSERT(GetShaderVariable(inst.args[1], opCode, dxOpCode, arg));
m_Debugger.CalculateMathIntrinsic(dxOpCode, arg, result);
if(IsPendingResultReady())
{
result = GetPendingResult();
}
else
{
ShaderVariable arg;
RDCASSERT(GetShaderVariable(inst.args[1], opCode, dxOpCode, arg));
QueueMathOp(dxOpCode, arg, result);
}
break;
}
case DXOp::Round_ne:
@@ -7106,10 +7143,11 @@ void ThreadState::UpdateGlobalBackingMemory(Id ptrId, const MemoryTracking::Poin
}
}
void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, Operation opCode,
bool ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, Operation opCode,
DXOp dxOpCode, const ResourceReferenceInfo &resRefInfo,
const DXIL::Instruction &inst, ShaderVariable &result)
{
DXIL_DEBUG_RDCASSERT(!IsPendingResultPending());
// TextureLoad(srv,mipLevelOrSampleCount,coord0,coord1,coord2,offset0,offset1,offset2)
// Sample(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,offset2,clamp)
// SampleBias(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,offset2,bias,clamp)
@@ -7125,24 +7163,6 @@ void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, O
// TextureGather(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,channel)
// TextureGatherCmp(srv,sampler,coord0,coord1,coord2,coord3,offset0,offset1,channel,compareValue)
// DXIL reports the vector result as a struct of N members of Element type, plus an int.
const Type *retType = inst.type;
if(dxOpCode != DXOp::CalculateLOD)
{
RDCASSERTEQUAL(retType->type, Type::TypeKind::Struct);
const Type *baseType = retType->members[0];
RDCASSERTEQUAL(baseType->type, Type::TypeKind::Scalar);
result.type = ConvertDXILTypeToVarType(baseType);
result.columns = (uint8_t)(retType->members.size() - 1);
}
else
{
RDCASSERTEQUAL(retType->type, Type::TypeKind::Scalar);
RDCASSERTEQUAL(retType->scalarType, Type::Float);
RDCASSERTEQUAL(result.rows, 1);
RDCASSERTEQUAL(result.columns, 1);
}
// CalculateSampleGather is only valid for SRV resources
ResourceClass resClass = resRefInfo.resClass;
RDCASSERTEQUAL(resClass, ResourceClass::SRV);
@@ -7217,7 +7237,7 @@ void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, O
ShaderVariable handleVar;
ResourceReferenceInfo samplerRef = GetResource(samplerId, annotatedHandle, handleVar);
if(!samplerRef.Valid())
return;
return false;
MarkResourceAccess(handleVar);
RDCASSERTEQUAL(samplerRef.resClass, ResourceClass::Sampler);
@@ -7386,10 +7406,32 @@ void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, O
// DXGI_FORMAT_R32_UINT : u32
// DXGI_FORMAT_R32G32_UINT : u32x2
ShaderVariable data;
m_Debugger.CalculateSampleGather(dxOpCode, resourceData, samplerData, uv, ddx, ddy, texelOffsets,
msIndex, lodValue, compareValue, gatherChannel, instructionIdx,
data);
QueueSampleGather(dxOpCode, resourceData, samplerData, uv, ddx, ddy, texelOffsets, msIndex,
lodValue, compareValue, gatherChannel, instructionIdx, result);
return true;
}
void ThreadState::ConvertSampleGatherReturn(DXIL::DXOp dxOpCode, const DXIL::Instruction &inst,
const ShaderVariable &data, ShaderVariable &result) const
{
// DXIL reports the vector result as a struct of N
// members of Element type, plus an int.
const Type *retType = inst.type;
if(dxOpCode != DXOp::CalculateLOD)
{
RDCASSERTEQUAL(retType->type, Type::TypeKind::Struct);
const Type *baseType = retType->members[0];
RDCASSERTEQUAL(baseType->type, Type::TypeKind::Scalar);
result.type = ConvertDXILTypeToVarType(baseType);
result.columns = (uint8_t)(retType->members.size() - 1);
}
else
{
RDCASSERTEQUAL(retType->type, Type::TypeKind::Scalar);
RDCASSERTEQUAL(retType->scalarType, Type::Float);
RDCASSERTEQUAL(result.rows, 1);
RDCASSERTEQUAL(result.columns, 1);
}
// Do conversion to the return type
if((result.type == VarType::Float) || (result.type == VarType::SInt) ||
@@ -7417,16 +7459,6 @@ void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, O
RDCERR("Unhandled return type %s", ToStr(result.type).c_str());
return;
}
if(dxOpCode == DXOp::CalculateLOD)
{
// clamped is in arg 6
ShaderVariable arg;
RDCASSERT(GetShaderVariable(inst.args[6], opCode, dxOpCode, arg, false));
// CalculateSampleGather returns {CalculateLevelOfDetail(), CalculateLevelOfDetailUnclamped()}
if(arg.value.u32v[0] == 0)
result.value.u32v[0] = data.value.u32v[1];
}
}
rdcstr ThreadState::GetArgumentName(uint32_t i) const
@@ -7850,6 +7882,46 @@ bool ThreadState::CanRunAnotherStep() const
return true;
}
void ThreadState::QueueMathOp(DXIL::DXOp dxOp, const ShaderVariable &input, ShaderVariable &result)
{
  // Only one GPU operation may be in flight per thread at a time.
  DXIL_DEBUG_RDCASSERT(!IsPendingResultPending());
  // Snapshot the destination into thread-owned storage: the device thread
  // writes into m_PendingResultData, not into the caller's stack variable.
  m_PendingResultData = result;
  GpuMathOperation &op = m_QueuedGpuMathOp;
  op.workgroupIndex = m_WorkgroupIndex;
  op.dxOp = dxOp;
  op.input = input;
  op.result = &m_PendingResultData;
  // Mark this step as blocked until the queued math op produces a result.
  SetStepNeedsGpuMathOp();
}
void ThreadState::QueueSampleGather(DXIL::DXOp dxOp, const SampleGatherResourceData &resourceData,
                                    const SampleGatherSamplerData &samplerData,
                                    const ShaderVariable &uv, const ShaderVariable &ddxCalc,
                                    const ShaderVariable &ddyCalc, const int8_t texelOffsets[3],
                                    int multisampleIndex, float lodValue, float compareValue,
                                    GatherChannel gatherChannel, uint32_t instructionIdx,
                                    ShaderVariable &result)
{
  // Only one GPU operation may be in flight per thread at a time.
  DXIL_DEBUG_RDCASSERT(!IsPendingResultPending());
  // Snapshot the destination into thread-owned storage: the device thread
  // fills m_PendingResultData in, consumed when this step resumes.
  m_PendingResultData = result;
  GpuSampleGatherOperation &op = m_QueuedGpuSampleGatherOp;
  op.workgroupIndex = m_WorkgroupIndex;
  op.dxOp = dxOp;
  op.resourceData = resourceData;
  op.samplerData = samplerData;
  op.uv = uv;
  op.ddxCalc = ddxCalc;
  op.ddyCalc = ddyCalc;
  for(int i = 0; i < 3; ++i)
    op.texelOffsets[i] = texelOffsets[i];
  op.multisampleIndex = multisampleIndex;
  op.lodValue = lodValue;
  op.compareValue = compareValue;
  op.gatherChannel = gatherChannel;
  op.instructionIdx = instructionIdx;
  op.result = &m_PendingResultData;
  // Mark this step as blocked until the queued sample/gather has executed.
  SetStepNeedsGpuSampleGatherOp();
}
Debugger::DebugInfo::~DebugInfo()
{
for(const ScopedDebugData *scope : scopedDebugDatas)
@@ -9142,6 +9214,9 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve
m_Stage = shaderStage;
m_EntryPointInterface = m_Program->GetEntryPointInterface();
m_QueuedDeviceThreadSteps.resize(threadsInWorkgroup);
m_QueuedGpuMathOps.resize(threadsInWorkgroup);
m_QueuedGpuSampleGatherOps.resize(threadsInWorkgroup);
m_PendingLanes.resize(threadsInWorkgroup);
uint32_t outputSSAId = m_Program->m_NextSSAId;
uint32_t maxSSAId = outputSSAId + 1;
@@ -9159,6 +9234,9 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, uint32_t eve
{
m_Workgroup.push_back(ThreadState(*this, m_GlobalState, maxSSAId, i, threadsInWorkgroup));
m_QueuedDeviceThreadSteps[i] = false;
m_QueuedGpuMathOps[i] = false;
m_QueuedGpuSampleGatherOps[i] = false;
m_PendingLanes[i] = false;
}
// Get the thread state from the API wrapper
@@ -10130,6 +10208,10 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug()
do
{
ProcessQueuedDeviceThreadSteps();
// Convert the simulation threads queued operations into pending operations i.e. GPU commands
ProcessQueuedOps();
// Sync any pending GPU operations and set the results to the pending threads
SyncPendingLanes();
allStepsCompleted = true;
for(const Tangle &tangle : tangles)
@@ -10305,28 +10387,6 @@ const SRVData &Debugger::GetSRVData(const BindingSlot &slot) const
return m_ApiWrapper->GetSRVData(slot);
}
// Must be called from the replay manager thread (the debugger thread)
bool Debugger::CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input,
ShaderVariable &output) const
{
CHECK_DEBUGGER_THREAD();
return m_ApiWrapper->CalculateMathIntrinsic(dxOp, input, output);
}
// Must be called from the replay manager thread (the debugger thread)
bool Debugger::CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex,
float lodValue, float compareValue, GatherChannel gatherChannel,
uint32_t instructionIdx, ShaderVariable &output) const
{
CHECK_DEBUGGER_THREAD();
return m_ApiWrapper->CalculateSampleGather(dxOp, resourceData, samplerData, uv, ddxCalc, ddyCalc,
texelOffsets, multisampleIndex, lodValue, compareValue,
gatherChannel, instructionIdx, output);
}
// Called from any thread
DeviceOpResult Debugger::GetResourceInfo(DXIL::ResourceClass resClass,
const DXDebug::BindingSlot &slot, uint32_t mipLevel,
@@ -10406,7 +10466,11 @@ void Debugger::StepThread(uint32_t lane, StepThreadMode stepMode)
InternalStepThread(lane);
thread.ClearPendingDebugState();
}
if(thread.StepNeedsDeviceThread())
if(thread.StepNeedsGpuSampleGatherOp())
break;
else if(thread.StepNeedsGpuMathOp())
break;
else if(thread.StepNeedsDeviceThread())
break;
if(isActiveThread)
@@ -10436,6 +10500,18 @@ void Debugger::StepThread(uint32_t lane, StepThreadMode stepMode)
DXIL_DEBUG_RDCASSERT(thread.IsSimulationStepActive());
// The queueing has to be when the thread is not being simulated
if(thread.StepNeedsGpuSampleGatherOp())
{
DXIL_DEBUG_RDCASSERT(!simulateStep);
QueueGpuSampleGatherOp(lane);
return;
}
if(thread.StepNeedsGpuMathOp())
{
DXIL_DEBUG_RDCASSERT(!simulateStep);
QueueGpuMathOp(lane);
return;
}
if(thread.StepNeedsDeviceThread())
{
DXIL_DEBUG_RDCASSERT(!simulateStep);
@@ -10467,8 +10543,13 @@ void Debugger::InternalStepThread(uint32_t lane)
thread.ClearPendingDebugState();
}
thread.StepNext(true, m_Workgroup);
if(thread.StepNeedsGpuSampleGatherOp())
return;
if(thread.StepNeedsGpuMathOp())
return;
if(thread.StepNeedsDeviceThread())
return;
m_ActiveDebugState.nextInstruction = thread.GetActiveGlobalInstructionIdx();
thread.FillCallstack(m_ActiveDebugState);
@@ -10490,6 +10571,10 @@ void Debugger::InternalStepThread(uint32_t lane)
else
{
thread.StepNext(false, m_Workgroup);
if(thread.StepNeedsGpuSampleGatherOp())
return;
if(thread.StepNeedsGpuMathOp())
return;
if(thread.StepNeedsDeviceThread())
return;
}
@@ -10531,4 +10616,108 @@ void Debugger::ProcessQueuedDeviceThreadSteps()
}
}
}
// Can be called from any thread
void Debugger::QueueGpuMathOp(uint32_t lane)
{
  // The lane must be mid-step (it just requested a GPU math op) and must not
  // already have an op queued for the debugger thread to pick up.
  ThreadState &state = m_Workgroup[lane];
  DXIL_DEBUG_RDCASSERT(state.IsSimulationStepActive());
  DXIL_DEBUG_RDCASSERT(!m_QueuedGpuMathOps[lane]);
  m_QueuedGpuMathOps[lane] = true;
}
// Can be called from any thread
void Debugger::QueueGpuSampleGatherOp(uint32_t lane)
{
  // The lane must be mid-step (it just requested a sample/gather) and must not
  // already have an op queued for the debugger thread to pick up.
  ThreadState &state = m_Workgroup[lane];
  DXIL_DEBUG_RDCASSERT(state.IsSimulationStepActive());
  DXIL_DEBUG_RDCASSERT(!m_QueuedGpuSampleGatherOps[lane]);
  m_QueuedGpuSampleGatherOps[lane] = true;
}
// Must be called from the replay manager thread (the debugger thread)
// Drains every simulation thread's queued GPU request, issuing the actual
// GPU commands: math intrinsics first, then sample/gather operations.
void Debugger::ProcessQueuedOps()
{
  CHECK_DEBUGGER_THREAD();
  ProcessQueuedGpuMathOps();
  ProcessQueuedGpuSampleGatherOps();
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::SyncPendingLanes()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < m_PendingLanes.size(); ++lane)
{
if(m_PendingLanes[lane])
{
m_PendingLanes[lane] = false;
ThreadState &thread = m_Workgroup[lane];
thread.SetPendingResultReady();
QueueJob(lane);
}
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedGpuMathOps()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < m_QueuedGpuMathOps.size(); ++lane)
{
if(m_QueuedGpuMathOps[lane])
{
m_QueuedGpuMathOps[lane] = false;
const GpuMathOperation &mathOp = m_Workgroup[lane].GetQueuedGpuMathOp();
uint32_t workgroupIndex = mathOp.workgroupIndex;
if(m_ApiWrapper->CalculateMathIntrinsic(mathOp.dxOp, mathOp.input, *mathOp.result))
{
m_PendingGpuMathsOpsResults.push_back(mathOp.result);
}
else
{
ShaderVariable &result = *mathOp.result;
memset(&result.value, 0, sizeof(result.value));
}
DXIL_DEBUG_RDCASSERT(!m_PendingLanes[workgroupIndex]);
m_PendingLanes[workgroupIndex] = true;
}
}
}
// Must be called from the replay manager thread (the debugger thread)
void Debugger::ProcessQueuedGpuSampleGatherOps()
{
CHECK_DEBUGGER_THREAD();
for(uint32_t lane = 0; lane < m_QueuedGpuSampleGatherOps.size(); ++lane)
{
if(m_QueuedGpuSampleGatherOps[lane])
{
m_QueuedGpuSampleGatherOps[lane] = false;
const GpuSampleGatherOperation &sampleGatherOp = m_Workgroup[lane].GetQueuedGpuSampleGatherOp();
uint32_t workgroupIndex = sampleGatherOp.workgroupIndex;
ShaderVariable &result = *sampleGatherOp.result;
bool hasResult = false;
if(!m_ApiWrapper->CalculateSampleGather(
sampleGatherOp.dxOp, sampleGatherOp.resourceData, sampleGatherOp.samplerData,
sampleGatherOp.uv, sampleGatherOp.ddxCalc, sampleGatherOp.ddyCalc,
sampleGatherOp.texelOffsets, sampleGatherOp.multisampleIndex, sampleGatherOp.lodValue,
sampleGatherOp.compareValue, sampleGatherOp.gatherChannel,
sampleGatherOp.instructionIdx, *sampleGatherOp.result))
{
// sample failed. Pretend we got 0 columns back
set0001(result);
hasResult = true;
}
if(!hasResult)
m_PendingGpuSampleGatherOpsResults.push_back(sampleGatherOp.result);
DXIL_DEBUG_RDCASSERT(!m_PendingLanes[workgroupIndex]);
m_PendingLanes[workgroupIndex] = true;
}
}
}
}; // namespace DXILDebug
+112 -14
View File
@@ -364,6 +364,56 @@ struct MemoryTracking
std::map<Id, Pointer> m_Pointers;
};
// A GPU math intrinsic queued by a simulation thread, executed later on the
// debugger (replay manager) thread (see Debugger::ProcessQueuedGpuMathOps).
// Default member initializers keep a freshly-constructed instance in the same
// state as Clear(), so no member is ever read indeterminate.
struct GpuMathOperation
{
  void Clear()
  {
    workgroupIndex = 0;
    dxOp = DXIL::DXOp::NumOpCodes;
    input = ShaderVariable();
    result = NULL;
  }
  // Lane (workgroup index) that queued the operation
  uint32_t workgroupIndex = 0;
  // Math intrinsic to execute : NumOpCodes means "nothing queued"
  DXIL::DXOp dxOp = DXIL::DXOp::NumOpCodes;
  // Input operand for the intrinsic
  ShaderVariable input;
  // Destination : points at the queueing ThreadState's pending result storage
  ShaderVariable *result = NULL;
};
// A GPU sample/gather/LOD operation queued by a simulation thread, executed
// later on the debugger (replay manager) thread
// (see Debugger::ProcessQueuedGpuSampleGatherOps).
// Default member initializers keep a freshly-constructed instance in the same
// state as Clear(), so no member is ever read indeterminate.
struct GpuSampleGatherOperation
{
  void Clear()
  {
    workgroupIndex = 0;
    dxOp = DXIL::DXOp::NumOpCodes;
    resourceData = SampleGatherResourceData();
    samplerData = SampleGatherSamplerData();
    uv = ddxCalc = ddyCalc = ShaderVariable();
    texelOffsets[0] = 0;
    texelOffsets[1] = 0;
    texelOffsets[2] = 0;
    multisampleIndex = ~0U;
    lodValue = 0.0f;
    compareValue = 0.0f;
    gatherChannel = GatherChannel::Red;
    instructionIdx = ~0U;
    result = NULL;
  }
  // Lane (workgroup index) that queued the operation
  uint32_t workgroupIndex = 0;
  // Sample/gather/LOD opcode to execute : NumOpCodes means "nothing queued"
  DXIL::DXOp dxOp = DXIL::DXOp::NumOpCodes;
  SampleGatherResourceData resourceData;
  SampleGatherSamplerData samplerData;
  // Texture coordinates and derivatives used for LOD/filtering
  ShaderVariable uv;
  ShaderVariable ddxCalc;
  ShaderVariable ddyCalc;
  int8_t texelOffsets[3] = {0, 0, 0};
  int multisampleIndex = ~0U;
  float lodValue = 0.0f;
  float compareValue = 0.0f;
  GatherChannel gatherChannel = GatherChannel::Red;
  uint32_t instructionIdx = ~0U;
  // Destination : points at the queueing ThreadState's pending result storage
  ShaderVariable *result = NULL;
};
struct ThreadState
{
ThreadState(Debugger &debugger, const GlobalState &globalState, uint32_t maxSSAId,
@@ -393,7 +443,24 @@ struct ThreadState
return &m_PartialConvergencePoints;
}
const ShaderDebugState &GetPendingDebugState() const { return m_PendingDebugState; }
const GpuMathOperation &GetQueuedGpuMathOp() const
{
DXIL_DEBUG_RDCASSERT(AtomicLoad(&atomic_stepNeedsGpuMathOp));
DXIL_DEBUG_RDCASSERT(IsPendingResultPending());
return m_QueuedGpuMathOp;
}
const GpuSampleGatherOperation &GetQueuedGpuSampleGatherOp() const
{
DXIL_DEBUG_RDCASSERT(AtomicLoad(&atomic_stepNeedsGpuSampleGatherOp));
DXIL_DEBUG_RDCASSERT(IsPendingResultPending());
return m_QueuedGpuSampleGatherOp;
}
bool StepNeedsDeviceThread() const { return (AtomicLoad(&atomic_stepNeedsDeviceThread) == 1); }
bool StepNeedsGpuSampleGatherOp() const
{
return (AtomicLoad(&atomic_stepNeedsGpuSampleGatherOp) == 1);
}
bool StepNeedsGpuMathOp() const { return (AtomicLoad(&atomic_stepNeedsGpuMathOp) == 1); }
void SetBuiltins(const BuiltinInputs &builtins) { m_Builtins = builtins; }
void SetInput(const ShaderVariable &input) { m_Input = input; }
@@ -422,9 +489,16 @@ struct ThreadState
void SetStepQueued()
{
AtomicStore(&atomic_isSimulationStepActive, 1);
AtomicStore(&atomic_stepNeedsGpuSampleGatherOp, 0);
AtomicStore(&atomic_stepNeedsGpuMathOp, 0);
AtomicStore(&atomic_stepNeedsDeviceThread, 0);
}
void SetPendingResultUnknown() { SetPendingResultStatus(PendingResultStatus::Unknown); }
void SetPendingResultReady()
{
DXIL_DEBUG_RDCASSERTEQUAL(GetPendingResultStatus(), PendingResultStatus::Pending);
SetPendingResultStatus(PendingResultStatus::Ready);
}
void InitialiseFromActive(const ThreadState &active)
{
@@ -470,16 +544,21 @@ private:
{
return GetPendingResultStatus() == PendingResultStatus::Ready;
}
void SetPendingResultReady()
{
DXIL_DEBUG_RDCASSERTEQUAL(GetPendingResultStatus(), PendingResultStatus::Pending);
SetPendingResultStatus(PendingResultStatus::Ready);
}
const ShaderVariable &GetPendingResult() const
{
DXIL_DEBUG_RDCASSERTEQUAL(GetPendingResultStatus(), PendingResultStatus::Ready);
return m_PendingResultData;
}
void SetStepNeedsGpuSampleGatherOp()
{
AtomicStore(&atomic_stepNeedsGpuSampleGatherOp, 1);
SetPendingResultStatus(PendingResultStatus::Pending);
}
void SetStepNeedsGpuMathOp()
{
AtomicStore(&atomic_stepNeedsGpuMathOp, 1);
SetPendingResultStatus(PendingResultStatus::Pending);
}
void SetStepNeedsDeviceThread()
{
AtomicStore(&atomic_stepNeedsDeviceThread, 1);
@@ -530,9 +609,11 @@ private:
const MemoryTracking::Allocation &allocation,
const ShaderVariable &val);
void PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, DXIL::Operation opCode,
bool PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, DXIL::Operation opCode,
DXIL::DXOp dxOpCode, const ResourceReferenceInfo &resRef,
const DXIL::Instruction &inst, ShaderVariable &result);
void ConvertSampleGatherReturn(DXIL::DXOp dxOpCode, const DXIL::Instruction &inst,
const ShaderVariable &data, ShaderVariable &result) const;
void Sub(const ShaderVariable &a, const ShaderVariable &b, ShaderValue &ret) const;
ShaderValue DDX(bool fine, DXIL::Operation opCode, DXIL::DXOp dxOpCode,
@@ -559,6 +640,14 @@ private:
uint32_t GetSubgroupActiveLanes(const rdcarray<ThreadState> &workgroup,
rdcarray<uint32_t> &activeLanes) const;
void QueueMathOp(DXIL::DXOp dxOp, const ShaderVariable &input, ShaderVariable &result);
void QueueSampleGather(DXIL::DXOp dxOp, const SampleGatherResourceData &resourceData,
const SampleGatherSamplerData &samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, GatherChannel gatherChannel, uint32_t instructionIdx,
ShaderVariable &result);
struct AnnotationProperties
{
DXIL::ResourceKind resKind;
@@ -602,6 +691,8 @@ private:
ShaderDebugState m_PendingDebugState;
ShaderVariable m_PendingResultData;
GpuMathOperation m_QueuedGpuMathOp;
GpuSampleGatherOperation m_QueuedGpuSampleGatherOp;
// Track memory allocations
// For stack allocations do not bother freeing when leaving functions
@@ -645,6 +736,8 @@ private:
// These need to be accessed using atomics
int32_t atomic_pendingResultStatus = (int32_t)PendingResultStatus::Unknown;
int32_t atomic_stepNeedsGpuSampleGatherOp = 0;
int32_t atomic_stepNeedsGpuMathOp = 0;
int32_t atomic_stepNeedsDeviceThread = 0;
int32_t atomic_isSimulationStepActive = 0;
};
@@ -797,14 +890,6 @@ public:
const UAVData &GetUAVData(const BindingSlot &slot) const;
const SRVData &GetSRVData(const BindingSlot &slot) const;
bool CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input,
ShaderVariable &output) const;
bool CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, GatherChannel gatherChannel,
uint32_t instructionIdx, ShaderVariable &output) const;
DeviceOpResult GetResourceInfo(DXIL::ResourceClass resClass, const DXDebug::BindingSlot &slot,
uint32_t mipLevel, ShaderVariable &result) const;
DeviceOpResult GetSampleInfo(DXIL::ResourceClass resClass, const DXDebug::BindingSlot &slot,
@@ -834,7 +919,15 @@ private:
void InternalStepThread(uint32_t lane);
void SimulationJobHelper();
void QueueDeviceThreadStep(uint32_t lane);
void ProcessQueuedDeviceThreadSteps();
void ProcessQueuedOps();
void ProcessQueuedGpuMathOps();
void ProcessQueuedGpuSampleGatherOps();
void SyncPendingLanes();
void QueueGpuMathOp(uint32_t lane);
void QueueGpuSampleGatherOp(uint32_t lane);
DebugAPIWrapper *m_ApiWrapper = NULL;
@@ -846,6 +939,11 @@ private:
ShaderDebugState m_ActiveDebugState;
rdcarray<bool> m_QueuedDeviceThreadSteps;
rdcarray<bool> m_QueuedGpuMathOps;
rdcarray<bool> m_QueuedGpuSampleGatherOps;
rdcarray<bool> m_PendingLanes;
rdcarray<ShaderVariable *> m_PendingGpuMathsOpsResults;
rdcarray<ShaderVariable *> m_PendingGpuSampleGatherOpsResults;
// the live mutable global variables, to initialise a stack frame's live list
rdcarray<bool> m_LiveGlobals;
@@ -916,3 +916,15 @@ rdcstr DoStringise(const DXILDebug::DeviceOpResult &el)
}
END_ENUM_STRINGISE();
};
// Stringises ThreadState's pending-GPU-result state (Unknown / Pending /
// Ready) for logging and assertion messages.
template <>
rdcstr DoStringise(const DXILDebug::ThreadState::PendingResultStatus &el)
{
  BEGIN_ENUM_STRINGISE(DXILDebug::ThreadState::PendingResultStatus)
  {
    STRINGISE_ENUM_CLASS(Unknown)
    STRINGISE_ENUM_CLASS(Pending)
    STRINGISE_ENUM_CLASS(Ready)
  }
  END_ENUM_STRINGISE();
};