DXIL debugger support for GPU batching of Math and SampleGather ops

This commit is contained in:
Jake Turner
2025-10-05 12:40:18 +01:00
parent 6bb8070fd2
commit cb83fac970
8 changed files with 365 additions and 142 deletions
+6
View File
@@ -603,6 +603,12 @@ D3D12DebugManager::~D3D12DebugManager()
bool D3D12DebugManager::CreateShaderDebugResources()
{
// Each queued result is sized in Vec4f units: a MathOp needs 2, a SampleGatherOp needs 6
const uint64_t resultMaxElementSize = sizeof(Vec4f) * (2 + 6);
const uint32_t maxQueuedResults = D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS;
const uint64_t shaderDebugReadbackSize = resultMaxElementSize * maxQueuedResults;
RDCCOMPILE_ASSERT(shaderDebugReadbackSize < m_ReadbackSize, "Readback buffer is not big enough");
rdcstr hlsl = GetEmbeddedResource(shaderdebug_hlsl);
D3D12RootSignature rootSig;
+5
View File
@@ -152,6 +152,11 @@ public:
D3D12DebugManager(WrappedID3D12Device *wrapper);
~D3D12DebugManager();
// Upper bound on the number of shader debug GPU operations (math ops plus sample/gather ops)
// that may be queued before their results have to be read back. The shader debug readback size
// check in CreateShaderDebugResources() is computed from this value.
enum
{
MAX_SHADER_DEBUG_QUEUED_OPS = 128
};
void GetBufferData(ID3D12Resource *buff, uint64_t offset, uint64_t length, bytebuf &retData);
ID3D12Resource *MakeCBuffer(UINT64 size);
+104 -15
View File
@@ -570,7 +570,14 @@ D3D12APIWrapper::D3D12APIWrapper(WrappedID3D12Device *device, const DXIL::Progra
m_EventId(eventId),
m_Program(dxilProgram),
m_Reflection(refl),
m_DeviceThreadID(Threading::GetCurrentID())
m_DeviceThreadID(Threading::GetCurrentID()),
m_QueuedOpCmdList(NULL),
m_QueuedMathOpIndex(0),
m_QueuedSampleGatherOpIndex(0),
m_MathOpResultOffset(0),
m_MaxQueuedOps(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS),
m_SampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS *
m_MathOpResultByteSize)
{
// Create the storage layout for the constant buffers
// The constant buffer data and details are filled in outside of this method
@@ -1574,11 +1581,26 @@ UAVInfo D3D12APIWrapper::GetUAV(const BindingSlot &slot)
}
// Must be called from the replay manager thread (the debugger thread)
bool D3D12APIWrapper::CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input,
ShaderVariable &output)
bool D3D12APIWrapper::QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input)
{
CHECK_DEVICE_THREAD();
D3D12MarkerRegion region(m_Device->GetQueue()->GetReal(), "CalculateMathIntrinsic");
ID3D12GraphicsCommandListX *cmdList = m_QueuedOpCmdList;
if(!cmdList)
{
if(StartQueuedOps())
cmdList = m_QueuedOpCmdList;
}
if(!cmdList)
return false;
if(!QueuedOpsHasSpace())
{
m_Device->AddDebugMessage(MessageCategory::Execution, MessageSeverity::High,
MessageSource::RuntimeWarning, "Too many GPU queued operations");
return false;
}
D3D12MarkerRegion region(m_Device->GetQueue()->GetReal(), "QueueMathIntrinsic");
int mathOp;
switch(dxOp)
@@ -1604,18 +1626,36 @@ bool D3D12APIWrapper::CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariab
return false;
}
ShaderVariable ignored;
return D3D12ShaderDebug::CalculateMathIntrinsic(true, m_Device, mathOp, input, output, ignored);
return D3D12ShaderDebug::QueueMathIntrinsic(false, m_Device, cmdList, mathOp, input,
m_QueuedMathOpIndex++);
}
// Must be called from the replay manager thread (the debugger thread)
bool D3D12APIWrapper::CalculateSampleGather(
DXIL::DXOp dxOp, SampleGatherResourceData resourceData, SampleGatherSamplerData samplerData,
const ShaderVariable &uv, const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue, float compareValue,
GatherChannel gatherChannel, uint32_t instructionIdx, ShaderVariable &output)
bool D3D12APIWrapper::QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData,
const ShaderVariable &uv, const ShaderVariable &ddxCalc,
const ShaderVariable &ddyCalc, const int8_t texelOffsets[3],
int multisampleIndex, float lodValue, float compareValue,
GatherChannel gatherChannel, uint32_t instructionIdx,
int &sampleRetType)
{
CHECK_DEVICE_THREAD();
ID3D12GraphicsCommandListX *cmdList = m_QueuedOpCmdList;
if(!cmdList)
{
if(StartQueuedOps())
cmdList = m_QueuedOpCmdList;
}
if(!cmdList)
return false;
if(!QueuedOpsHasSpace())
{
m_Device->AddDebugMessage(MessageCategory::Execution, MessageSeverity::High,
MessageSource::RuntimeWarning, "Too many GPU queued operations");
return false;
}
int sampleOp;
switch(dxOp)
{
@@ -1642,10 +1682,59 @@ bool D3D12APIWrapper::CalculateSampleGather(
const char *opString = ToStr(dxOp).c_str();
uint8_t swizzle[4] = {0, 1, 2, 3};
return D3D12ShaderDebug::CalculateSampleGather(
true, m_Device, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc, texelOffsets,
multisampleIndex, lodValue, compareValue, swizzle, gatherChannel, m_ShaderType,
instructionIdx, opString, output);
return D3D12ShaderDebug::QueueSampleGather(
true, m_Device, m_QueuedOpCmdList, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc,
texelOffsets, multisampleIndex, lodValue, compareValue, swizzle, gatherChannel, m_ShaderType,
instructionIdx, opString, m_QueuedSampleGatherOpIndex++, sampleRetType);
}
// Must be called from the replay manager thread (the debugger thread)
bool D3D12APIWrapper::StartQueuedOps()
{
  CHECK_DEVICE_THREAD();

  // A batch may only be opened from a clean slate: no counters carried over from a previous
  // batch and no command list still being recorded into.
  RDCASSERTEQUAL(m_QueuedMathOpIndex, 0);
  RDCASSERTEQUAL(m_QueuedSampleGatherOpIndex, 0);
  RDCASSERTEQUAL(m_QueuedOpCmdList, NULL);
  RDCASSERTEQUAL(m_MathOpResultOffset, 0);

  // refuse to begin a second batch while one is already open
  if(m_QueuedOpCmdList != NULL)
    return false;

  // fetch a freshly reset debug command list to record the queued operations into; a NULL
  // list is reported as failure to the caller
  m_QueuedOpCmdList = m_Device->GetDebugManager()->ResetDebugList();
  return m_QueuedOpCmdList != NULL;
}
// Must be called from the replay manager thread (the debugger thread)
// Flushes the current batch of queued GPU operations and writes the results through the
// supplied pointers.
//
// mathOpResults       - one destination per queued math op (this wrapper uses a single result
//                       per GPU op)
// sampleGatherResults - one destination per queued sample/gather op
// sampleRetTypes      - the return type reported by QueueSampleGather for each sample/gather
//                       op, parallel to sampleGatherResults
//
// Returns false if no batch was open or if the shared helper reported a failure. The queue
// state is reset in every case so that a new batch can be started afterwards.
bool D3D12APIWrapper::GetQueuedResults(rdcarray<ShaderVariable *> &mathOpResults,
                                       rdcarray<ShaderVariable *> &sampleGatherResults,
                                       const rdcarray<int> &sampleRetTypes)
{
  CHECK_DEVICE_THREAD();

  bool ret = false;

  if(m_QueuedOpCmdList != NULL)
  {
    const uint32_t countMathResultsPerGpuOp = 1;

    // every sample/gather result uses the identity swizzle, the helper expects one swizzle
    // pointer per result
    const uint8_t swizzle[4] = {0, 1, 2, 3};
    rdcarray<const uint8_t *> swizzles;
    swizzles.reserve(sampleGatherResults.size());
    for(size_t i = 0; i < sampleGatherResults.size(); ++i)
      swizzles.push_back(swizzle);

    ret = D3D12ShaderDebug::GetQueuedResults(m_Device, m_QueuedOpCmdList, mathOpResults,
                                             countMathResultsPerGpuOp, sampleGatherResults,
                                             sampleRetTypes, swizzles);
  }
  else
  {
    // the helper dereferences the command list unconditionally, so never hand it a NULL list
    RDCERR("GetQueuedResults called without an open queued command list");
  }

  // the batch is finished whether or not it succeeded
  m_QueuedOpCmdList = NULL;
  m_QueuedMathOpIndex = 0;
  m_QueuedSampleGatherOpIndex = 0;
  m_MathOpResultOffset = 0;

  return ret;
}
// Must be called from the replay manager thread (the debugger thread)
bool D3D12APIWrapper::QueuedOpsHasSpace() const
{
return (m_QueuedMathOpIndex + m_QueuedSampleGatherOpIndex) < m_MaxQueuedOps;
}
// Called from any thread
+22 -8
View File
@@ -54,14 +54,17 @@ public:
UAVInfo GetUAV(const BindingSlot &slot) override;
SRVInfo GetSRV(const BindingSlot &slot) override;
bool CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input,
ShaderVariable &output) override;
bool CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, GatherChannel gatherChannel,
uint32_t instructionIdx, ShaderVariable &output) override;
bool QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input) override;
bool QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, GatherChannel gatherChannel, uint32_t instructionIdx,
int &sampleRetType) override;
bool GetQueuedResults(rdcarray<ShaderVariable *> &mathOpResults,
rdcarray<ShaderVariable *> &sampleGatherResults,
const rdcarray<int> &sampleRetTypes) override;
bool QueuedOpsHasSpace() const override;
ShaderVariable GetResourceInfo(DXIL::ResourceClass resClass, const DXDebug::BindingSlot &slot,
uint32_t mipLevel) override;
@@ -144,6 +147,7 @@ private:
const char *opString);
ResourceReferenceInfo FetchResourceReferenceInfo(const DXDebug::BindingSlot &slot);
ShaderDirectAccess FetchShaderDirectAccess(DescriptorType type, const DXDebug::BindingSlot &slot);
bool StartQueuedOps();
BuiltinInputs m_Builtins;
rdcarray<DXILDebug::ThreadProperties> m_WorkgroupProperties;
@@ -195,6 +199,16 @@ private:
const ShaderReflection &m_Reflection;
WrappedID3D12Device *m_Device = NULL;
// State for batching GPU operations.
// Command list the queued operations are recorded into; NULL while no batch is open.
ID3D12GraphicsCommandListX *m_QueuedOpCmdList = NULL;
// Number of math ops queued in the current batch; also the queue index given to the next one.
uint32_t m_QueuedMathOpIndex = 0;
// Number of sample/gather ops queued in the current batch; also the next queue index.
uint32_t m_QueuedSampleGatherOpIndex = 0;
// Reset to 0 whenever results are fetched.
// NOTE(review): no other use is visible in this change - confirm it is still needed.
uint64_t m_MathOpResultOffset = 0;
// Maximum combined number of queued ops; the constructor initialises this from
// D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS, the 0 here is only the in-class default.
const uint32_t m_MaxQueuedOps = 0;
const uint64_t m_MathOpResultByteSize = sizeof(Vec4f) * 2;
// NOTE(review): the shared shader debug helpers copy sizeof(Vec4f) * 6 bytes per sample/gather
// op, this member is a single Vec4f - confirm which size is intended.
const uint64_t m_SampleGatherOpResultByteSize = sizeof(Vec4f);
// Set by the constructor to MAX_SHADER_DEBUG_QUEUED_OPS * m_MathOpResultByteSize.
const uint64_t m_SampleGatherOpResultsStart;
const DXIL::Program *m_Program = NULL;
const DXIL::EntryPointInterface *m_EntryPointInterface = NULL;
const DXBC::ShaderType m_ShaderType;
+169 -97
View File
@@ -42,6 +42,9 @@
using namespace DXBCBytecode;
// Number of bytes copied into the readback buffer for each queued math op (two Vec4f)
const uint64_t s_MathOpResultByteSize = sizeof(Vec4f) * 2;
// Number of bytes copied into the readback buffer for each queued sample/gather op (six Vec4f)
const uint64_t s_SampleGatherOpResultByteSize = sizeof(Vec4f) * 6;
static bool IsShaderParameterVisible(DXBC::ShaderType shaderType,
D3D12_SHADER_VISIBILITY shaderVisibility)
{
@@ -82,11 +85,11 @@ static D3D12_DESCRIPTOR_RANGE_TYPE ConvertOperandTypeToDescriptorType(DXBCByteco
}
// Helpers used by DXBC and DXIL debuggers to interact with GPU and resources
bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *device, int mathOp,
const ShaderVariable &input, ShaderVariable &output1,
ShaderVariable &output2)
bool D3D12ShaderDebug::QueueMathIntrinsic(bool dxil, WrappedID3D12Device *device,
ID3D12GraphicsCommandListX *cmdList, int mathOp,
const ShaderVariable &input, const uint32_t queueIndex)
{
D3D12MarkerRegion region(device->GetQueue()->GetReal(), "CalculateMathIntrinsic");
D3D12MarkerRegion region(device->GetQueue()->GetReal(), "QueueMathIntrinsic");
ID3D12Resource *pResultBuffer = device->GetDebugManager()->GetShaderDebugResultBuffer();
ID3D12Resource *pReadbackBuffer = device->GetDebugManager()->GetReadbackBuffer();
@@ -96,7 +99,6 @@ bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *de
cbufferData.mathOp = mathOp;
// Set root signature & sig params on command list, then execute the shader
ID3D12GraphicsCommandListX *cmdList = device->GetDebugManager()->ResetDebugList();
device->GetDebugManager()->SetDescriptorHeaps(cmdList, true, false);
cmdList->SetPipelineState(dxil ? device->GetDebugManager()->GetDXILMathIntrinsicsPso()
: device->GetDebugManager()->GetMathIntrinsicsPso());
@@ -113,52 +115,25 @@ bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *de
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
cmdList->ResourceBarrier(1, &barrier);
cmdList->CopyBufferRegion(pReadbackBuffer, 0, pResultBuffer, 0, sizeof(Vec4f) * 6);
uint64_t destOffset = queueIndex * s_MathOpResultByteSize;
cmdList->CopyBufferRegion(pReadbackBuffer, destOffset, pResultBuffer, 0, s_MathOpResultByteSize);
HRESULT hr = cmdList->Close();
if(FAILED(hr))
{
RDCERR("Failed to close command list HRESULT: %s", ToStr(hr).c_str());
return false;
}
{
ID3D12CommandList *l = cmdList;
device->GetQueue()->ExecuteCommandLists(1, &l);
device->InternalQueueWaitForIdle();
device->GetDebugManager()->ResetDebugAlloc();
}
D3D12_RANGE range = {0, sizeof(Vec4f) * 6};
byte *results = NULL;
hr = pReadbackBuffer->Map(0, &range, (void **)&results);
if(FAILED(hr))
{
pReadbackBuffer->Unmap(0, &range);
RDCERR("Failed to map readback buffer HRESULT: %s", ToStr(hr).c_str());
return false;
}
memcpy(output1.value.u32v.data(), results, sizeof(Vec4f));
memcpy(output2.value.u32v.data(), results + sizeof(Vec4f), sizeof(Vec4f));
range.End = 0;
pReadbackBuffer->Unmap(0, &range);
barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
cmdList->ResourceBarrier(1, &barrier);
return true;
}
bool D3D12ShaderDebug::CalculateSampleGather(
bool dxil, WrappedID3D12Device *device, int sampleOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uvIn,
const ShaderVariable &ddxCalcIn, const ShaderVariable &ddyCalcIn, const int8_t texelOffsets[3],
int multisampleIndex, float lodValue, float compareValue, const uint8_t swizzle[4],
GatherChannel gatherChannel, const DXBC::ShaderType shaderType, uint32_t instruction,
const char *opString, ShaderVariable &output)
bool D3D12ShaderDebug::QueueSampleGather(
bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList, int sampleOp,
SampleGatherResourceData resourceData, SampleGatherSamplerData samplerData,
const ShaderVariable &uvIn, const ShaderVariable &ddxCalcIn, const ShaderVariable &ddyCalcIn,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue, float compareValue,
const uint8_t swizzle[4], GatherChannel gatherChannel, const DXBC::ShaderType shaderType,
uint32_t instruction, const char *opString, const uint32_t queueIndex, int &sampleRetType)
{
D3D12MarkerRegion region(device->GetQueue()->GetReal(), "CalculateSampleGather");
D3D12MarkerRegion region(device->GetQueue()->GetReal(), "QueueSampleGather");
ShaderVariable uv(uvIn);
ShaderVariable ddxCalc(ddxCalcIn);
@@ -269,6 +244,7 @@ bool D3D12ShaderDebug::CalculateSampleGather(
{
RDCERR("Unsupported return type %d in sample operation", resourceData.retType);
}
sampleRetType = cbufferData.debugSampleRetType;
cbufferData.debugSampleGatherChannel = (int)gatherChannel;
cbufferData.debugSampleSampleIndex = multisampleIndex;
@@ -276,6 +252,7 @@ bool D3D12ShaderDebug::CalculateSampleGather(
cbufferData.debugSampleLod = lodValue;
cbufferData.debugSampleCompare = compareValue;
// Store a copy of the event's render state to restore later
D3D12RenderState &rs = device->GetQueue()->GetCommandData()->m_RenderState;
D3D12RenderState prevState = rs;
@@ -283,7 +260,6 @@ bool D3D12ShaderDebug::CalculateSampleGather(
ID3D12PipelineState *pso = dxil ? device->GetDebugManager()->GetDXILTexSamplePso(texelOffsets)
: device->GetDebugManager()->GetTexSamplePso(texelOffsets);
ID3D12GraphicsCommandListX *cmdList = device->GetDebugManager()->ResetDebugList();
rs.pipe = GetResID(pso);
rs.rts.clear();
// Set viewport/scissor unconditionally - we need to set this all the time for sampling for a
@@ -359,7 +335,32 @@ bool D3D12ShaderDebug::CalculateSampleGather(
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
cmdList->ResourceBarrier(1, &barrier);
cmdList->CopyBufferRegion(pReadbackBuffer, 0, pResultBuffer, 0, sizeof(Vec4f) * 6);
const uint64_t sampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS *
s_MathOpResultByteSize);
uint64_t destOffset = sampleGatherOpResultsStart + queueIndex * s_SampleGatherOpResultByteSize;
cmdList->CopyBufferRegion(pReadbackBuffer, destOffset, pResultBuffer, 0,
s_SampleGatherOpResultByteSize);
barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
cmdList->ResourceBarrier(1, &barrier);
// Restore D3D12 state to what the event uses
rs = prevState;
return true;
}
bool D3D12ShaderDebug::GetQueuedResults(WrappedID3D12Device *device,
ID3D12GraphicsCommandListX *cmdList,
rdcarray<ShaderVariable *> &mathOpResults,
uint32_t countMathResultsPerGpuOp,
rdcarray<ShaderVariable *> &sampleGatherResults,
const rdcarray<int> &sampleRetTypes,
const rdcarray<const uint8_t *> &swizzles)
{
RDCASSERTEQUAL(sampleGatherResults.size(), sampleRetTypes.size());
RDCASSERTEQUAL(sampleGatherResults.size(), swizzles.size());
HRESULT hr = cmdList->Close();
if(FAILED(hr))
@@ -375,46 +376,76 @@ bool D3D12ShaderDebug::CalculateSampleGather(
device->GetDebugManager()->ResetDebugAlloc();
}
rs = prevState;
ID3D12Resource *pReadbackBuffer = device->GetDebugManager()->GetReadbackBuffer();
D3D12_RANGE range = {0, sizeof(Vec4f) * 6};
void *results = NULL;
hr = pReadbackBuffer->Map(0, &range, &results);
byte *gpuResults = NULL;
hr = pReadbackBuffer->Map(0, NULL, (void **)&gpuResults);
if(FAILED(hr))
{
pReadbackBuffer->Unmap(0, &range);
pReadbackBuffer->Unmap(0, NULL);
RDCERR("Failed to map readback buffer HRESULT: %s", ToStr(hr).c_str());
return false;
}
ShaderVariable lookupResult("tex", 0.0f, 0.0f, 0.0f, 0.0f);
uintptr_t bufferEnd = (uintptr_t)(gpuResults + pReadbackBuffer->GetDesc().Width);
float *retFloats = (float *)results;
uint32_t *retUInts = (uint32_t *)(retFloats + 8);
int32_t *retSInts = (int32_t *)(retUInts + 8);
byte *gpuMathOpResults = gpuResults;
for(uint32_t i = 0; i < mathOpResults.size(); i += countMathResultsPerGpuOp)
{
const size_t countBytes = sizeof(Vec4f);
const size_t countBytesPerGpuOp = countBytes * countMathResultsPerGpuOp;
RDCASSERT((uintptr_t)gpuMathOpResults + countBytesPerGpuOp <= bufferEnd,
(uintptr_t)gpuMathOpResults, countBytesPerGpuOp, bufferEnd);
RDCASSERT(countBytesPerGpuOp <= s_MathOpResultByteSize, countBytesPerGpuOp,
s_MathOpResultByteSize);
if(cbufferData.debugSampleRetType == DEBUG_SAMPLE_UINT)
{
for(int i = 0; i < 4; i++)
lookupResult.value.u32v[i] = retUInts[swizzle[i]];
}
else if(cbufferData.debugSampleRetType == DEBUG_SAMPLE_INT)
{
for(int i = 0; i < 4; i++)
lookupResult.value.s32v[i] = retSInts[swizzle[i]];
}
else
{
for(int i = 0; i < 4; i++)
lookupResult.value.f32v[i] = retFloats[swizzle[i]];
for(uint32_t r = 0; r < countMathResultsPerGpuOp; r++)
{
ShaderVariable *result = mathOpResults[i + r];
memcpy(result->value.u32v.data(), gpuMathOpResults + r * countBytes, countBytes);
}
gpuMathOpResults += s_MathOpResultByteSize;
}
range.End = 0;
pReadbackBuffer->Unmap(0, &range);
const uint64_t sampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS *
s_MathOpResultByteSize);
byte *gpuSampleGatherOpResults = gpuResults + sampleGatherOpResultsStart;
for(uint32_t s = 0; s < sampleGatherResults.size(); ++s)
{
float *retFloats = (float *)gpuSampleGatherOpResults;
uint32_t *retUInts = (uint32_t *)(retFloats + 8);
int32_t *retSInts = (int32_t *)(retUInts + 8);
output = lookupResult;
size_t countBytes = 16;
RDCASSERT((uintptr_t)gpuSampleGatherOpResults + countBytes <= bufferEnd,
(uintptr_t)gpuSampleGatherOpResults, countBytes, bufferEnd);
RDCASSERT(countBytes <= s_SampleGatherOpResultByteSize, countBytes,
s_SampleGatherOpResultByteSize);
ShaderVariable &output = *sampleGatherResults[s];
int debugSampleRetType = sampleRetTypes[s];
const uint8_t *swizzle = swizzles[s];
if(debugSampleRetType == DEBUG_SAMPLE_UINT)
{
for(int i = 0; i < 4; i++)
output.value.u32v[i] = retUInts[swizzle[i]];
}
else if(debugSampleRetType == DEBUG_SAMPLE_INT)
{
for(int i = 0; i < 4; i++)
output.value.s32v[i] = retSInts[swizzle[i]];
}
else
{
for(int i = 0; i < 4; i++)
output.value.f32v[i] = retFloats[swizzle[i]];
}
gpuSampleGatherOpResults += s_SampleGatherOpResultByteSize;
}
pReadbackBuffer->Unmap(0, NULL);
return true;
}
@@ -533,8 +564,8 @@ D3D12Descriptor D3D12ShaderDebug::FindDescriptor(WrappedID3D12Device *device,
srvDesc.Format = DXGI_FORMAT_UNKNOWN;
srvDesc.Buffer.FirstElement = 0;
// we don't know the real length or structure stride from a root descriptor, so set
// defaults. This behaviour seems undefined in drivers, so returning 1 as the number of
// elements is as sensible as anything else
// defaults. This behaviour seems undefined in drivers, so returning 1 as the number
// of elements is as sensible as anything else
srvDesc.Buffer.NumElements = 1;
srvDesc.Buffer.StructureByteStride = 4;
srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE;
@@ -555,8 +586,8 @@ D3D12Descriptor D3D12ShaderDebug::FindDescriptor(WrappedID3D12Device *device,
uavDesc.Format = DXGI_FORMAT_UNKNOWN;
uavDesc.Buffer.FirstElement = 0;
// we don't know the real length or structure stride from a root descriptor, so set
// defaults. This behaviour seems undefined in drivers, so returning 1 as the number of
// elements is as sensible as anything else
// defaults. This behaviour seems undefined in drivers, so returning 1 as the number
// of elements is as sensible as anything else
uavDesc.Buffer.NumElements = 1;
uavDesc.Buffer.StructureByteStride = 4;
uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
@@ -1012,6 +1043,7 @@ public:
private:
DXBC::ShaderType GetShaderType() { return m_dxbc ? m_dxbc->m_Type : DXBC::ShaderType::Pixel; }
WrappedID3D12Device *m_pDevice;
ID3D12GraphicsCommandListX *m_QueuedOpCmdList;
const DXBC::DXBCContainer *m_dxbc;
DXBCDebug::GlobalState &m_globalState;
uint32_t m_instruction;
@@ -1024,13 +1056,13 @@ D3D12DebugAPIWrapper::D3D12DebugAPIWrapper(WrappedID3D12Device *device,
DXBCDebug::GlobalState &globalState, uint32_t eid)
: m_pDevice(device), m_dxbc(dxbc), m_globalState(globalState), m_instruction(0), m_EventID(eid)
{
m_QueuedOpCmdList = NULL;
}
D3D12DebugAPIWrapper::~D3D12DebugAPIWrapper()
{
// if we replayed to before the action for fetching some UAVs, replay back to after the action to
// keep
// the state consistent.
// if we replayed to before the action for fetching some UAVs, replay back to after the action
// to keep the state consistent.
if(m_DidReplay)
{
D3D12MarkerRegion region(m_pDevice->GetQueue()->GetReal(), "ResetReplay");
@@ -1092,9 +1124,9 @@ void D3D12DebugAPIWrapper::FetchSRV(const DXBCDebug::BindingSlot &slot)
D3D12_RESOURCE_DESC resDesc = pResource->GetDesc();
// DXBC allows root buffers to have a stride of up to 16 bytes in the shader, which
// means encoding the byte offset into the first element here is wrong without knowing
// what the actual accessed stride is. Instead we only fetch the data from that offset
// onwards.
// means encoding the byte offset into the first element here is wrong without
// knowing what the actual accessed stride is. Instead we only fetch the data from
// that offset onwards.
// TODO: Root buffers can be 32-bit UINT/SINT/FLOAT. Using UINT for now, but the
// resource desc format or the DXBC reflection info might be more correct.
@@ -1266,9 +1298,9 @@ void D3D12DebugAPIWrapper::FetchUAV(const DXBCDebug::BindingSlot &slot)
D3D12_RESOURCE_DESC resDesc = pResource->GetDesc();
// DXBC allows root buffers to have a stride of up to 16 bytes in the shader, which
// means encoding the byte offset into the first element here is wrong without knowing
// what the actual accessed stride is. Instead we only fetch the data from that offset
// onwards.
// means encoding the byte offset into the first element here is wrong without
// knowing what the actual accessed stride is. Instead we only fetch the data from
// that offset onwards.
// TODO: Root buffers can be 32-bit UINT/SINT/FLOAT. Using UINT for now, but the
// resource desc format or the DXBC reflection info might be more correct.
@@ -1394,6 +1426,7 @@ void D3D12DebugAPIWrapper::FetchUAV(const DXBCDebug::BindingSlot &slot)
slot.registerSpace);
}
// Used by the DXBC Debugger
bool D3D12DebugAPIWrapper::CalculateMathIntrinsic(DXBCBytecode::OpcodeType opcode,
const ShaderVariable &input,
ShaderVariable &output1, ShaderVariable &output2)
@@ -1413,7 +1446,26 @@ bool D3D12DebugAPIWrapper::CalculateMathIntrinsic(DXBCBytecode::OpcodeType opcod
return false;
}
return D3D12ShaderDebug::CalculateMathIntrinsic(false, m_pDevice, mathOp, input, output1, output2);
RDCASSERT(!m_QueuedOpCmdList);
m_QueuedOpCmdList = m_pDevice->GetDebugManager()->ResetDebugList();
const uint32_t queueIndex = 0;
if(!D3D12ShaderDebug::QueueMathIntrinsic(false, m_pDevice, m_QueuedOpCmdList, mathOp, input,
queueIndex))
return false;
rdcarray<ShaderVariable *> mathOpResults;
mathOpResults.push_back(&output1);
mathOpResults.push_back(&output2);
rdcarray<ShaderVariable *> sampleGatherResults;
rdcarray<int> sampleRetTypes;
rdcarray<const uint8_t *> swizzles;
const uint32_t countMathResultsPerGpuOp = 2;
bool ret = D3D12ShaderDebug::GetQueuedResults(m_pDevice, m_QueuedOpCmdList, mathOpResults,
countMathResultsPerGpuOp, sampleGatherResults,
sampleRetTypes, swizzles);
m_QueuedOpCmdList = NULL;
return ret;
}
D3D12Descriptor D3D12DebugAPIWrapper::FindDescriptor(DXBCBytecode::OperandType type,
@@ -1493,6 +1545,7 @@ ShaderVariable D3D12DebugAPIWrapper::GetResourceInfo(DXBCBytecode::OperandType t
dim, false);
}
// Used by the DXBC Debugger
bool D3D12DebugAPIWrapper::CalculateSampleGather(
DXBCBytecode::OpcodeType opcode, DXDebug::SampleGatherResourceData resourceData,
DXDebug::SampleGatherSamplerData samplerData, const ShaderVariable &uv,
@@ -1525,10 +1578,29 @@ bool D3D12DebugAPIWrapper::CalculateSampleGather(
return false;
}
return D3D12ShaderDebug::CalculateSampleGather(
false, m_pDevice, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc, texelOffsets,
multisampleIndex, lodOrCompareValue, lodOrCompareValue, swizzle, gatherChannel,
GetShaderType(), m_instruction, opString, output);
RDCASSERT(!m_QueuedOpCmdList);
m_QueuedOpCmdList = m_pDevice->GetDebugManager()->ResetDebugList();
int sampleRetType = 0;
const uint32_t queueIndex = 0;
if(!D3D12ShaderDebug::QueueSampleGather(
false, m_pDevice, m_QueuedOpCmdList, sampleOp, resourceData, samplerData, uv, ddxCalc,
ddyCalc, texelOffsets, multisampleIndex, lodOrCompareValue, lodOrCompareValue, swizzle,
gatherChannel, GetShaderType(), m_instruction, opString, queueIndex, sampleRetType))
return false;
rdcarray<ShaderVariable *> mathOpResults;
rdcarray<ShaderVariable *> sampleGatherResults;
sampleGatherResults.push_back(&output);
rdcarray<int> sampleRetTypes;
sampleRetTypes.push_back(sampleRetType);
rdcarray<const uint8_t *> swizzles;
swizzles.push_back(swizzle);
bool ret = D3D12ShaderDebug::GetQueuedResults(m_pDevice, m_QueuedOpCmdList, mathOpResults, 0,
sampleGatherResults, sampleRetTypes, swizzles);
m_QueuedOpCmdList = NULL;
return ret;
}
void GatherConstantBuffers(WrappedID3D12Device *pDevice, const DXBCBytecode::Program &program,
@@ -2974,11 +3046,11 @@ ShaderDebugTrace *D3D12Replay::DebugPixel(uint32_t eventId, uint32_t x, uint32_t
return new ShaderDebugTrace;
}
// if we encounter multiple hits at our destination pixel co-ord (or any other) we check to see if
// a specific primitive was requested (via primitive parameter not being set to ~0U). If it was,
// debug that pixel, otherwise do a best-estimate of which fragment was the last to successfully
// depth test and debug that, just by checking if the depth test is ordered and picking the final
// fragment in the series
// if we encounter multiple hits at our destination pixel co-ord (or any other) we check to see
// if a specific primitive was requested (via primitive parameter not being set to ~0U). If it
// was, debug that pixel, otherwise do a best-estimate of which fragment was the last to
// successfully depth test and debug that, just by checking if the depth test is ordered and
// picking the final fragment in the series
// Get depth func and determine "winner" pixel
DXDebug::DebugHit *pWinnerHit = NULL;
+14 -11
View File
@@ -39,18 +39,21 @@ typedef DXDebug::GatherChannel GatherChannel;
typedef DXBCBytecode::SamplerMode SamplerMode;
// Helpers used by DXBC and DXIL debuggers to interact with GPU and resources
bool CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *device, int mathOp,
const ShaderVariable &input, ShaderVariable &output1,
ShaderVariable &output2);
bool QueueMathIntrinsic(bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList,
int mathOp, const ShaderVariable &input, const uint32_t queueIndex);
bool CalculateSampleGather(bool dxil, WrappedID3D12Device *device, int sampleOp,
SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, const uint8_t swizzle[4],
GatherChannel gatherChannel, const DXBC::ShaderType shaderType,
uint32_t instruction, const char *opString, ShaderVariable &output);
bool QueueSampleGather(bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList,
int sampleOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, const uint8_t swizzle[4], GatherChannel gatherChannel,
const DXBC::ShaderType shaderType, uint32_t instruction,
const char *opString, const uint32_t queueIndex, int &sampleRetType);
// Closes cmdList, maps the readback buffer and unpacks the queued results from it.
// mathOpResults receives countMathResultsPerGpuOp consecutive entries per queued math op.
// sampleGatherResults, sampleRetTypes and swizzles are parallel arrays with one entry per
// queued sample/gather op and must all be the same size.
// Returns false if closing the command list or mapping the readback buffer fails.
bool GetQueuedResults(WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList,
rdcarray<ShaderVariable *> &mathOpResults, uint32_t countMathResultsPerGpuOp,
rdcarray<ShaderVariable *> &sampleGatherResults,
const rdcarray<int> &sampleRetTypes, const rdcarray<const uint8_t *> &swizzles);
D3D12Descriptor FindDescriptor(WrappedID3D12Device *device,
const DXDebug::HeapDescriptorType heapType, uint32_t descriptorIndex);
+32 -3
View File
@@ -10482,6 +10482,7 @@ void Debugger::ProcessQueuedOps()
CHECK_DEBUGGER_THREAD();
ProcessQueuedGpuMathOps();
ProcessQueuedGpuSampleGatherOps();
SyncPendingGpuOps();
}
// Must be called from the replay manager thread (the debugger thread)
@@ -10508,11 +10509,14 @@ void Debugger::ProcessQueuedGpuMathOps()
{
if(m_QueuedGpuMathOps[lane])
{
if(!m_ApiWrapper->QueuedOpsHasSpace())
SyncPendingGpuOps();
m_QueuedGpuMathOps[lane] = false;
const GpuMathOperation &mathOp = m_Workgroup[lane].GetQueuedGpuMathOp();
uint32_t workgroupIndex = mathOp.workgroupIndex;
if(m_ApiWrapper->CalculateMathIntrinsic(mathOp.dxOp, mathOp.input, *mathOp.result))
if(m_ApiWrapper->QueueMathIntrinsic(mathOp.dxOp, mathOp.input))
{
m_PendingGpuMathsOpsResults.push_back(mathOp.result);
}
@@ -10536,25 +10540,32 @@ void Debugger::ProcessQueuedGpuSampleGatherOps()
{
if(m_QueuedGpuSampleGatherOps[lane])
{
if(!m_ApiWrapper->QueuedOpsHasSpace())
SyncPendingGpuOps();
m_QueuedGpuSampleGatherOps[lane] = false;
const GpuSampleGatherOperation &sampleGatherOp = m_Workgroup[lane].GetQueuedGpuSampleGatherOp();
uint32_t workgroupIndex = sampleGatherOp.workgroupIndex;
ShaderVariable &result = *sampleGatherOp.result;
bool hasResult = false;
if(!m_ApiWrapper->CalculateSampleGather(
int sampleRetType = 0;
if(!m_ApiWrapper->QueueSampleGather(
sampleGatherOp.dxOp, sampleGatherOp.resourceData, sampleGatherOp.samplerData,
sampleGatherOp.uv, sampleGatherOp.ddxCalc, sampleGatherOp.ddyCalc,
sampleGatherOp.texelOffsets, sampleGatherOp.multisampleIndex, sampleGatherOp.lodValue,
sampleGatherOp.compareValue, sampleGatherOp.gatherChannel,
sampleGatherOp.instructionIdx, *sampleGatherOp.result))
sampleGatherOp.instructionIdx, sampleRetType))
{
// sample failed. Pretend we got 0 columns back
set0001(result);
hasResult = true;
}
if(!hasResult)
{
m_PendingGpuSampleGatherOpsResults.push_back(sampleGatherOp.result);
m_PendingGpuSampleGatherOpsSampleRetTypes.push_back(sampleRetType);
}
DXIL_DEBUG_RDCASSERT(!m_PendingLanes[workgroupIndex]);
m_PendingLanes[workgroupIndex] = true;
@@ -10562,6 +10573,24 @@ void Debugger::ProcessQueuedGpuSampleGatherOps()
}
}
// Must be called from the replay manager thread (the debugger thread)
// Flushes every GPU operation queued so far and stores the results in the pending result
// variables. Does nothing when there are no pending results.
void Debugger::SyncPendingGpuOps()
{
  CHECK_DEBUGGER_THREAD();

  if(m_PendingGpuMathsOpsResults.empty() && m_PendingGpuSampleGatherOpsResults.empty())
    return;

  const bool fetched = m_ApiWrapper->GetQueuedResults(m_PendingGpuMathsOpsResults,
                                                      m_PendingGpuSampleGatherOpsResults,
                                                      m_PendingGpuSampleGatherOpsSampleRetTypes);
  if(!fetched)
    RDCERR("GetQueuedResults failed");

  // Drop the pending entries even when the fetch failed. The D3D12 wrapper in this change
  // resets its queue indices whether or not the fetch succeeds, so entries kept here would be
  // matched against the wrong result slots by the next batch.
  m_PendingGpuMathsOpsResults.clear();
  m_PendingGpuSampleGatherOpsResults.clear();
  m_PendingGpuSampleGatherOpsSampleRetTypes.clear();
}
void Debugger::SimulationJobHelper()
{
while(AtomicLoad(&atomic_simulationFinished) == 0)
+13 -8
View File
@@ -307,14 +307,17 @@ public:
virtual UAVInfo GetUAV(const BindingSlot &slot) = 0;
virtual SRVInfo GetSRV(const BindingSlot &slot) = 0;
virtual bool CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input,
ShaderVariable &output) = 0;
virtual bool CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex,
float lodValue, float compareValue, GatherChannel gatherChannel,
uint32_t instructionIdx, ShaderVariable &output) = 0;
virtual bool QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input) = 0;
virtual bool QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData,
SampleGatherSamplerData samplerData, const ShaderVariable &uv,
const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc,
const int8_t texelOffsets[3], int multisampleIndex, float lodValue,
float compareValue, GatherChannel gatherChannel,
uint32_t instructionIdx, int &sampleRetType) = 0;
// Fetches the results of the operations queued via QueueMathIntrinsic / QueueSampleGather and
// writes them through the given pointers. sampleRetTypes holds the value each QueueSampleGather
// call returned in sampleRetType, parallel to sampleGatherResults.
// Returns false on failure.
virtual bool GetQueuedResults(rdcarray<ShaderVariable *> &mathOpResults,
rdcarray<ShaderVariable *> &sampleGatherResults,
const rdcarray<int> &sampleRetTypes) = 0;
// Returns true if another operation can be queued. The debugger checks this before queueing
// each operation and syncs the pending operations first when it returns false.
virtual bool QueuedOpsHasSpace() const = 0;
virtual ShaderVariable GetResourceInfo(DXIL::ResourceClass resClass,
const DXDebug::BindingSlot &slot, uint32_t mipLevel) = 0;
virtual ShaderVariable GetSampleInfo(DXIL::ResourceClass resClass,
@@ -948,6 +951,7 @@ private:
void ProcessQueuedOps();
void ProcessQueuedGpuMathOps();
void ProcessQueuedGpuSampleGatherOps();
void SyncPendingGpuOps();
void SyncPendingLanes();
void QueueGpuMathOp(uint32_t lane);
@@ -970,6 +974,7 @@ private:
rdcarray<bool> m_PendingLanes;
rdcarray<ShaderVariable *> m_PendingGpuMathsOpsResults;
rdcarray<ShaderVariable *> m_PendingGpuSampleGatherOpsResults;
rdcarray<int> m_PendingGpuSampleGatherOpsSampleRetTypes;
// the live mutable global variables, to initialise a stack frame's live list
rdcarray<bool> m_LiveGlobals;