diff --git a/renderdoc/driver/d3d12/d3d12_debug.cpp b/renderdoc/driver/d3d12/d3d12_debug.cpp index d727764f1..b73df31eb 100644 --- a/renderdoc/driver/d3d12/d3d12_debug.cpp +++ b/renderdoc/driver/d3d12/d3d12_debug.cpp @@ -603,6 +603,12 @@ D3D12DebugManager::~D3D12DebugManager() bool D3D12DebugManager::CreateShaderDebugResources() { + // MathOp is 2, SampleGatherOp is 6 + const uint64_t resultMaxElementSize = sizeof(Vec4f) * (2 + 6); + const uint32_t maxQueuedResults = D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS; + const uint64_t shaderDebugReadbackSize = resultMaxElementSize * maxQueuedResults; + RDCCOMPILE_ASSERT(shaderDebugReadbackSize < m_ReadbackSize, "Readback buffer is not big enough"); + rdcstr hlsl = GetEmbeddedResource(shaderdebug_hlsl); D3D12RootSignature rootSig; diff --git a/renderdoc/driver/d3d12/d3d12_debug.h b/renderdoc/driver/d3d12/d3d12_debug.h index dd7167873..a6264f325 100644 --- a/renderdoc/driver/d3d12/d3d12_debug.h +++ b/renderdoc/driver/d3d12/d3d12_debug.h @@ -152,6 +152,11 @@ public: D3D12DebugManager(WrappedID3D12Device *wrapper); ~D3D12DebugManager(); + enum + { + MAX_SHADER_DEBUG_QUEUED_OPS = 128 + }; + void GetBufferData(ID3D12Resource *buff, uint64_t offset, uint64_t length, bytebuf &retData); ID3D12Resource *MakeCBuffer(UINT64 size); diff --git a/renderdoc/driver/d3d12/d3d12_dxil_debug.cpp b/renderdoc/driver/d3d12/d3d12_dxil_debug.cpp index 7fe58bd76..539010c84 100644 --- a/renderdoc/driver/d3d12/d3d12_dxil_debug.cpp +++ b/renderdoc/driver/d3d12/d3d12_dxil_debug.cpp @@ -570,7 +570,14 @@ D3D12APIWrapper::D3D12APIWrapper(WrappedID3D12Device *device, const DXIL::Progra m_EventId(eventId), m_Program(dxilProgram), m_Reflection(refl), - m_DeviceThreadID(Threading::GetCurrentID()) + m_DeviceThreadID(Threading::GetCurrentID()), + m_QueuedOpCmdList(NULL), + m_QueuedMathOpIndex(0), + m_QueuedSampleGatherOpIndex(0), + m_MathOpResultOffset(0), + m_MaxQueuedOps(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS), + 
m_SampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS * + m_MathOpResultByteSize) { // Create the storage layout for the constant buffers // The constant buffer data and details are filled in outside of this method @@ -1574,11 +1581,26 @@ UAVInfo D3D12APIWrapper::GetUAV(const BindingSlot &slot) } // Must be called from the replay manager thread (the debugger thread) -bool D3D12APIWrapper::CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input, - ShaderVariable &output) +bool D3D12APIWrapper::QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input) { CHECK_DEVICE_THREAD(); - D3D12MarkerRegion region(m_Device->GetQueue()->GetReal(), "CalculateMathIntrinsic"); + ID3D12GraphicsCommandListX *cmdList = m_QueuedOpCmdList; + if(!cmdList) + { + if(StartQueuedOps()) + cmdList = m_QueuedOpCmdList; + } + if(!cmdList) + return false; + + if(!QueuedOpsHasSpace()) + { + m_Device->AddDebugMessage(MessageCategory::Execution, MessageSeverity::High, + MessageSource::RuntimeWarning, "Too many GPU queued operations"); + return false; + } + + D3D12MarkerRegion region(m_Device->GetQueue()->GetReal(), "QueueMathIntrinsic"); int mathOp; switch(dxOp) @@ -1604,18 +1626,36 @@ bool D3D12APIWrapper::CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariab return false; } - ShaderVariable ignored; - return D3D12ShaderDebug::CalculateMathIntrinsic(true, m_Device, mathOp, input, output, ignored); + return D3D12ShaderDebug::QueueMathIntrinsic(true, m_Device, cmdList, mathOp, input, + m_QueuedMathOpIndex++); } // Must be called from the replay manager thread (the debugger thread) -bool D3D12APIWrapper::CalculateSampleGather( - DXIL::DXOp dxOp, SampleGatherResourceData resourceData, SampleGatherSamplerData samplerData, - const ShaderVariable &uv, const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, - const int8_t texelOffsets[3], int multisampleIndex, float lodValue, float compareValue, - GatherChannel gatherChannel, uint32_t instructionIdx, 
ShaderVariable &output) +bool D3D12APIWrapper::QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData, + SampleGatherSamplerData samplerData, + const ShaderVariable &uv, const ShaderVariable &ddxCalc, + const ShaderVariable &ddyCalc, const int8_t texelOffsets[3], + int multisampleIndex, float lodValue, float compareValue, + GatherChannel gatherChannel, uint32_t instructionIdx, + int &sampleRetType) { CHECK_DEVICE_THREAD(); + ID3D12GraphicsCommandListX *cmdList = m_QueuedOpCmdList; + if(!cmdList) + { + if(StartQueuedOps()) + cmdList = m_QueuedOpCmdList; + } + if(!cmdList) + return false; + + if(!QueuedOpsHasSpace()) + { + m_Device->AddDebugMessage(MessageCategory::Execution, MessageSeverity::High, + MessageSource::RuntimeWarning, "Too many GPU queued operations"); + return false; + } + int sampleOp; switch(dxOp) { @@ -1642,10 +1682,59 @@ bool D3D12APIWrapper::CalculateSampleGather( const char *opString = ToStr(dxOp).c_str(); uint8_t swizzle[4] = {0, 1, 2, 3}; - return D3D12ShaderDebug::CalculateSampleGather( - true, m_Device, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc, texelOffsets, - multisampleIndex, lodValue, compareValue, swizzle, gatherChannel, m_ShaderType, - instructionIdx, opString, output); + return D3D12ShaderDebug::QueueSampleGather( + true, m_Device, m_QueuedOpCmdList, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc, + texelOffsets, multisampleIndex, lodValue, compareValue, swizzle, gatherChannel, m_ShaderType, + instructionIdx, opString, m_QueuedSampleGatherOpIndex++, sampleRetType); +} + +// Must be called from the replay manager thread (the debugger thread) +bool D3D12APIWrapper::StartQueuedOps() +{ + CHECK_DEVICE_THREAD(); + + RDCASSERTEQUAL(m_QueuedMathOpIndex, 0); + RDCASSERTEQUAL(m_QueuedSampleGatherOpIndex, 0); + RDCASSERTEQUAL(m_QueuedOpCmdList, NULL); + RDCASSERTEQUAL(m_MathOpResultOffset, 0); + + if(m_QueuedOpCmdList) + return false; + + m_QueuedOpCmdList = 
m_Device->GetDebugManager()->ResetDebugList(); + if(!m_QueuedOpCmdList) + return false; + + return true; +} + +// Must be called from the replay manager thread (the debugger thread) +bool D3D12APIWrapper::GetQueuedResults(rdcarray &mathOpResults, + rdcarray &sampleGatherResults, + const rdcarray &sampleRetTypes) +{ + const uint32_t countMathResultsPerGpuOp = 1; + rdcarray swizzles; + uint8_t swizzle[4] = {0, 1, 2, 3}; + for(size_t i = 0; i < sampleGatherResults.size(); ++i) + swizzles.push_back(swizzle); + + bool ret = D3D12ShaderDebug::GetQueuedResults(m_Device, m_QueuedOpCmdList, mathOpResults, + countMathResultsPerGpuOp, sampleGatherResults, + sampleRetTypes, swizzles); + + m_QueuedOpCmdList = NULL; + m_QueuedMathOpIndex = 0; + m_QueuedSampleGatherOpIndex = 0; + m_MathOpResultOffset = 0; + + return ret; +} + +// Must be called from the replay manager thread (the debugger thread) +bool D3D12APIWrapper::QueuedOpsHasSpace() const +{ + return (m_QueuedMathOpIndex + m_QueuedSampleGatherOpIndex) < m_MaxQueuedOps; } // Called from any thread diff --git a/renderdoc/driver/d3d12/d3d12_dxil_debug.h b/renderdoc/driver/d3d12/d3d12_dxil_debug.h index b17ffbce5..5042d54d7 100644 --- a/renderdoc/driver/d3d12/d3d12_dxil_debug.h +++ b/renderdoc/driver/d3d12/d3d12_dxil_debug.h @@ -54,14 +54,17 @@ public: UAVInfo GetUAV(const BindingSlot &slot) override; SRVInfo GetSRV(const BindingSlot &slot) override; - bool CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input, - ShaderVariable &output) override; - bool CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData, - SampleGatherSamplerData samplerData, const ShaderVariable &uv, - const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, - const int8_t texelOffsets[3], int multisampleIndex, float lodValue, - float compareValue, GatherChannel gatherChannel, - uint32_t instructionIdx, ShaderVariable &output) override; + bool QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input) 
override; + bool QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData, + SampleGatherSamplerData samplerData, const ShaderVariable &uv, + const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, + const int8_t texelOffsets[3], int multisampleIndex, float lodValue, + float compareValue, GatherChannel gatherChannel, uint32_t instructionIdx, + int &sampleRetType) override; + bool GetQueuedResults(rdcarray &mathOpResults, + rdcarray &sampleGatherResults, + const rdcarray &sampleRetTypes) override; + bool QueuedOpsHasSpace() const override; ShaderVariable GetResourceInfo(DXIL::ResourceClass resClass, const DXDebug::BindingSlot &slot, uint32_t mipLevel) override; @@ -144,6 +147,7 @@ private: const char *opString); ResourceReferenceInfo FetchResourceReferenceInfo(const DXDebug::BindingSlot &slot); ShaderDirectAccess FetchShaderDirectAccess(DescriptorType type, const DXDebug::BindingSlot &slot); + bool StartQueuedOps(); BuiltinInputs m_Builtins; rdcarray m_WorkgroupProperties; @@ -195,6 +199,16 @@ private: const ShaderReflection &m_Reflection; WrappedID3D12Device *m_Device = NULL; + + ID3D12GraphicsCommandListX *m_QueuedOpCmdList = NULL; + uint32_t m_QueuedMathOpIndex = 0; + uint32_t m_QueuedSampleGatherOpIndex = 0; + uint64_t m_MathOpResultOffset = 0; + const uint32_t m_MaxQueuedOps = 0; + const uint64_t m_MathOpResultByteSize = sizeof(Vec4f) * 2; + const uint64_t m_SampleGatherOpResultByteSize = sizeof(Vec4f) * 6; + const uint64_t m_SampleGatherOpResultsStart; + const DXIL::Program *m_Program = NULL; const DXIL::EntryPointInterface *m_EntryPointInterface = NULL; const DXBC::ShaderType m_ShaderType; diff --git a/renderdoc/driver/d3d12/d3d12_shaderdebug.cpp b/renderdoc/driver/d3d12/d3d12_shaderdebug.cpp index e49442764..a9fb8a052 100644 --- a/renderdoc/driver/d3d12/d3d12_shaderdebug.cpp +++ b/renderdoc/driver/d3d12/d3d12_shaderdebug.cpp @@ -42,6 +42,9 @@ using namespace DXBCBytecode; +const uint64_t s_MathOpResultByteSize = sizeof(Vec4f) * 2; +const 
uint64_t s_SampleGatherOpResultByteSize = sizeof(Vec4f) * 6; + static bool IsShaderParameterVisible(DXBC::ShaderType shaderType, D3D12_SHADER_VISIBILITY shaderVisibility) { @@ -82,11 +85,11 @@ static D3D12_DESCRIPTOR_RANGE_TYPE ConvertOperandTypeToDescriptorType(DXBCByteco } // Helpers used by DXBC and DXIL debuggers to interact with GPU and resources -bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *device, int mathOp, - const ShaderVariable &input, ShaderVariable &output1, - ShaderVariable &output2) +bool D3D12ShaderDebug::QueueMathIntrinsic(bool dxil, WrappedID3D12Device *device, + ID3D12GraphicsCommandListX *cmdList, int mathOp, + const ShaderVariable &input, const uint32_t queueIndex) { - D3D12MarkerRegion region(device->GetQueue()->GetReal(), "CalculateMathIntrinsic"); + D3D12MarkerRegion region(device->GetQueue()->GetReal(), "QueueMathIntrinsic"); ID3D12Resource *pResultBuffer = device->GetDebugManager()->GetShaderDebugResultBuffer(); ID3D12Resource *pReadbackBuffer = device->GetDebugManager()->GetReadbackBuffer(); @@ -96,7 +99,6 @@ bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *de cbufferData.mathOp = mathOp; // Set root signature & sig params on command list, then execute the shader - ID3D12GraphicsCommandListX *cmdList = device->GetDebugManager()->ResetDebugList(); device->GetDebugManager()->SetDescriptorHeaps(cmdList, true, false); cmdList->SetPipelineState(dxil ? 
device->GetDebugManager()->GetDXILMathIntrinsicsPso() : device->GetDebugManager()->GetMathIntrinsicsPso()); @@ -113,52 +115,25 @@ bool D3D12ShaderDebug::CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *de barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; cmdList->ResourceBarrier(1, &barrier); - cmdList->CopyBufferRegion(pReadbackBuffer, 0, pResultBuffer, 0, sizeof(Vec4f) * 6); + uint64_t destOffset = queueIndex * s_MathOpResultByteSize; + cmdList->CopyBufferRegion(pReadbackBuffer, destOffset, pResultBuffer, 0, s_MathOpResultByteSize); - HRESULT hr = cmdList->Close(); - if(FAILED(hr)) - { - RDCERR("Failed to close command list HRESULT: %s", ToStr(hr).c_str()); - return false; - } - - { - ID3D12CommandList *l = cmdList; - device->GetQueue()->ExecuteCommandLists(1, &l); - device->InternalQueueWaitForIdle(); - device->GetDebugManager()->ResetDebugAlloc(); - } - - D3D12_RANGE range = {0, sizeof(Vec4f) * 6}; - - byte *results = NULL; - hr = pReadbackBuffer->Map(0, &range, (void **)&results); - - if(FAILED(hr)) - { - pReadbackBuffer->Unmap(0, &range); - RDCERR("Failed to map readback buffer HRESULT: %s", ToStr(hr).c_str()); - return false; - } - - memcpy(output1.value.u32v.data(), results, sizeof(Vec4f)); - memcpy(output2.value.u32v.data(), results + sizeof(Vec4f), sizeof(Vec4f)); - - range.End = 0; - pReadbackBuffer->Unmap(0, &range); + barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + cmdList->ResourceBarrier(1, &barrier); return true; } -bool D3D12ShaderDebug::CalculateSampleGather( - bool dxil, WrappedID3D12Device *device, int sampleOp, SampleGatherResourceData resourceData, - SampleGatherSamplerData samplerData, const ShaderVariable &uvIn, - const ShaderVariable &ddxCalcIn, const ShaderVariable &ddyCalcIn, const int8_t texelOffsets[3], - int multisampleIndex, float lodValue, float compareValue, const uint8_t swizzle[4], - GatherChannel gatherChannel, 
const DXBC::ShaderType shaderType, uint32_t instruction, - const char *opString, ShaderVariable &output) +bool D3D12ShaderDebug::QueueSampleGather( + bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList, int sampleOp, + SampleGatherResourceData resourceData, SampleGatherSamplerData samplerData, + const ShaderVariable &uvIn, const ShaderVariable &ddxCalcIn, const ShaderVariable &ddyCalcIn, + const int8_t texelOffsets[3], int multisampleIndex, float lodValue, float compareValue, + const uint8_t swizzle[4], GatherChannel gatherChannel, const DXBC::ShaderType shaderType, + uint32_t instruction, const char *opString, const uint32_t queueIndex, int &sampleRetType) { - D3D12MarkerRegion region(device->GetQueue()->GetReal(), "CalculateSampleGather"); + D3D12MarkerRegion region(device->GetQueue()->GetReal(), "QueueSampleGather"); ShaderVariable uv(uvIn); ShaderVariable ddxCalc(ddxCalcIn); @@ -269,6 +244,7 @@ bool D3D12ShaderDebug::CalculateSampleGather( { RDCERR("Unsupported return type %d in sample operation", resourceData.retType); } + sampleRetType = cbufferData.debugSampleRetType; cbufferData.debugSampleGatherChannel = (int)gatherChannel; cbufferData.debugSampleSampleIndex = multisampleIndex; @@ -276,6 +252,7 @@ bool D3D12ShaderDebug::CalculateSampleGather( cbufferData.debugSampleLod = lodValue; cbufferData.debugSampleCompare = compareValue; + // Store a copy of the event's render state to restore later D3D12RenderState &rs = device->GetQueue()->GetCommandData()->m_RenderState; D3D12RenderState prevState = rs; @@ -283,7 +260,6 @@ bool D3D12ShaderDebug::CalculateSampleGather( ID3D12PipelineState *pso = dxil ? 
device->GetDebugManager()->GetDXILTexSamplePso(texelOffsets) : device->GetDebugManager()->GetTexSamplePso(texelOffsets); - ID3D12GraphicsCommandListX *cmdList = device->GetDebugManager()->ResetDebugList(); rs.pipe = GetResID(pso); rs.rts.clear(); // Set viewport/scissor unconditionally - we need to set this all the time for sampling for a @@ -359,7 +335,32 @@ bool D3D12ShaderDebug::CalculateSampleGather( barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; cmdList->ResourceBarrier(1, &barrier); - cmdList->CopyBufferRegion(pReadbackBuffer, 0, pResultBuffer, 0, sizeof(Vec4f) * 6); + const uint64_t sampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS * + s_MathOpResultByteSize); + + uint64_t destOffset = sampleGatherOpResultsStart + queueIndex * s_SampleGatherOpResultByteSize; + cmdList->CopyBufferRegion(pReadbackBuffer, destOffset, pResultBuffer, 0, + s_SampleGatherOpResultByteSize); + + barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + cmdList->ResourceBarrier(1, &barrier); + + // Restore D3D12 state to what the event uses + rs = prevState; + return true; +} + +bool D3D12ShaderDebug::GetQueuedResults(WrappedID3D12Device *device, + ID3D12GraphicsCommandListX *cmdList, + rdcarray &mathOpResults, + uint32_t countMathResultsPerGpuOp, + rdcarray &sampleGatherResults, + const rdcarray &sampleRetTypes, + const rdcarray &swizzles) +{ + RDCASSERTEQUAL(sampleGatherResults.size(), sampleRetTypes.size()); + RDCASSERTEQUAL(sampleGatherResults.size(), swizzles.size()); HRESULT hr = cmdList->Close(); if(FAILED(hr)) @@ -375,46 +376,76 @@ bool D3D12ShaderDebug::CalculateSampleGather( device->GetDebugManager()->ResetDebugAlloc(); } - rs = prevState; + ID3D12Resource *pReadbackBuffer = device->GetDebugManager()->GetReadbackBuffer(); - D3D12_RANGE range = {0, sizeof(Vec4f) * 6}; - - void *results = NULL; - hr = pReadbackBuffer->Map(0, &range, &results); + byte 
*gpuResults = NULL; + hr = pReadbackBuffer->Map(0, NULL, (void **)&gpuResults); if(FAILED(hr)) { - pReadbackBuffer->Unmap(0, &range); + pReadbackBuffer->Unmap(0, NULL); RDCERR("Failed to map readback buffer HRESULT: %s", ToStr(hr).c_str()); return false; } - ShaderVariable lookupResult("tex", 0.0f, 0.0f, 0.0f, 0.0f); + uintptr_t bufferEnd = (uintptr_t)(gpuResults + pReadbackBuffer->GetDesc().Width); - float *retFloats = (float *)results; - uint32_t *retUInts = (uint32_t *)(retFloats + 8); - int32_t *retSInts = (int32_t *)(retUInts + 8); + byte *gpuMathOpResults = gpuResults; + for(uint32_t i = 0; i < mathOpResults.size(); i += countMathResultsPerGpuOp) + { + const size_t countBytes = sizeof(Vec4f); + const size_t countBytesPerGpuOp = countBytes * countMathResultsPerGpuOp; + RDCASSERT((uintptr_t)gpuMathOpResults + countBytesPerGpuOp <= bufferEnd, + (uintptr_t)gpuMathOpResults, countBytesPerGpuOp, bufferEnd); + RDCASSERT(countBytesPerGpuOp <= s_MathOpResultByteSize, countBytesPerGpuOp, + s_MathOpResultByteSize); - if(cbufferData.debugSampleRetType == DEBUG_SAMPLE_UINT) - { - for(int i = 0; i < 4; i++) - lookupResult.value.u32v[i] = retUInts[swizzle[i]]; - } - else if(cbufferData.debugSampleRetType == DEBUG_SAMPLE_INT) - { - for(int i = 0; i < 4; i++) - lookupResult.value.s32v[i] = retSInts[swizzle[i]]; - } - else - { - for(int i = 0; i < 4; i++) - lookupResult.value.f32v[i] = retFloats[swizzle[i]]; + for(uint32_t r = 0; r < countMathResultsPerGpuOp; r++) + { + ShaderVariable *result = mathOpResults[i + r]; + memcpy(result->value.u32v.data(), gpuMathOpResults + r * countBytes, countBytes); + } + gpuMathOpResults += s_MathOpResultByteSize; } - range.End = 0; - pReadbackBuffer->Unmap(0, &range); + const uint64_t sampleGatherOpResultsStart(D3D12DebugManager::MAX_SHADER_DEBUG_QUEUED_OPS * + s_MathOpResultByteSize); + byte *gpuSampleGatherOpResults = gpuResults + sampleGatherOpResultsStart; + for(uint32_t s = 0; s < sampleGatherResults.size(); ++s) + { + float *retFloats = 
(float *)gpuSampleGatherOpResults; + uint32_t *retUInts = (uint32_t *)(retFloats + 8); + int32_t *retSInts = (int32_t *)(retUInts + 8); - output = lookupResult; + size_t countBytes = 16; + RDCASSERT((uintptr_t)gpuSampleGatherOpResults + countBytes <= bufferEnd, + (uintptr_t)gpuSampleGatherOpResults, countBytes, bufferEnd); + RDCASSERT(countBytes <= s_SampleGatherOpResultByteSize, countBytes, + s_SampleGatherOpResultByteSize); + + ShaderVariable &output = *sampleGatherResults[s]; + + int debugSampleRetType = sampleRetTypes[s]; + const uint8_t *swizzle = swizzles[s]; + if(debugSampleRetType == DEBUG_SAMPLE_UINT) + { + for(int i = 0; i < 4; i++) + output.value.u32v[i] = retUInts[swizzle[i]]; + } + else if(debugSampleRetType == DEBUG_SAMPLE_INT) + { + for(int i = 0; i < 4; i++) + output.value.s32v[i] = retSInts[swizzle[i]]; + } + else + { + for(int i = 0; i < 4; i++) + output.value.f32v[i] = retFloats[swizzle[i]]; + } + gpuSampleGatherOpResults += s_SampleGatherOpResultByteSize; + } + + pReadbackBuffer->Unmap(0, NULL); return true; } @@ -533,8 +564,8 @@ D3D12Descriptor D3D12ShaderDebug::FindDescriptor(WrappedID3D12Device *device, srvDesc.Format = DXGI_FORMAT_UNKNOWN; srvDesc.Buffer.FirstElement = 0; // we don't know the real length or structure stride from a root descriptor, so set - // defaults. This behaviour seems undefined in drivers, so returning 1 as the number of - // elements is as sensible as anything else + // defaults. This behaviour seems undefined in drivers, so returning 1 as the number + // of elements is as sensible as anything else srvDesc.Buffer.NumElements = 1; srvDesc.Buffer.StructureByteStride = 4; srvDesc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_NONE; @@ -555,8 +586,8 @@ D3D12Descriptor D3D12ShaderDebug::FindDescriptor(WrappedID3D12Device *device, uavDesc.Format = DXGI_FORMAT_UNKNOWN; uavDesc.Buffer.FirstElement = 0; // we don't know the real length or structure stride from a root descriptor, so set - // defaults. 
This behaviour seems undefined in drivers, so returning 1 as the number of - // elements is as sensible as anything else + // defaults. This behaviour seems undefined in drivers, so returning 1 as the number + // of elements is as sensible as anything else uavDesc.Buffer.NumElements = 1; uavDesc.Buffer.StructureByteStride = 4; uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE; @@ -1012,6 +1043,7 @@ public: private: DXBC::ShaderType GetShaderType() { return m_dxbc ? m_dxbc->m_Type : DXBC::ShaderType::Pixel; } WrappedID3D12Device *m_pDevice; + ID3D12GraphicsCommandListX *m_QueuedOpCmdList; const DXBC::DXBCContainer *m_dxbc; DXBCDebug::GlobalState &m_globalState; uint32_t m_instruction; @@ -1024,13 +1056,13 @@ D3D12DebugAPIWrapper::D3D12DebugAPIWrapper(WrappedID3D12Device *device, DXBCDebug::GlobalState &globalState, uint32_t eid) : m_pDevice(device), m_dxbc(dxbc), m_globalState(globalState), m_instruction(0), m_EventID(eid) { + m_QueuedOpCmdList = NULL; } D3D12DebugAPIWrapper::~D3D12DebugAPIWrapper() { - // if we replayed to before the action for fetching some UAVs, replay back to after the action to - // keep - // the state consistent. + // if we replayed to before the action for fetching some UAVs, replay back to after the action + // to keep the state consistent. if(m_DidReplay) { D3D12MarkerRegion region(m_pDevice->GetQueue()->GetReal(), "ResetReplay"); @@ -1092,9 +1124,9 @@ void D3D12DebugAPIWrapper::FetchSRV(const DXBCDebug::BindingSlot &slot) D3D12_RESOURCE_DESC resDesc = pResource->GetDesc(); // DXBC allows root buffers to have a stride of up to 16 bytes in the shader, which - // means encoding the byte offset into the first element here is wrong without knowing - // what the actual accessed stride is. Instead we only fetch the data from that offset - // onwards. + // means encoding the byte offset into the first element here is wrong without + // knowing what the actual accessed stride is. Instead we only fetch the data from + // that offset onwards. 
// TODO: Root buffers can be 32-bit UINT/SINT/FLOAT. Using UINT for now, but the // resource desc format or the DXBC reflection info might be more correct. @@ -1266,9 +1298,9 @@ void D3D12DebugAPIWrapper::FetchUAV(const DXBCDebug::BindingSlot &slot) D3D12_RESOURCE_DESC resDesc = pResource->GetDesc(); // DXBC allows root buffers to have a stride of up to 16 bytes in the shader, which - // means encoding the byte offset into the first element here is wrong without knowing - // what the actual accessed stride is. Instead we only fetch the data from that offset - // onwards. + // means encoding the byte offset into the first element here is wrong without + // knowing what the actual accessed stride is. Instead we only fetch the data from + // that offset onwards. // TODO: Root buffers can be 32-bit UINT/SINT/FLOAT. Using UINT for now, but the // resource desc format or the DXBC reflection info might be more correct. @@ -1394,6 +1426,7 @@ void D3D12DebugAPIWrapper::FetchUAV(const DXBCDebug::BindingSlot &slot) slot.registerSpace); } +// Used by the DXBC Debugger bool D3D12DebugAPIWrapper::CalculateMathIntrinsic(DXBCBytecode::OpcodeType opcode, const ShaderVariable &input, ShaderVariable &output1, ShaderVariable &output2) @@ -1413,7 +1446,26 @@ bool D3D12DebugAPIWrapper::CalculateMathIntrinsic(DXBCBytecode::OpcodeType opcod return false; } - return D3D12ShaderDebug::CalculateMathIntrinsic(false, m_pDevice, mathOp, input, output1, output2); + RDCASSERT(!m_QueuedOpCmdList); + m_QueuedOpCmdList = m_pDevice->GetDebugManager()->ResetDebugList(); + const uint32_t queueIndex = 0; + if(!D3D12ShaderDebug::QueueMathIntrinsic(false, m_pDevice, m_QueuedOpCmdList, mathOp, input, + queueIndex)) + return false; + + rdcarray mathOpResults; + mathOpResults.push_back(&output1); + mathOpResults.push_back(&output2); + rdcarray sampleGatherResults; + rdcarray sampleRetTypes; + rdcarray swizzles; + + const uint32_t countMathResultsPerGpuOp = 2; + bool ret = 
D3D12ShaderDebug::GetQueuedResults(m_pDevice, m_QueuedOpCmdList, mathOpResults, + countMathResultsPerGpuOp, sampleGatherResults, + sampleRetTypes, swizzles); + m_QueuedOpCmdList = NULL; + return ret; } D3D12Descriptor D3D12DebugAPIWrapper::FindDescriptor(DXBCBytecode::OperandType type, @@ -1493,6 +1545,7 @@ ShaderVariable D3D12DebugAPIWrapper::GetResourceInfo(DXBCBytecode::OperandType t dim, false); } +// Used by the DXBC Debugger bool D3D12DebugAPIWrapper::CalculateSampleGather( DXBCBytecode::OpcodeType opcode, DXDebug::SampleGatherResourceData resourceData, DXDebug::SampleGatherSamplerData samplerData, const ShaderVariable &uv, @@ -1525,10 +1578,29 @@ bool D3D12DebugAPIWrapper::CalculateSampleGather( return false; } - return D3D12ShaderDebug::CalculateSampleGather( - false, m_pDevice, sampleOp, resourceData, samplerData, uv, ddxCalc, ddyCalc, texelOffsets, - multisampleIndex, lodOrCompareValue, lodOrCompareValue, swizzle, gatherChannel, - GetShaderType(), m_instruction, opString, output); + RDCASSERT(!m_QueuedOpCmdList); + m_QueuedOpCmdList = m_pDevice->GetDebugManager()->ResetDebugList(); + int sampleRetType = 0; + const uint32_t queueIndex = 0; + if(!D3D12ShaderDebug::QueueSampleGather( + false, m_pDevice, m_QueuedOpCmdList, sampleOp, resourceData, samplerData, uv, ddxCalc, + ddyCalc, texelOffsets, multisampleIndex, lodOrCompareValue, lodOrCompareValue, swizzle, + gatherChannel, GetShaderType(), m_instruction, opString, queueIndex, sampleRetType)) + return false; + + rdcarray mathOpResults; + rdcarray sampleGatherResults; + sampleGatherResults.push_back(&output); + rdcarray sampleRetTypes; + sampleRetTypes.push_back(sampleRetType); + rdcarray swizzles; + swizzles.push_back(swizzle); + + bool ret = D3D12ShaderDebug::GetQueuedResults(m_pDevice, m_QueuedOpCmdList, mathOpResults, 0, + sampleGatherResults, sampleRetTypes, swizzles); + m_QueuedOpCmdList = NULL; + + return ret; } void GatherConstantBuffers(WrappedID3D12Device *pDevice, const DXBCBytecode::Program 
&program, @@ -2974,11 +3046,11 @@ ShaderDebugTrace *D3D12Replay::DebugPixel(uint32_t eventId, uint32_t x, uint32_t return new ShaderDebugTrace; } - // if we encounter multiple hits at our destination pixel co-ord (or any other) we check to see if - // a specific primitive was requested (via primitive parameter not being set to ~0U). If it was, - // debug that pixel, otherwise do a best-estimate of which fragment was the last to successfully - // depth test and debug that, just by checking if the depth test is ordered and picking the final - // fragment in the series + // if we encounter multiple hits at our destination pixel co-ord (or any other) we check to see + // if a specific primitive was requested (via primitive parameter not being set to ~0U). If it + // was, debug that pixel, otherwise do a best-estimate of which fragment was the last to + // successfully depth test and debug that, just by checking if the depth test is ordered and + // picking the final fragment in the series // Get depth func and determine "winner" pixel DXDebug::DebugHit *pWinnerHit = NULL; diff --git a/renderdoc/driver/d3d12/d3d12_shaderdebug.h b/renderdoc/driver/d3d12/d3d12_shaderdebug.h index 6cf4987be..0bab0e63e 100644 --- a/renderdoc/driver/d3d12/d3d12_shaderdebug.h +++ b/renderdoc/driver/d3d12/d3d12_shaderdebug.h @@ -39,18 +39,21 @@ typedef DXDebug::GatherChannel GatherChannel; typedef DXBCBytecode::SamplerMode SamplerMode; // Helpers used by DXBC and DXIL debuggers to interact with GPU and resources -bool CalculateMathIntrinsic(bool dxil, WrappedID3D12Device *device, int mathOp, - const ShaderVariable &input, ShaderVariable &output1, - ShaderVariable &output2); +bool QueueMathIntrinsic(bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList, + int mathOp, const ShaderVariable &input, const uint32_t queueIndex); -bool CalculateSampleGather(bool dxil, WrappedID3D12Device *device, int sampleOp, - SampleGatherResourceData resourceData, - SampleGatherSamplerData 
samplerData, const ShaderVariable &uv, - const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, - const int8_t texelOffsets[3], int multisampleIndex, float lodValue, - float compareValue, const uint8_t swizzle[4], - GatherChannel gatherChannel, const DXBC::ShaderType shaderType, - uint32_t instruction, const char *opString, ShaderVariable &output); +bool QueueSampleGather(bool dxil, WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList, + int sampleOp, SampleGatherResourceData resourceData, + SampleGatherSamplerData samplerData, const ShaderVariable &uv, + const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, + const int8_t texelOffsets[3], int multisampleIndex, float lodValue, + float compareValue, const uint8_t swizzle[4], GatherChannel gatherChannel, + const DXBC::ShaderType shaderType, uint32_t instruction, + const char *opString, const uint32_t queueIndex, int &sampleRetType); +bool GetQueuedResults(WrappedID3D12Device *device, ID3D12GraphicsCommandListX *cmdList, + rdcarray &mathOpResults, uint32_t countMathResultsPerGpuOp, + rdcarray &sampleGatherResults, + const rdcarray &sampleRetTypes, const rdcarray &swizzles); D3D12Descriptor FindDescriptor(WrappedID3D12Device *device, const DXDebug::HeapDescriptorType heapType, uint32_t descriptorIndex); diff --git a/renderdoc/driver/shaders/dxil/dxil_debug.cpp b/renderdoc/driver/shaders/dxil/dxil_debug.cpp index cd7f216b9..6f7e6d7f7 100644 --- a/renderdoc/driver/shaders/dxil/dxil_debug.cpp +++ b/renderdoc/driver/shaders/dxil/dxil_debug.cpp @@ -10482,6 +10482,7 @@ void Debugger::ProcessQueuedOps() CHECK_DEBUGGER_THREAD(); ProcessQueuedGpuMathOps(); ProcessQueuedGpuSampleGatherOps(); + SyncPendingGpuOps(); } // Must be called from the replay manager thread (the debugger thread) @@ -10508,11 +10509,14 @@ void Debugger::ProcessQueuedGpuMathOps() { if(m_QueuedGpuMathOps[lane]) { + if(!m_ApiWrapper->QueuedOpsHasSpace()) + SyncPendingGpuOps(); + m_QueuedGpuMathOps[lane] = false; const 
GpuMathOperation &mathOp = m_Workgroup[lane].GetQueuedGpuMathOp(); uint32_t workgroupIndex = mathOp.workgroupIndex; - if(m_ApiWrapper->CalculateMathIntrinsic(mathOp.dxOp, mathOp.input, *mathOp.result)) + if(m_ApiWrapper->QueueMathIntrinsic(mathOp.dxOp, mathOp.input)) { m_PendingGpuMathsOpsResults.push_back(mathOp.result); } @@ -10536,25 +10540,32 @@ void Debugger::ProcessQueuedGpuSampleGatherOps() { if(m_QueuedGpuSampleGatherOps[lane]) { + if(!m_ApiWrapper->QueuedOpsHasSpace()) + SyncPendingGpuOps(); + m_QueuedGpuSampleGatherOps[lane] = false; const GpuSampleGatherOperation &sampleGatherOp = m_Workgroup[lane].GetQueuedGpuSampleGatherOp(); uint32_t workgroupIndex = sampleGatherOp.workgroupIndex; ShaderVariable &result = *sampleGatherOp.result; bool hasResult = false; - if(!m_ApiWrapper->CalculateSampleGather( + int sampleRetType = 0; + if(!m_ApiWrapper->QueueSampleGather( sampleGatherOp.dxOp, sampleGatherOp.resourceData, sampleGatherOp.samplerData, sampleGatherOp.uv, sampleGatherOp.ddxCalc, sampleGatherOp.ddyCalc, sampleGatherOp.texelOffsets, sampleGatherOp.multisampleIndex, sampleGatherOp.lodValue, sampleGatherOp.compareValue, sampleGatherOp.gatherChannel, - sampleGatherOp.instructionIdx, *sampleGatherOp.result)) + sampleGatherOp.instructionIdx, sampleRetType)) { // sample failed. 
Pretend we got 0 columns back set0001(result); hasResult = true; } if(!hasResult) + { m_PendingGpuSampleGatherOpsResults.push_back(sampleGatherOp.result); + m_PendingGpuSampleGatherOpsSampleRetTypes.push_back(sampleRetType); + } DXIL_DEBUG_RDCASSERT(!m_PendingLanes[workgroupIndex]); m_PendingLanes[workgroupIndex] = true; @@ -10562,6 +10573,24 @@ void Debugger::ProcessQueuedGpuSampleGatherOps() } } +// Must be called from the replay manager thread (the debugger thread) +void Debugger::SyncPendingGpuOps() +{ + CHECK_DEBUGGER_THREAD(); + if(m_PendingGpuMathsOpsResults.empty() && m_PendingGpuSampleGatherOpsResults.empty()) + return; + + if(!(m_ApiWrapper->GetQueuedResults(m_PendingGpuMathsOpsResults, m_PendingGpuSampleGatherOpsResults, + m_PendingGpuSampleGatherOpsSampleRetTypes))) + { + RDCERR("GetQueuedResults failed"); + return; + } + m_PendingGpuMathsOpsResults.clear(); + m_PendingGpuSampleGatherOpsResults.clear(); + m_PendingGpuSampleGatherOpsSampleRetTypes.clear(); +} + void Debugger::SimulationJobHelper() { while(AtomicLoad(&atomic_simulationFinished) == 0) diff --git a/renderdoc/driver/shaders/dxil/dxil_debug.h b/renderdoc/driver/shaders/dxil/dxil_debug.h index f1c9bd24b..94eda3e4d 100644 --- a/renderdoc/driver/shaders/dxil/dxil_debug.h +++ b/renderdoc/driver/shaders/dxil/dxil_debug.h @@ -307,14 +307,17 @@ public: virtual UAVInfo GetUAV(const BindingSlot &slot) = 0; virtual SRVInfo GetSRV(const BindingSlot &slot) = 0; - virtual bool CalculateMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input, - ShaderVariable &output) = 0; - virtual bool CalculateSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData, - SampleGatherSamplerData samplerData, const ShaderVariable &uv, - const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, - const int8_t texelOffsets[3], int multisampleIndex, - float lodValue, float compareValue, GatherChannel gatherChannel, - uint32_t instructionIdx, ShaderVariable &output) = 0; + virtual bool 
QueueMathIntrinsic(DXIL::DXOp dxOp, const ShaderVariable &input) = 0; + virtual bool QueueSampleGather(DXIL::DXOp dxOp, SampleGatherResourceData resourceData, + SampleGatherSamplerData samplerData, const ShaderVariable &uv, + const ShaderVariable &ddxCalc, const ShaderVariable &ddyCalc, + const int8_t texelOffsets[3], int multisampleIndex, float lodValue, + float compareValue, GatherChannel gatherChannel, + uint32_t instructionIdx, int &sampleRetType) = 0; + virtual bool GetQueuedResults(rdcarray &mathOpResults, + rdcarray &sampleGatherResults, + const rdcarray &sampleRetTypes) = 0; + virtual bool QueuedOpsHasSpace() const = 0; virtual ShaderVariable GetResourceInfo(DXIL::ResourceClass resClass, const DXDebug::BindingSlot &slot, uint32_t mipLevel) = 0; virtual ShaderVariable GetSampleInfo(DXIL::ResourceClass resClass, @@ -948,6 +951,7 @@ private: void ProcessQueuedOps(); void ProcessQueuedGpuMathOps(); void ProcessQueuedGpuSampleGatherOps(); + void SyncPendingGpuOps(); void SyncPendingLanes(); void QueueGpuMathOp(uint32_t lane); @@ -970,6 +974,7 @@ private: rdcarray m_PendingLanes; rdcarray m_PendingGpuMathsOpsResults; rdcarray m_PendingGpuSampleGatherOpsResults; + rdcarray m_PendingGpuSampleGatherOpsSampleRetTypes; // the live mutable global variables, to initialise a stack frame's live list rdcarray m_LiveGlobals;