// Review copy of the patch that adds GPU timestamp timing to the D3D12 ray-tracing debug overlay.
// These comment lines sit ahead of the first "diff --git" header, outside every hunk.
//
// What the patch does:
//  * D3D12RTManager::GetFreeQuery() creates the timestamp query heap and the read-back buffer on
//    first use, then hands out even-numbered slot indices from m_FreeQueries (slot q and q + 1 form
//    a pair). It returns ~0U when no slot is available.
//  * PatchRayDispatch() and CopyBuildInputs() issue EndQuery(TIMESTAMP) into slot q before their
//    work and into q + 1 after it, then resolve both values into the read-back buffer.
//  * AddDispatchTimer() / AddBuildTimer() turn the tick delta into milliseconds using
//    m_TimerFrequency, add it to m_AccumulatedStats under m_TimerStatsLock and push the slot back
//    onto m_FreeQueries. A slot of ~0U is ignored.
//  * GatherRTStatistics() hands the accumulated totals to Present() and resets them; Present()
//    appends them to the overlay text when D3D12_Debug_RT_Overlay() && m_UsedRT.
//
// Changes made to the added ('+') lines in this revision (context and '-' lines are untouched):
//  1. Tick-to-millisecond conversion multiplies by 1000.0 (was 1024.0) in AddDispatchTimer() and
//     AddBuildTimer(); the MB/s divisor in Present() uses 1000.0f to match.
//  2. GatherRTStatistics() copies and resets m_AccumulatedStats inside its own m_TimerStatsLock
//     scope, which ends before m_ASBuildDataLock is taken. That hunk's new-side count is now 17 and
//     the new-side start of each later d3d12_manager.cpp hunk moved by +3.
//  3. ASBuildData::query defaults to ~0U, the "no slot" value, instead of 0.
//  4. m_FreeQueries is declared as rdcarray<uint32_t>.
//
// NOTE(review): the "index" blob ids below were computed for the original post-image and no longer
//   describe the files produced by this revision.
// NOTE(review): if CreateQueryHeap fails, m_TimerQueryHeap stays NULL, so every later GetFreeQuery()
//   call repeats the heap creation and the read-back buffer allocation - confirm whether a
//   one-shot "tried" flag is wanted.
// NOTE(review): PatchedRayDispatch's query member has no initializer; confirm every path that
//   reaches CheckAndFreeRayDispatches() has set it (PatchRayDispatch() does).
// NOTE(review): m_TimerFrequency has no initializer and the result of GetTimestampFrequency() is
//   not checked.
// NOTE(review): several context lines show template arguments missing (e.g. "rdcarray geoms",
//   "std::function cleanupCallback"); they are left exactly as received.
diff --git a/renderdoc/driver/d3d12/d3d12_commands.cpp b/renderdoc/driver/d3d12/d3d12_commands.cpp index 8ac265f63..9f3a84c40 100644 --- a/renderdoc/driver/d3d12/d3d12_commands.cpp +++ b/renderdoc/driver/d3d12/d3d12_commands.cpp @@ -671,6 +671,7 @@ void WrappedID3D12CommandQueue::CheckAndFreeRayDispatches() { if(signalled >= ray.fenceValue) { + GetResourceManager()->GetRTManager()->AddDispatchTimer(ray.query); ray.Release(); } } diff --git a/renderdoc/driver/d3d12/d3d12_device.cpp b/renderdoc/driver/d3d12/d3d12_device.cpp index 2f2865327..e19215cd7 100644 --- a/renderdoc/driver/d3d12/d3d12_device.cpp +++ b/renderdoc/driver/d3d12/d3d12_device.cpp @@ -2444,8 +2444,9 @@ HRESULT WrappedID3D12Device::Present(ID3D12GraphicsCommandList *pOverlayCommandL if(D3D12_Debug_RT_Overlay() && m_UsedRT) { ASStats blasStats = {}, tlasStats = {}; + RTGPUPatchingStats gpuStats = {}; - GetResourceManager()->GetRTManager()->GatherASAgeStatistics(blasStats, tlasStats); + GetResourceManager()->GetRTManager()->GatherRTStatistics(blasStats, tlasStats, gpuStats); overlayText += " TLAS BLAS\n"; @@ -2467,6 +2468,15 @@ HRESULT WrappedID3D12Device::Present(ID3D12GraphicsCommandList *pOverlayCommandL float(blasStats.overheadBytes + tlasStats.overheadBytes) / 1048576.0f, float(blasStats.diskBytes + tlasStats.diskBytes) / 1048576.0f, blasStats.diskCached, tlasStats.diskCached); + + overlayText += StringFormat::Fmt( + "%3u BLAS input copies with %9.2f KB in %5.2f ms = %9.2f MB/s\n" + "%2u dispatches patched in %4.2f ms\n", + gpuStats.builds, float(gpuStats.buildBytes) / 1024.0f, gpuStats.totalBuildMS, + gpuStats.totalBuildMS == 0.0 + ? 
0.0 + : (float(gpuStats.buildBytes) / 1048576.0f) / (gpuStats.totalBuildMS / 1000.0f), + gpuStats.dispatches, gpuStats.totalDispatchesMS); } m_TextRenderer->RenderText(list, 0.0f, 0.0f, overlayText); diff --git a/renderdoc/driver/d3d12/d3d12_manager.cpp b/renderdoc/driver/d3d12/d3d12_manager.cpp index 17afd9da2..09fb22c08 100644 --- a/renderdoc/driver/d3d12/d3d12_manager.cpp +++ b/renderdoc/driver/d3d12/d3d12_manager.cpp @@ -925,6 +925,75 @@ void D3D12RTManager::VerifyRecord(const uint64_t recordSize, byte *wrappedRecord RDCASSERT(memcmp(record.data(), unwrappedRef, record.size()) == 0); } +uint32_t D3D12RTManager::GetFreeQuery() +{ + SCOPED_LOCK(m_TimerStatsLock); + if(m_TimerQueryHeap == NULL) + { + D3D12_QUERY_HEAP_DESC timerQueryDesc; + // allow for up to 50 dispatches per frame, 500 AS builds, and assume 5 frames before we see the results + timerQueryDesc.Count = (50 + 500) * 5 * 2; + timerQueryDesc.NodeMask = 1; + timerQueryDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP; + HRESULT hr = m_wrappedDevice->GetReal()->CreateQueryHeap( + &timerQueryDesc, __uuidof(ID3D12QueryHeap), (void **)&m_TimerQueryHeap); + CHECK_HR(m_wrappedDevice, hr); + if(FAILED(hr)) + RDCERR("Failed to create timer query heap HRESULT: %s", ToStr(hr).c_str()); + + m_GPUBufferAllocator.Alloc(D3D12GpuBufferHeapType::ReadBackHeap, + D3D12GpuBufferHeapMemoryFlag::Default, + timerQueryDesc.Count * sizeof(UINT64), 64, &m_TimerReadbackBuffer); + + if(m_TimerReadbackBuffer && m_TimerQueryHeap) + { + m_Timestamps = (uint64_t *)m_TimerReadbackBuffer->Map(); + for(uint32_t i = 0; i < timerQueryDesc.Count; i += 2) + m_FreeQueries.push_back(i); + } + + m_wrappedDevice->GetQueue()->GetTimestampFrequency(&m_TimerFrequency); + } + + if(!m_FreeQueries.empty()) + return m_FreeQueries.takeAt(m_FreeQueries.size() - 1); + return ~0U; +} + +void D3D12RTManager::AddDispatchTimer(uint32_t q) +{ + // could track this maybe, for now drop it on the floor + if(q == ~0U) + return; + + uint64_t *timestamps = m_Timestamps + 
q; + + { + SCOPED_LOCK(m_TimerStatsLock); + m_AccumulatedStats.dispatches++; + m_AccumulatedStats.totalDispatchesMS += + ((timestamps[1] - timestamps[0]) / double(m_TimerFrequency)) * 1000.0; + m_FreeQueries.push_back(q); + } +} + +void D3D12RTManager::AddBuildTimer(uint32_t q, uint64_t size) +{ + if(q == ~0U) + return; + + uint64_t *timestamps = m_Timestamps + q; + + { + SCOPED_LOCK(m_TimerStatsLock); + m_AccumulatedStats.builds++; + m_AccumulatedStats.buildBytes += size; + m_AccumulatedStats.totalBuildMS += + ((timestamps[1] - timestamps[0]) / double(m_TimerFrequency)) * 1000.0; + m_FreeQueries.push_back(q); + } +} + void D3D12RTManager::AddPendingASBuilds(ID3D12Fence *fence, UINT64 waitValue, const rdcarray> &callbacks) { @@ -1063,10 +1132,17 @@ void D3D12RTManager::CheckPendingASBuilds() m_PendingASBuilds.removeIf([](const PendingASBuild &build) { return build.fence == NULL; }); } -void D3D12RTManager::GatherASAgeStatistics(ASStats &blasAges, ASStats &tlasAges) +void D3D12RTManager::GatherRTStatistics(ASStats &blasAges, ASStats &tlasAges, + RTGPUPatchingStats &gpuStats) { double now = m_Timestamp.GetMilliseconds(); + { + SCOPED_LOCK(m_TimerStatsLock); + gpuStats = m_AccumulatedStats; + m_AccumulatedStats = {}; + } + SCOPED_LOCK(m_ASBuildDataLock); blasAges.bucket[0].msThreshold = tlasAges.bucket[0].msThreshold = 50; @@ -1129,6 +1205,13 @@ PatchedRayDispatch D3D12RTManager::PatchRayDispatch(ID3D12GraphicsCommandList4 * D3D12MarkerRegion region(unwrappedCmd, "PatchRayDispatch"); + ret.resources.query = GetFreeQuery(); + + if(ret.resources.query != ~0U) + { + unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query); + } + PrepareRayDispatchBuffer(NULL); D3D12GpuBuffer *scratchBuffer = NULL; @@ -1344,6 +1427,15 @@ PatchedRayDispatch D3D12RTManager::PatchRayDispatch(ID3D12GraphicsCommandList4 * ret.resources.argumentBuffer = NULL; + if(ret.resources.query != ~0U) + { + unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query + 1); + 
unwrappedCmd->ResolveQueryData( + m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query, 2, + m_TimerReadbackBuffer->Resource(), + m_TimerReadbackBuffer->Offset() + sizeof(uint64_t) * ret.resources.query); + } + return ret; } @@ -1617,6 +1709,13 @@ ASBuildData *D3D12RTManager::CopyBuildInputs( ret->timestamp = m_Timestamp.GetMilliseconds(); ret->rtManager = this; + ret->query = GetFreeQuery(); + + if(ret->query != ~0U) + { + unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query); + } + if(inputs.Type == D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL) { ret->NumBLAS = inputs.NumDescs; @@ -1908,6 +2007,14 @@ ASBuildData *D3D12RTManager::CopyBuildInputs( m_InMemASBuildDatas.push_back(ret); } + if(ret->query != ~0U) + { + unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query + 1); + unwrappedCmd->ResolveQueryData(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query, 2, + m_TimerReadbackBuffer->Resource(), + m_TimerReadbackBuffer->Offset() + sizeof(uint64_t) * ret->query); + } + return ret; } @@ -3244,6 +3351,12 @@ void D3D12GpuBuffer::Release() } } +void ASBuildData::MarkWorkComplete() +{ + complete = true; + rtManager->AddBuildTimer(query, buffer ? buffer->Size() : 0); +} + void ASBuildData::AddRef() { InterlockedIncrement(&m_RefCount); diff --git a/renderdoc/driver/d3d12/d3d12_manager.h b/renderdoc/driver/d3d12/d3d12_manager.h index f02a17b9b..f962da22a 100644 --- a/renderdoc/driver/d3d12/d3d12_manager.h +++ b/renderdoc/driver/d3d12/d3d12_manager.h @@ -1034,6 +1034,8 @@ struct PatchedRayDispatch D3D12GpuBuffer *readbackBuffer; + uint32_t query; + // for convenience, when these resources are referenced in a queue they get a fence value to // indicate when they're safe to release. This values are unset when returned from patching or // referenced in the list and is set in each queue's copy of the references. 
@@ -1077,6 +1079,16 @@ struct ASStats uint32_t diskCached; }; +struct RTGPUPatchingStats +{ + uint32_t builds; + uint64_t buildBytes; + double totalBuildMS; + + uint32_t dispatches; + double totalDispatchesMS; +}; + // this is a refcounted GPU buffer with the build data, together with the metadata struct ASBuildData { @@ -1144,7 +1156,7 @@ struct ASBuildData // geometry GPU addresses have been de-based to contain only offsets rdcarray geoms; - void MarkWorkComplete() { complete = true; } + void MarkWorkComplete(); bool IsWorkComplete() const { return complete; } void AddRef(); @@ -1153,6 +1165,7 @@ struct ASBuildData D3D12GpuBuffer *buffer = NULL; rdcstr filename; uint64_t bytesOnDisk = 0; + uint32_t query = ~0U; std::function cleanupCallback; @@ -1205,6 +1218,7 @@ public: SAFE_RELEASE(m_RayPatchingData.indirectComSig); SAFE_RELEASE(m_RayPatchingData.indirectPrepPipe); SAFE_RELEASE(m_RayPatchingData.indirectPrepRootSig); + SAFE_RELEASE(m_TimerQueryHeap); } void InitInternalResources(); @@ -1227,7 +1241,7 @@ public: m_DiskCachedASBuildDatas.removeOne(data); } - void GatherASAgeStatistics(ASStats &blasAges, ASStats &tlasAges); + void GatherRTStatistics(ASStats &blasAges, ASStats &tlasAges, RTGPUPatchingStats &gpuStats); D3D12GpuBuffer *UnrollBLASInstancesList( ID3D12GraphicsCommandList4 *unwrappedCmd, @@ -1267,6 +1281,9 @@ public: void VerifyRecord(const uint64_t recordSize, byte *table, byte *ref, WrappedID3D12DescriptorHeap *resHeap, WrappedID3D12DescriptorHeap *sampHeap); + void AddDispatchTimer(uint32_t q); + void AddBuildTimer(uint32_t q, uint64_t size); + private: void InitRayDispatchPatchingResources(); void InitTLASInstanceCopyingResources(); @@ -1327,6 +1344,16 @@ private: ID3D12CommandSignature *indirectComSig = NULL; } m_RayPatchingData; + ID3D12QueryHeap *m_TimerQueryHeap = NULL; + D3D12GpuBuffer *m_TimerReadbackBuffer = NULL; + uint64_t *m_Timestamps = NULL; + uint64_t m_TimerFrequency; + Threading::CriticalSection m_TimerStatsLock; + rdcarray<uint32_t> 
m_FreeQueries; + RTGPUPatchingStats m_AccumulatedStats = {}; + + uint32_t GetFreeQuery(); + struct PendingASBuild { ID3D12Fence *fence;