Add stat tracking of GPU overhead during capture

This commit is contained in:
baldurk
2024-11-22 18:13:20 +00:00
parent 9bff5fe29f
commit f6f4bc9558
4 changed files with 152 additions and 4 deletions
@@ -671,6 +671,7 @@ void WrappedID3D12CommandQueue::CheckAndFreeRayDispatches()
{
if(signalled >= ray.fenceValue)
{
GetResourceManager()->GetRTManager()->AddDispatchTimer(ray.query);
ray.Release();
}
}
+11 -1
View File
@@ -2444,8 +2444,9 @@ HRESULT WrappedID3D12Device::Present(ID3D12GraphicsCommandList *pOverlayCommandL
if(D3D12_Debug_RT_Overlay() && m_UsedRT)
{
ASStats blasStats = {}, tlasStats = {};
RTGPUPatchingStats gpuStats = {};
GetResourceManager()->GetRTManager()->GatherASAgeStatistics(blasStats, tlasStats);
GetResourceManager()->GetRTManager()->GatherRTStatistics(blasStats, tlasStats, gpuStats);
overlayText += " TLAS BLAS\n";
@@ -2467,6 +2468,15 @@ HRESULT WrappedID3D12Device::Present(ID3D12GraphicsCommandList *pOverlayCommandL
float(blasStats.overheadBytes + tlasStats.overheadBytes) / 1048576.0f,
float(blasStats.diskBytes + tlasStats.diskBytes) / 1048576.0f, blasStats.diskCached,
tlasStats.diskCached);
overlayText += StringFormat::Fmt(
"%3u BLAS input copies with %9.2f KB in %5.2f ms = %9.2f MB/s\n"
"%2u dispatches patched in %4.2f ms\n",
gpuStats.builds, float(gpuStats.buildBytes) / 1024.0f, gpuStats.totalBuildMS,
gpuStats.totalBuildMS == 0.0
? 0.0
: (float(gpuStats.buildBytes) / 1048576.0f) / (gpuStats.totalBuildMS / 1024.0f),
gpuStats.dispatches, gpuStats.totalDispatchesMS);
}
m_TextRenderer->RenderText(list, 0.0f, 0.0f, overlayText);
+111 -1
View File
@@ -925,6 +925,75 @@ void D3D12RTManager::VerifyRecord(const uint64_t recordSize, byte *wrappedRecord
RDCASSERT(memcmp(record.data(), unwrappedRef, record.size()) == 0);
}
uint32_t D3D12RTManager::GetFreeQuery()
{
SCOPED_LOCK(m_TimerStatsLock);
if(m_TimerQueryHeap == NULL)
{
D3D12_QUERY_HEAP_DESC timerQueryDesc;
// allow for up to 50 dispatches per frame, 500 AS builds, and assume 5 frames before we see the results
timerQueryDesc.Count = (50 + 500) * 5 * 2;
timerQueryDesc.NodeMask = 1;
timerQueryDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
HRESULT hr = m_wrappedDevice->GetReal()->CreateQueryHeap(
&timerQueryDesc, __uuidof(ID3D12QueryHeap), (void **)&m_TimerQueryHeap);
CHECK_HR(m_wrappedDevice, hr);
if(FAILED(hr))
RDCERR("Failed to create timer query heap HRESULT: %s", ToStr(hr).c_str());
m_GPUBufferAllocator.Alloc(D3D12GpuBufferHeapType::ReadBackHeap,
D3D12GpuBufferHeapMemoryFlag::Default,
timerQueryDesc.Count * sizeof(UINT64), 64, &m_TimerReadbackBuffer);
if(m_TimerReadbackBuffer && m_TimerQueryHeap)
{
m_Timestamps = (uint64_t *)m_TimerReadbackBuffer->Map();
for(uint32_t i = 0; i < timerQueryDesc.Count; i += 2)
m_FreeQueries.push_back(i);
}
m_wrappedDevice->GetQueue()->GetTimestampFrequency(&m_TimerFrequency);
}
if(!m_FreeQueries.empty())
return m_FreeQueries.takeAt(m_FreeQueries.size() - 1);
return ~0U;
}
void D3D12RTManager::AddDispatchTimer(uint32_t q)
{
// could track this maybe, for now drop it on the floor
if(q == ~0U)
return;
uint64_t *timestamps = m_Timestamps + q;
{
SCOPED_LOCK(m_TimerStatsLock);
m_AccumulatedStats.dispatches++;
m_AccumulatedStats.totalDispatchesMS +=
((timestamps[1] - timestamps[0]) / double(m_TimerFrequency)) * 1024.0;
m_FreeQueries.push_back(q);
}
}
void D3D12RTManager::AddBuildTimer(uint32_t q, uint64_t size)
{
if(q == ~0U)
return;
uint64_t *timestamps = m_Timestamps + q;
{
SCOPED_LOCK(m_TimerStatsLock);
m_AccumulatedStats.builds++;
m_AccumulatedStats.buildBytes += size;
m_AccumulatedStats.totalBuildMS +=
((timestamps[1] - timestamps[0]) / double(m_TimerFrequency)) * 1024.0;
m_FreeQueries.push_back(q);
}
}
void D3D12RTManager::AddPendingASBuilds(ID3D12Fence *fence, UINT64 waitValue,
const rdcarray<std::function<bool()>> &callbacks)
{
@@ -1063,10 +1132,14 @@ void D3D12RTManager::CheckPendingASBuilds()
m_PendingASBuilds.removeIf([](const PendingASBuild &build) { return build.fence == NULL; });
}
void D3D12RTManager::GatherASAgeStatistics(ASStats &blasAges, ASStats &tlasAges)
void D3D12RTManager::GatherRTStatistics(ASStats &blasAges, ASStats &tlasAges,
RTGPUPatchingStats &gpuStats)
{
double now = m_Timestamp.GetMilliseconds();
gpuStats = m_AccumulatedStats;
m_AccumulatedStats = {};
SCOPED_LOCK(m_ASBuildDataLock);
blasAges.bucket[0].msThreshold = tlasAges.bucket[0].msThreshold = 50;
@@ -1129,6 +1202,13 @@ PatchedRayDispatch D3D12RTManager::PatchRayDispatch(ID3D12GraphicsCommandList4 *
D3D12MarkerRegion region(unwrappedCmd, "PatchRayDispatch");
ret.resources.query = GetFreeQuery();
if(ret.resources.query != ~0U)
{
unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query);
}
PrepareRayDispatchBuffer(NULL);
D3D12GpuBuffer *scratchBuffer = NULL;
@@ -1344,6 +1424,15 @@ PatchedRayDispatch D3D12RTManager::PatchRayDispatch(ID3D12GraphicsCommandList4 *
ret.resources.argumentBuffer = NULL;
if(ret.resources.query != ~0U)
{
unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query + 1);
unwrappedCmd->ResolveQueryData(
m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret.resources.query, 2,
m_TimerReadbackBuffer->Resource(),
m_TimerReadbackBuffer->Offset() + sizeof(uint64_t) * ret.resources.query);
}
return ret;
}
@@ -1617,6 +1706,13 @@ ASBuildData *D3D12RTManager::CopyBuildInputs(
ret->timestamp = m_Timestamp.GetMilliseconds();
ret->rtManager = this;
ret->query = GetFreeQuery();
if(ret->query != ~0U)
{
unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query);
}
if(inputs.Type == D3D12_RAYTRACING_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL)
{
ret->NumBLAS = inputs.NumDescs;
@@ -1908,6 +2004,14 @@ ASBuildData *D3D12RTManager::CopyBuildInputs(
m_InMemASBuildDatas.push_back(ret);
}
if(ret->query != ~0U)
{
unwrappedCmd->EndQuery(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query + 1);
unwrappedCmd->ResolveQueryData(m_TimerQueryHeap, D3D12_QUERY_TYPE_TIMESTAMP, ret->query, 2,
m_TimerReadbackBuffer->Resource(),
m_TimerReadbackBuffer->Offset() + sizeof(uint64_t) * ret->query);
}
return ret;
}
@@ -3244,6 +3348,12 @@ void D3D12GpuBuffer::Release()
}
}
// Flags this build data's GPU work as complete and reports its timer query (if any) to the
// owning RT manager, attributing the size of the build data buffer - or 0 when there is no
// buffer - to the build statistics.
void ASBuildData::MarkWorkComplete()
{
  complete = true;

  rtManager->AddBuildTimer(query, buffer ? buffer->Size() : 0);

  // AddBuildTimer returns the query slot to the manager's free list, so this object no longer
  // owns it. Forget the index so a repeated call can't count the build twice or recycle a slot
  // that may since have been handed to another timer.
  query = ~0U;
}
void ASBuildData::AddRef()
{
InterlockedIncrement(&m_RefCount);
+29 -2
View File
@@ -1034,6 +1034,8 @@ struct PatchedRayDispatch
D3D12GpuBuffer *readbackBuffer;
uint32_t query;
// for convenience, when these resources are referenced in a queue they get a fence value to
// indicate when they're safe to release. These values are unset when returned from patching or
// referenced in the list and is set in each queue's copy of the references.
@@ -1077,6 +1079,16 @@ struct ASStats
uint32_t diskCached;
};
// Accumulated GPU timing statistics for the extra work done for ray tracing during capture.
// Every member is zero-initialised so a default-constructed instance is a valid empty total.
struct RTGPUPatchingStats
{
  // number of timed AS build input copies
  uint32_t builds = 0;
  // total bytes attributed to those copies
  uint64_t buildBytes = 0;
  // total GPU time of those copies, intended to be in milliseconds
  double totalBuildMS = 0.0;

  // number of timed ray dispatch patches
  uint32_t dispatches = 0;
  // total GPU time of those patches, intended to be in milliseconds
  double totalDispatchesMS = 0.0;
};
// this is a refcounted GPU buffer with the build data, together with the metadata
struct ASBuildData
{
@@ -1144,7 +1156,7 @@ struct ASBuildData
// geometry GPU addresses have been de-based to contain only offsets
rdcarray<RTGeometryDesc> geoms;
void MarkWorkComplete() { complete = true; }
void MarkWorkComplete();
bool IsWorkComplete() const { return complete; }
void AddRef();
@@ -1153,6 +1165,7 @@ struct ASBuildData
D3D12GpuBuffer *buffer = NULL;
rdcstr filename;
uint64_t bytesOnDisk = 0;
uint32_t query = 0;
std::function<bool()> cleanupCallback;
@@ -1205,6 +1218,7 @@ public:
SAFE_RELEASE(m_RayPatchingData.indirectComSig);
SAFE_RELEASE(m_RayPatchingData.indirectPrepPipe);
SAFE_RELEASE(m_RayPatchingData.indirectPrepRootSig);
SAFE_RELEASE(m_TimerQueryHeap);
}
void InitInternalResources();
@@ -1227,7 +1241,7 @@ public:
m_DiskCachedASBuildDatas.removeOne(data);
}
void GatherASAgeStatistics(ASStats &blasAges, ASStats &tlasAges);
void GatherRTStatistics(ASStats &blasAges, ASStats &tlasAges, RTGPUPatchingStats &gpuStats);
D3D12GpuBuffer *UnrollBLASInstancesList(
ID3D12GraphicsCommandList4 *unwrappedCmd,
@@ -1267,6 +1281,9 @@ public:
void VerifyRecord(const uint64_t recordSize, byte *table, byte *ref,
WrappedID3D12DescriptorHeap *resHeap, WrappedID3D12DescriptorHeap *sampHeap);
void AddDispatchTimer(uint32_t q);
void AddBuildTimer(uint32_t q, uint64_t size);
private:
void InitRayDispatchPatchingResources();
void InitTLASInstanceCopyingResources();
@@ -1327,6 +1344,16 @@ private:
ID3D12CommandSignature *indirectComSig = NULL;
} m_RayPatchingData;
ID3D12QueryHeap *m_TimerQueryHeap = NULL;
D3D12GpuBuffer *m_TimerReadbackBuffer = NULL;
uint64_t *m_Timestamps = NULL;
uint64_t m_TimerFrequency;
Threading::CriticalSection m_TimerStatsLock;
rdcarray<uint32_t> m_FreeQueries;
RTGPUPatchingStats m_AccumulatedStats = {};
uint32_t GetFreeQuery();
struct PendingASBuild
{
ID3D12Fence *fence;