From 0b187934de64033fefd2d2944febed32c3dd00d1 Mon Sep 17 00:00:00 2001 From: baldurk Date: Fri, 22 Jul 2022 16:09:46 +0100 Subject: [PATCH] Track bytes written for hidden SO counters. Closes #2662 * The D3D11 spec is unclear but checking sources including D3D11On12 it looks like the defined behaviour for SO counters is to calculate the number of bytes written and divide that by the VB stride on draw. * Old captures can't be updated to work with this because the stride is unknown, but new captures will work correctly as well as any data that is stream'd out mid-capture. --- renderdoc/driver/d3d11/d3d11_common.cpp | 4 + renderdoc/driver/d3d11/d3d11_context.cpp | 52 ++++-- renderdoc/driver/d3d11/d3d11_context.h | 13 +- renderdoc/driver/d3d11/d3d11_context_wrap.cpp | 171 ++++++++++++++---- renderdoc/driver/d3d11/d3d11_device.cpp | 3 + renderdoc/driver/d3d11/d3d11_device.h | 21 ++- renderdoc/driver/d3d11/d3d11_device_wrap.cpp | 68 +++++++ .../driver/shaders/dxbc/dxbc_bytecode.cpp | 41 +++++ renderdoc/driver/shaders/dxbc/dxbc_bytecode.h | 1 + .../driver/shaders/dxbc/dxbc_container.cpp | 28 +++ .../driver/shaders/dxbc/dxbc_container.h | 1 + util/test/demos/d3d11/d3d11_stream_out.cpp | 40 +++- 12 files changed, 373 insertions(+), 70 deletions(-) diff --git a/renderdoc/driver/d3d11/d3d11_common.cpp b/renderdoc/driver/d3d11/d3d11_common.cpp index 214b1462b..cc8657c12 100644 --- a/renderdoc/driver/d3d11/d3d11_common.cpp +++ b/renderdoc/driver/d3d11/d3d11_common.cpp @@ -524,6 +524,10 @@ bool D3D11InitParams::IsSupportedVersion(uint64_t ver) if(ver == 0x11) return true; + // 0x12 -> 0x13 - added stride from stream-out to hidden counter data + if(ver == 0x12) + return true; + return false; } diff --git a/renderdoc/driver/d3d11/d3d11_context.cpp b/renderdoc/driver/d3d11/d3d11_context.cpp index a06fe91ff..13465d7f1 100644 --- a/renderdoc/driver/d3d11/d3d11_context.cpp +++ b/renderdoc/driver/d3d11/d3d11_context.cpp @@ -213,11 +213,6 @@ 
WrappedID3D11DeviceContext::~WrappedID3D11DeviceContext() if(m_pRealContext && GetType() != D3D11_DEVICE_CONTEXT_IMMEDIATE) m_pDevice->RemoveDeferredContext(this); - for(auto it = m_StreamOutCounters.begin(); it != m_StreamOutCounters.end(); ++it) - { - SAFE_RELEASE(it->second.query); - } - SAFE_DELETE(m_FrameReader); SAFE_RELEASE(m_WrappedVideo.m_pReal); @@ -266,6 +261,7 @@ struct HiddenCounter { ResourceId id; uint64_t counterValue; + uint32_t stride; }; DECLARE_REFLECTION_STRUCT(HiddenCounter); @@ -275,6 +271,14 @@ void DoSerialise(SerialiserType &ser, HiddenCounter &el) { SERIALISE_MEMBER(id); SERIALISE_MEMBER(counterValue); + if(ser.VersionAtLeast(0x13)) + { + SERIALISE_MEMBER(stride); + } + else + { + el.stride = 0; + } } template @@ -316,12 +320,12 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser if(buf) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); - if(m_StreamOutCounters[id].running) + if(so.running) { - m_pRealContext->End(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = false; + m_pRealContext->End(so.query); + so.running = false; } restart[b] = true; @@ -331,7 +335,8 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser D3D11_QUERY_DATA_SO_STATISTICS numPrims; // readback all known counters - for(auto it = m_StreamOutCounters.begin(); it != m_StreamOutCounters.end(); ++it) + for(auto it = m_pDevice->GetSOHiddenCounters().begin(); + it != m_pDevice->GetSOHiddenCounters().end(); ++it) { RDCEraseEl(numPrims); @@ -350,7 +355,12 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser ToStr(it->first).c_str()); } - HiddenStreamOutCounters.push_back({it->first, (uint64_t)numPrims.NumPrimitivesWritten}); + HiddenCounter h; + h.id = it->first; + h.counterValue = (uint64_t)numPrims.NumPrimitivesWritten; + h.stride = it->second.stride; + + 
HiddenStreamOutCounters.push_back(h); } // restart any counters we were forced to stop @@ -360,10 +370,10 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser if(buf && restart[b]) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); // release any previous query as the hidden counter is overwritten - SAFE_RELEASE(m_StreamOutCounters[id].query); + SAFE_RELEASE(so.query); D3D11_QUERY queryTypes[] = { D3D11_QUERY_SO_STATISTICS_STREAM0, D3D11_QUERY_SO_STATISTICS_STREAM1, @@ -374,10 +384,12 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser qdesc.MiscFlags = 0; qdesc.Query = queryTypes[b]; - m_pDevice->GetReal()->CreateQuery(&qdesc, &m_StreamOutCounters[id].query); + m_pDevice->GetReal()->CreateQuery(&qdesc, &so.query); - m_pRealContext->Begin(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = true; + m_pRealContext->Begin(so.query); + so.running = true; + + // stride doesn't change as the shader hasn't changed } } } @@ -406,8 +418,12 @@ bool WrappedID3D11DeviceContext::Serialise_BeginCaptureFrame(SerialiserType &ser for(const HiddenCounter &c : HiddenStreamOutCounters) { if(m_pDevice->GetResourceManager()->HasLiveResource(c.id)) - m_StreamOutCounters[m_pDevice->GetResourceManager()->GetLiveID(c.id)].numPrims = - c.counterValue; + { + StreamOutData &so = + m_pDevice->GetSOHiddenCounterForBuffer(m_pDevice->GetResourceManager()->GetLiveID(c.id)); + so.numPrims = c.counterValue; + so.stride = c.stride; + } } } diff --git a/renderdoc/driver/d3d11/d3d11_context.h b/renderdoc/driver/d3d11/d3d11_context.h index bfdb335d5..bc9474e2b 100644 --- a/renderdoc/driver/d3d11/d3d11_context.h +++ b/renderdoc/driver/d3d11/d3d11_context.h @@ -125,17 +125,7 @@ private: std::set m_HighTrafficResources; std::map m_OpenMaps; - struct StreamOutData - { - StreamOutData() : query(NULL), running(false), numPrims(0) {} - ID3D11Query 
*query; - bool running; - uint64_t numPrims; - }; - - std::map m_StreamOutCounters; - - std::map > m_ResourceUses; + std::map> m_ResourceUses; WrappedID3D11Device *m_pDevice; ID3D11DeviceContext *m_pRealContext; @@ -228,6 +218,7 @@ private: void Serialise_DebugMessages(SerialiserType &ser); void DrainAnnotationQueue(); + void LatchSOProperties(); void AddUsage(const ActionDescription &a); diff --git a/renderdoc/driver/d3d11/d3d11_context_wrap.cpp b/renderdoc/driver/d3d11/d3d11_context_wrap.cpp index c70377854..3b19ae983 100644 --- a/renderdoc/driver/d3d11/d3d11_context_wrap.cpp +++ b/renderdoc/driver/d3d11/d3d11_context_wrap.cpp @@ -2209,12 +2209,14 @@ bool WrappedID3D11DeviceContext::Serialise_SOSetTargets(SerialiserType &ser, UIN if(buf) { + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); + ResourceId id = GetIDForDeviceChild(buf); - if(m_StreamOutCounters[id].running) + if(so.running) { - m_pRealContext->End(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = false; + m_pRealContext->End(so.query); + so.running = false; } } } @@ -2226,10 +2228,10 @@ bool WrappedID3D11DeviceContext::Serialise_SOSetTargets(SerialiserType &ser, UIN if(buf) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); // release any previous query as the hidden counter is overwritten - SAFE_RELEASE(m_StreamOutCounters[id].query); + SAFE_RELEASE(so.query); D3D11_QUERY queryTypes[] = { D3D11_QUERY_SO_STATISTICS_STREAM0, D3D11_QUERY_SO_STATISTICS_STREAM1, @@ -2240,13 +2242,18 @@ bool WrappedID3D11DeviceContext::Serialise_SOSetTargets(SerialiserType &ser, UIN qdesc.MiscFlags = 0; qdesc.Query = queryTypes[b]; - HRESULT hr = m_pDevice->GetReal()->CreateQuery(&qdesc, &m_StreamOutCounters[id].query); + HRESULT hr = m_pDevice->GetReal()->CreateQuery(&qdesc, &so.query); if(FAILED(hr)) RDCERR("Couldn't create streamout query: %s", ToStr(hr).c_str()); - 
m_pRealContext->Begin(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = true; + m_pRealContext->Begin(so.query); + so.running = true; + + // since we don't know the binding order (SO targets before GS, or GS before SO targets) + // we'll set this to 0 now and latch it at draw time. We assume these don't change over the + // course of the stream out. + so.stride = 0; } } @@ -2321,12 +2328,12 @@ void WrappedID3D11DeviceContext::SOSetTargets(UINT NumBuffers, ID3D11Buffer *con if(buf) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); - if(m_StreamOutCounters[id].running) + if(so.running) { - m_pRealContext->End(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = false; + m_pRealContext->End(so.query); + so.running = false; } } } @@ -2338,10 +2345,10 @@ void WrappedID3D11DeviceContext::SOSetTargets(UINT NumBuffers, ID3D11Buffer *con if(buf) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); // release any previous query as the hidden counter is overwritten - SAFE_RELEASE(m_StreamOutCounters[id].query); + SAFE_RELEASE(so.query); D3D11_QUERY queryTypes[] = { D3D11_QUERY_SO_STATISTICS_STREAM0, D3D11_QUERY_SO_STATISTICS_STREAM1, @@ -2352,10 +2359,15 @@ void WrappedID3D11DeviceContext::SOSetTargets(UINT NumBuffers, ID3D11Buffer *con qdesc.MiscFlags = 0; qdesc.Query = queryTypes[b]; - m_pDevice->GetReal()->CreateQuery(&qdesc, &m_StreamOutCounters[id].query); + m_pDevice->GetReal()->CreateQuery(&qdesc, &so.query); - m_pRealContext->Begin(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = true; + m_pRealContext->Begin(so.query); + so.running = true; + + // since we don't know the binding order (SO targets before GS, or GS before SO targets) + // we'll set this to 0 now and latch it at draw time. We assume these don't change over the + // course of the stream out. 
+ so.stride = 0; } } @@ -3795,6 +3807,27 @@ void WrappedID3D11DeviceContext::Serialise_DebugMessages(SerialiserType &ser) } } +void WrappedID3D11DeviceContext::LatchSOProperties() +{ + for(UINT b = 0; b < D3D11_SO_STREAM_COUNT; b++) + { + ID3D11Buffer *buf = m_CurrentPipelineState->SO.Buffers[b]; + + if(buf) + { + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); + + if(so.running && so.stride == 0) + { + const SOShaderData &shad = + m_pDevice->GetSOShaderData(GetIDForDeviceChild(m_CurrentPipelineState->GS.Object)); + + so.stride = shad.strides[b]; + } + } + } +} + template bool WrappedID3D11DeviceContext::Serialise_DrawIndexedInstanced( SerialiserType &ser, UINT IndexCountPerInstance, UINT InstanceCount, UINT StartIndexLocation, @@ -3815,6 +3848,8 @@ bool WrappedID3D11DeviceContext::Serialise_DrawIndexedInstanced( m_pRealContext->DrawIndexedInstanced(IndexCountPerInstance, InstanceCount, StartIndexLocation, BaseVertexLocation, StartInstanceLocation); + LatchSOProperties(); + if(IsLoading(m_State)) { RecordDrawStats(true, false, InstanceCount); @@ -3853,6 +3888,8 @@ void WrappedID3D11DeviceContext::DrawIndexedInstanced(UINT IndexCountPerInstance StartIndexLocation, BaseVertexLocation, StartInstanceLocation)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -3888,6 +3925,8 @@ bool WrappedID3D11DeviceContext::Serialise_DrawInstanced(SerialiserType &ser, m_pRealContext->DrawInstanced(VertexCountPerInstance, InstanceCount, StartVertexLocation, StartInstanceLocation); + LatchSOProperties(); + if(IsLoading(m_State)) { RecordDrawStats(true, false, InstanceCount); @@ -3923,6 +3962,8 @@ void WrappedID3D11DeviceContext::DrawInstanced(UINT VertexCountPerInstance, UINT SERIALISE_TIME_CALL(m_pRealContext->DrawInstanced(VertexCountPerInstance, InstanceCount, StartVertexLocation, StartInstanceLocation)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -3955,6 +3996,8 
@@ bool WrappedID3D11DeviceContext::Serialise_DrawIndexed(SerialiserType &ser, UINT { m_pRealContext->DrawIndexed(IndexCount, StartIndexLocation, BaseVertexLocation); + LatchSOProperties(); + if(IsLoading(m_State)) { RecordDrawStats(false, false, 1); @@ -3988,6 +4031,8 @@ void WrappedID3D11DeviceContext::DrawIndexed(UINT IndexCount, UINT StartIndexLoc SERIALISE_TIME_CALL(m_pRealContext->DrawIndexed(IndexCount, StartIndexLocation, BaseVertexLocation)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -4017,6 +4062,8 @@ bool WrappedID3D11DeviceContext::Serialise_Draw(SerialiserType &ser, UINT Vertex { m_pRealContext->Draw(VertexCount, StartVertexLocation); + LatchSOProperties(); + if(IsLoading(m_State)) { RecordDrawStats(false, false, 1); @@ -4048,6 +4095,8 @@ void WrappedID3D11DeviceContext::Draw(UINT VertexCount, UINT StartVertexLocation SERIALISE_TIME_CALL(m_pRealContext->Draw(VertexCount, StartVertexLocation)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -4069,7 +4118,7 @@ bool WrappedID3D11DeviceContext::Serialise_DrawAuto(SerialiserType &ser) SERIALISE_CHECK_READ_ERRORS(); - uint64_t numVerts = 0; + uint64_t numVertsToDraw = 0; if(IsReplayingAndReading()) { @@ -4084,7 +4133,7 @@ bool WrappedID3D11DeviceContext::Serialise_DrawAuto(SerialiserType &ser) { ResourceId id = GetIDForDeviceChild(m_CurrentPipelineState->IA.VBs[0]); - StreamOutData &data = m_StreamOutCounters[id]; + StreamOutData &data = m_pDevice->GetSOHiddenCounterForBuffer(id); // if we have a query, the stream-out data for this DrawAuto was generated // in the captured frame, so we can do a legitimate DrawAuto() @@ -4103,27 +4152,61 @@ bool WrappedID3D11DeviceContext::Serialise_DrawAuto(SerialiserType &ser) sizeof(D3D11_QUERY_DATA_SO_STATISTICS), 0); } while(hr == S_FALSE); - if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_POINTLIST) - numVerts = numPrims.NumPrimitivesWritten; - else 
if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_LINELIST) - numVerts = numPrims.NumPrimitivesWritten * 2; + if(data.stride != 0) + { + uint64_t bytesWritten = numPrims.NumPrimitivesWritten * data.stride; + + numVertsToDraw = uint32_t((bytesWritten - m_CurrentPipelineState->IA.Offsets[0]) / + m_CurrentPipelineState->IA.Strides[0]); + } else - numVerts = numPrims.NumPrimitivesWritten * 3; + { + RDCERR("Unexpected 0 stride on DrawAuto, no SO shader bound properly?"); + + // fallback to the mostly-accurate estimate + + if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_POINTLIST) + numVertsToDraw = numPrims.NumPrimitivesWritten; + else if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_LINELIST) + numVertsToDraw = numPrims.NumPrimitivesWritten * 2; + else + numVertsToDraw = numPrims.NumPrimitivesWritten * 3; + } m_pRealContext->DrawAuto(); } else { // otherwise use the cached value from the previous frame. + // in older captures we only stored the number of primitives, so use the old behaviour of + // taking the current topology and assuming it's the same, so behaviour doesn't change. 
+ // newer captures store enough information that we can do a proper byte-wise calculation - if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_POINTLIST) - numVerts = data.numPrims; - else if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_LINELIST) - numVerts = data.numPrims * 2; + if(data.stride != 0) + { + uint64_t bytesWritten = data.numPrims * data.stride; + + numVertsToDraw = uint32_t((bytesWritten - m_CurrentPipelineState->IA.Offsets[0]) / + m_CurrentPipelineState->IA.Strides[0]); + } else - numVerts = data.numPrims * 3; + { + m_pDevice->AddDebugMessage(MessageCategory::Execution, MessageSeverity::High, + MessageSource::IncorrectAPIUse, + "Call to DrawAuto may be inaccurate if topology or vertex " + "stride has changed between stream-out and draw.\n" + "Recapture with this version of RenderDoc to fix this " + "problem, this capture was created with an older version."); - m_pRealContext->Draw((UINT)numVerts, 0); + if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_POINTLIST) + numVertsToDraw = data.numPrims; + else if(m_CurrentPipelineState->IA.Topo == D3D11_PRIMITIVE_TOPOLOGY_LINELIST) + numVertsToDraw = data.numPrims * 2; + else + numVertsToDraw = data.numPrims * 3; + } + + m_pRealContext->Draw((UINT)numVertsToDraw, 0); } } @@ -4134,9 +4217,9 @@ bool WrappedID3D11DeviceContext::Serialise_DrawAuto(SerialiserType &ser) AddEvent(); ActionDescription action; - action.customName = StringFormat::Fmt("DrawAuto(<%u>)", numVerts); + action.customName = StringFormat::Fmt("DrawAuto(<%u>)", numVertsToDraw); action.flags |= ActionFlags::Drawcall | ActionFlags::Auto; - action.numIndices = (uint32_t)numVerts; + action.numIndices = (uint32_t)numVertsToDraw; action.vertexOffset = 0; action.indexOffset = 0; action.instanceOffset = 0; @@ -4195,6 +4278,8 @@ bool WrappedID3D11DeviceContext::Serialise_DrawIndexedInstancedIndirect(Serialis AlignedByteOffsetForArgs); } + LatchSOProperties(); + if(IsLoading(m_State)) { AddEvent(); @@ -4289,6 
+4374,8 @@ void WrappedID3D11DeviceContext::DrawIndexedInstancedIndirect(ID3D11Buffer *pBuf SERIALISE_TIME_CALL(m_pRealContext->DrawIndexedInstancedIndirect( UNWRAP(WrappedID3D11Buffer, pBufferForArgs), AlignedByteOffsetForArgs)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -4326,6 +4413,8 @@ bool WrappedID3D11DeviceContext::Serialise_DrawInstancedIndirect(SerialiserType AlignedByteOffsetForArgs); } + LatchSOProperties(); + if(IsLoading(m_State)) { AddEvent(); @@ -4417,6 +4506,8 @@ void WrappedID3D11DeviceContext::DrawInstancedIndirect(ID3D11Buffer *pBufferForA SERIALISE_TIME_CALL(m_pRealContext->DrawInstancedIndirect( UNWRAP(WrappedID3D11Buffer, pBufferForArgs), AlignedByteOffsetForArgs)); + LatchSOProperties(); + if(IsActiveCapturing(m_State)) { USE_SCRATCH_SERIALISER(); @@ -6455,12 +6546,12 @@ bool WrappedID3D11DeviceContext::Serialise_ClearState(SerialiserType &ser) if(buf) { - ResourceId id = GetIDForDeviceChild(buf); + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); - if(m_StreamOutCounters[id].running) + if(so.running) { - m_pRealContext->End(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = false; + m_pRealContext->End(so.query); + so.running = false; } } } @@ -6499,12 +6590,14 @@ void WrappedID3D11DeviceContext::ClearState() if(buf) { + StreamOutData &so = m_pDevice->GetSOHiddenCounterForBuffer(GetIDForDeviceChild(buf)); + ResourceId id = GetIDForDeviceChild(buf); - if(m_StreamOutCounters[id].running) + if(so.running) { - m_pRealContext->End(m_StreamOutCounters[id].query); - m_StreamOutCounters[id].running = false; + m_pRealContext->End(so.query); + so.running = false; } } } diff --git a/renderdoc/driver/d3d11/d3d11_device.cpp b/renderdoc/driver/d3d11/d3d11_device.cpp index 32f823eb2..37be59ded 100644 --- a/renderdoc/driver/d3d11/d3d11_device.cpp +++ b/renderdoc/driver/d3d11/d3d11_device.cpp @@ -265,6 +265,9 @@ WrappedID3D11Device::~WrappedID3D11Device() 
RenderDoc::Inst().RemoveDeviceFrameCapturer((ID3D11Device *)this); + for(auto it = m_StreamOutCounters.begin(); it != m_StreamOutCounters.end(); ++it) + SAFE_RELEASE(it->second.query); + for(auto it = m_CachedStateObjects.begin(); it != m_CachedStateObjects.end(); ++it) if(*it) IntRelease(*it); diff --git a/renderdoc/driver/d3d11/d3d11_device.h b/renderdoc/driver/d3d11/d3d11_device.h index c4cb13a8c..6f1b92fc1 100644 --- a/renderdoc/driver/d3d11/d3d11_device.h +++ b/renderdoc/driver/d3d11/d3d11_device.h @@ -46,6 +46,19 @@ class D3D11Replay; #define D3D11_1_UAV_SLOT_COUNT 64 #endif +struct StreamOutData +{ + ID3D11Query *query = NULL; + bool running = false; + uint64_t numPrims = 0; + uint32_t stride = 0; +}; + +struct SOShaderData +{ + uint32_t strides[4] = {}; +}; + enum TextureDisplayType { TEXDISPLAY_UNKNOWN = 0, @@ -68,7 +81,7 @@ struct D3D11InitParams uint32_t VendorUAV = ~0U; // check if a frame capture section version is supported - static const uint64_t CurrentVersion = 0x12; + static const uint64_t CurrentVersion = 0x13; static bool IsSupportedVersion(uint64_t ver); }; @@ -585,6 +598,9 @@ private: std::map > m_LayoutDescs; std::map m_LayoutShaders; + std::map m_StreamOutCounters; + std::map m_SOShaders; + static WrappedID3D11Device *m_pCurrentWrappedDevice; std::map m_SwapChains; @@ -652,6 +668,9 @@ public: void RemoveDeferredContext(WrappedID3D11DeviceContext *defctx); WrappedID3D11DeviceContext *GetDeferredContext(size_t idx); + const std::map &GetSOHiddenCounters() { return m_StreamOutCounters; } + StreamOutData &GetSOHiddenCounterForBuffer(ResourceId id) { return m_StreamOutCounters[id]; } + const SOShaderData &GetSOShaderData(ResourceId id) { return m_SOShaders[id]; } ResourceId GetResourceID() { return m_ResourceID; } const ActionDescription *GetAction(uint32_t eventId); ResourceDescription &GetResourceDesc(ResourceId id); diff --git a/renderdoc/driver/d3d11/d3d11_device_wrap.cpp b/renderdoc/driver/d3d11/d3d11_device_wrap.cpp index 
4d3d68e88..b27aae03c 100644 --- a/renderdoc/driver/d3d11/d3d11_device_wrap.cpp +++ b/renderdoc/driver/d3d11/d3d11_device_wrap.cpp @@ -1724,6 +1724,40 @@ bool WrappedID3D11Device::Serialise_CreateGeometryShaderWithStreamOutput( GetResourceManager()->AddLiveResource(pShader, ret); } + D3D_PRIMITIVE_TOPOLOGY topo = + DXBC::DXBCContainer::GetOutputTopology(pShaderBytecode, BytecodeLength); + + uint32_t vertsPerPrim = 1; + if(topo == D3D_PRIMITIVE_TOPOLOGY_POINTLIST) + vertsPerPrim = 1; + else if(topo == D3D_PRIMITIVE_TOPOLOGY_LINELIST || topo == D3D_PRIMITIVE_TOPOLOGY_LINESTRIP) + vertsPerPrim = 2; + else + vertsPerPrim = 3; + + SOShaderData &soshader = m_SOShaders[GetIDForDeviceChild(ret)]; + + for(UINT i = 0; i < NumStrides; i++) + soshader.strides[i] = pBufferStrides[i] * vertsPerPrim; + + // Undocumented, but D3D11 auto-calculates tight strides if they are not specified, based on the + // declarations (which are tightly packed) + for(UINT i = NumStrides; i < D3D11_SO_STREAM_COUNT; i++) + { + // count the entries writing to this slot + for(size_t decl = 0; decl < NumEntries; decl++) + { + if(pSODeclaration[decl].OutputSlot == i) + { + // all components are written as 32-bit values + soshader.strides[i] += pSODeclaration[decl].ComponentCount * sizeof(uint32_t); + } + } + + // still want the stride per-primitive not per-vertex + soshader.strides[i] *= vertsPerPrim; + } + AddResource(pShader, ResourceType::Shader, "Geometry Shader"); // if this shader was initialised with a shader ext UAV, pull in that chunk as one of ours // and unset it (there will be one for each create that actually used vendor extensions) @@ -1768,6 +1802,40 @@ HRESULT WrappedID3D11Device::CreateGeometryShaderWithStreamOutput( wrapped = new WrappedID3D11Shader( real, ResourceId(), (const byte *)pShaderBytecode, BytecodeLength, this); + D3D_PRIMITIVE_TOPOLOGY topo = + DXBC::DXBCContainer::GetOutputTopology(pShaderBytecode, BytecodeLength); + + uint32_t vertsPerPrim = 1; + if(topo == D3D_PRIMITIVE_TOPOLOGY_POINTLIST) + vertsPerPrim = 1; + else
if(topo == D3D_PRIMITIVE_TOPOLOGY_LINELIST || topo == D3D_PRIMITIVE_TOPOLOGY_LINESTRIP) + vertsPerPrim = 2; + else + vertsPerPrim = 3; + + SOShaderData &soshader = m_SOShaders[GetIDForDeviceChild(wrapped)]; + + for(UINT i = 0; i < NumStrides; i++) + soshader.strides[i] = pBufferStrides[i] * vertsPerPrim; + + // Undocumented, but D3D11 auto-calculates tight strides if they are not specified, based on the + // declarations (which are tightly packed) + for(UINT i = NumStrides; i < D3D11_SO_STREAM_COUNT; i++) + { + // count the entries writing to this slot + for(size_t decl = 0; decl < NumEntries; decl++) + { + if(pSODeclaration[decl].OutputSlot == i) + { + // all components are written as 32-bit values + soshader.strides[i] += pSODeclaration[decl].ComponentCount * sizeof(uint32_t); + } + } + + // still want the stride per-primitive not per-vertex + soshader.strides[i] *= vertsPerPrim; + } + if(IsCaptureMode(m_State)) { Chunk *vendorChunk = NULL; diff --git a/renderdoc/driver/shaders/dxbc/dxbc_bytecode.cpp b/renderdoc/driver/shaders/dxbc/dxbc_bytecode.cpp index 7fcffe903..270269af8 100644 --- a/renderdoc/driver/shaders/dxbc/dxbc_bytecode.cpp +++ b/renderdoc/driver/shaders/dxbc/dxbc_bytecode.cpp @@ -589,6 +589,47 @@ D3D_PRIMITIVE_TOPOLOGY Program::GetOutputTopology() return D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; } +D3D_PRIMITIVE_TOPOLOGY Program::GetOutputTopology(const byte *bytes, size_t length) +{ + uint32_t *begin = (uint32_t *)bytes; + uint32_t *cur = begin; + uint32_t *end = begin + (length / sizeof(uint32_t)); + + // skip version and length + cur += 2; + + while(cur < end) + { + uint32_t OpcodeToken0 = cur[0]; + + OpcodeType op = Opcode::Type.Get(OpcodeToken0); + + // NOTE(review): the DXBC token format documents this declaration as one token with the + // topology packed into OpcodeToken0 - confirm that decoding the following token is intended + if(op == OPCODE_DCL_GS_OUTPUT_PRIMITIVE_TOPOLOGY) + { + uint32_t *tokenStream = cur; + + // skip opcode and length + tokenStream++; + + return Decl::OutputPrimitiveTopology.Get(tokenStream[0]); + } + + if(op == OPCODE_CUSTOMDATA) + { + // length in opcode token is
0, full length is in second dword + cur += cur[1]; + } + else + { + cur += Opcode::Length.Get(OpcodeToken0); + } + } + + return D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; +} + void Program::SetupRegisterFile(rdcarray &registers) const { size_t numRegisters = m_NumTemps + m_IndexTempSizes.size() + m_NumOutputs; diff --git a/renderdoc/driver/shaders/dxbc/dxbc_bytecode.h b/renderdoc/driver/shaders/dxbc/dxbc_bytecode.h index 345285536..f22173d34 100644 --- a/renderdoc/driver/shaders/dxbc/dxbc_bytecode.h +++ b/renderdoc/driver/shaders/dxbc/dxbc_bytecode.h @@ -1235,6 +1235,7 @@ public: rdcstr GetRegisterName(OperandType oper, uint32_t index) const; static bool UsesExtensionUAV(uint32_t slot, uint32_t space, const byte *bytes, size_t length); + static D3D_PRIMITIVE_TOPOLOGY GetOutputTopology(const byte *bytes, size_t length); protected: friend class Program; diff --git a/renderdoc/driver/shaders/dxbc/dxbc_container.cpp b/renderdoc/driver/shaders/dxbc/dxbc_container.cpp index ac7f03d03..29bb1697b 100644 --- a/renderdoc/driver/shaders/dxbc/dxbc_container.cpp +++ b/renderdoc/driver/shaders/dxbc/dxbc_container.cpp @@ -458,6 +458,34 @@ D3D_PRIMITIVE_TOPOLOGY DXBCContainer::GetOutputTopology() return m_OutputTopology; } +D3D_PRIMITIVE_TOPOLOGY DXBCContainer::GetOutputTopology(const void *ByteCode, size_t ByteCodeLength) +{ + const FileHeader *header = (const FileHeader *)ByteCode; + + const byte *data = (const byte *)ByteCode; // just for convenience + + if(header->fourcc != FOURCC_DXBC) + return D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; + + if(header->fileLength != (uint32_t)ByteCodeLength) + return D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; + + const uint32_t *chunkOffsets = (const uint32_t *)(header + 1); // right after the header + + for(uint32_t chunkIdx = 0; chunkIdx < header->numChunks; chunkIdx++) + { + const uint32_t *fourcc = (const uint32_t *)(data + chunkOffsets[chunkIdx]); + const uint32_t *chunkSize = (const uint32_t *)(fourcc + 1); + + const byte *chunkContents = (const byte *)(chunkSize +
1); + + if(*fourcc == FOURCC_SHEX || *fourcc == FOURCC_SHDR) + return DXBCBytecode::Program::GetOutputTopology(chunkContents, *chunkSize); + } + + return D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; +} + const rdcstr &DXBCContainer::GetDisassembly() { if(m_Disassembly.empty()) diff --git a/renderdoc/driver/shaders/dxbc/dxbc_container.h b/renderdoc/driver/shaders/dxbc/dxbc_container.h index 216a8b5ad..db0d6638c 100644 --- a/renderdoc/driver/shaders/dxbc/dxbc_container.h +++ b/renderdoc/driver/shaders/dxbc/dxbc_container.h @@ -220,6 +220,7 @@ public: static bool CheckForDebugInfo(const void *ByteCode, size_t ByteCodeLength); static bool CheckForDXIL(const void *ByteCode, size_t ByteCodeLength); static rdcstr GetDebugBinaryPath(const void *ByteCode, size_t ByteCodeLength); + static D3D_PRIMITIVE_TOPOLOGY GetOutputTopology(const void *ByteCode, size_t ByteCodeLength); private: void TryFetchSeparateDebugInfo(bytebuf &byteCode, const rdcstr &debugInfoPath); diff --git a/util/test/demos/d3d11/d3d11_stream_out.cpp b/util/test/demos/d3d11/d3d11_stream_out.cpp index e636514a7..6abeff944 100644 --- a/util/test/demos/d3d11/d3d11_stream_out.cpp +++ b/util/test/demos/d3d11/d3d11_stream_out.cpp @@ -73,8 +73,9 @@ RD_TEST(D3D11_Stream_Out, D3D11GraphicsTest) ID3D11BufferPtr vb = MakeBuffer().Vertex().Data(DefaultTri); - ID3D11BufferPtr so[2] = { + ID3D11BufferPtr so[3] = { MakeBuffer().StreamOut().Vertex().Size(2048), MakeBuffer().StreamOut().Vertex().Size(2048), + MakeBuffer().StreamOut().Vertex().Size(2048), }; D3D11_INPUT_ELEMENT_DESC layoutdesc[] = { @@ -93,6 +94,29 @@ RD_TEST(D3D11_Stream_Out, D3D11GraphicsTest) CHECK_HR(dev->CreateInputLayout(layoutdesc, ARRAY_COUNT(layoutdesc), vsblob->GetBufferPointer(), vsblob->GetBufferSize(), &streamoutLayout)); + // pre fill buffer 2 with pre-frame data + { + ctx->ClearState(); + + IASetVertexBuffer(vb, sizeof(DefaultA2V), 0); + ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + ctx->IASetInputLayout(defaultLayout); + + 
ctx->VSSetShader(vs, NULL, 0); + ctx->GSSetShader(gs, NULL, 0); + ctx->PSSetShader(ps, NULL, 0); + + RSSetViewport({0.0f, 0.0f, (float)screenWidth, (float)screenHeight, 0.0f, 1.0f}); + + ctx->OMSetRenderTargets(1, &bbRTV.GetInterfacePtr(), NULL); + + ID3D11Buffer *bufs[] = {so[2], so[1]}; + UINT offs[2] = {0}; + ctx->SOSetTargets(2, bufs, offs); + + ctx->Draw(3, 0); + } + while(Running()) { ctx->ClearState(); @@ -156,7 +180,9 @@ RD_TEST(D3D11_Stream_Out, D3D11GraphicsTest) ctx->IASetVertexBuffers(0, 2, bufs, &strides[0], offs); ctx->IASetInputLayout(streamoutLayout); + ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST); ctx->DrawAuto(); + ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); RSSetViewport({0.0f, 0.0f, (float)screenWidth, (float)screenHeight, 0.0f, 1.0f}); @@ -203,8 +229,20 @@ RD_TEST(D3D11_Stream_Out, D3D11GraphicsTest) ctx->IASetInputLayout(streamoutLayout); ctx->DrawAuto(); + ctx->SOSetTargets(0, NULL, NULL); + + RSSetViewport({(screenWidth * 3.0f) / 4.0f, 0.0f, (float)screenWidth / 4.0f, + (float)screenHeight / 4.0f, 0.0f, 1.0f}); + + bufs[0] = so[2]; + ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST); + ctx->IASetVertexBuffers(0, 2, bufs, &strides[0], offs); + ctx->DrawAuto(); + // leave stream-out buffers bound at the end of the frame ctx->ClearState(); + bufs[0] = so[1]; + bufs[1] = so[0]; ctx->SOSetTargets(2, bufs, offs); Present();