From 27aec9b277a3c2cb24fb83b922f20bb768533052 Mon Sep 17 00:00:00 2001 From: baldurk Date: Thu, 8 Oct 2015 11:20:20 +0200 Subject: [PATCH] Add thread-local temp memory pool for storing temp unwrapped structs * Avoids needing to do lots of small allocations --- renderdoc/driver/vulkan/vk_core.cpp | 29 +++++++++ renderdoc/driver/vulkan/vk_core.h | 14 +++++ .../driver/vulkan/wrappers/vk_cmd_funcs.cpp | 52 +++++++--------- .../vulkan/wrappers/vk_descriptor_funcs.cpp | 62 ++++++++----------- .../vulkan/wrappers/vk_device_funcs.cpp | 5 +- .../driver/vulkan/wrappers/vk_misc_funcs.cpp | 6 +- .../driver/vulkan/wrappers/vk_queue_funcs.cpp | 9 +-- .../vulkan/wrappers/vk_shader_funcs.cpp | 26 ++++---- 8 files changed, 108 insertions(+), 95 deletions(-) diff --git a/renderdoc/driver/vulkan/vk_core.cpp b/renderdoc/driver/vulkan/vk_core.cpp index 844eae5a8..ba0dbca92 100644 --- a/renderdoc/driver/vulkan/vk_core.cpp +++ b/renderdoc/driver/vulkan/vk_core.cpp @@ -412,6 +412,35 @@ const char * WrappedVulkan::GetChunkName(uint32_t idx) return VkChunkNames[idx-FIRST_CHUNK_ID]; } +byte *WrappedVulkan::GetTempMemory(size_t s) +{ + TempMem *mem = (TempMem *)Threading::GetTLSValue(tempMemoryTLSSlot); + if(mem && mem->size >= s) return mem->memory; + + // alloc or grow alloc + TempMem *newmem = mem; + + if(!newmem) newmem = new TempMem(); + + // free old memory, don't need to keep contents + if(newmem->memory) delete[] newmem->memory; + + // alloc new memory + newmem->size = s; + newmem->memory = new byte[s]; + + Threading::SetTLSValue(tempMemoryTLSSlot, (void *)newmem); + + // if this is entirely new, save it for deletion on shutdown + if(!mem) + { + SCOPED_LOCK(m_ThreadTempMemLock); + m_ThreadTempMem.push_back(newmem); + } + + return newmem->memory; +} + Serialiser *WrappedVulkan::GetThreadSerialiser() { Serialiser *ser = (Serialiser *)Threading::GetTLSValue(threadSerialiserTLSSlot); diff --git a/renderdoc/driver/vulkan/vk_core.h b/renderdoc/driver/vulkan/vk_core.h index fd3c800e0..5a110bd30 100644 --- a/renderdoc/driver/vulkan/vk_core.h +++ b/renderdoc/driver/vulkan/vk_core.h @@ -116,6 +116,16 @@ private: Threading::CriticalSection m_ThreadSerialisersLock; vector m_ThreadSerialisers; + + uint64_t tempMemoryTLSSlot; + struct TempMem + { + TempMem() : memory(NULL), size(0) {} + byte *memory; + size_t size; + }; + Threading::CriticalSection m_ThreadTempMemLock; + vector m_ThreadTempMem; VulkanReplay m_Replay; @@ -379,6 +389,10 @@ private: static const char *GetChunkName(uint32_t idx); + // returns thread-local temporary memory + byte *GetTempMemory(size_t s); + template T *GetTempArray(uint32_t arraycount) { return (T*)GetTempMemory(sizeof(T)*arraycount); } + Serialiser *GetThreadSerialiser(); Serialiser *GetMainSerialiser() { return m_pSerialiser; } diff --git a/renderdoc/driver/vulkan/wrappers/vk_cmd_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_cmd_funcs.cpp index 86285e0d0..6d7603efd 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_cmd_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_cmd_funcs.cpp @@ -819,16 +819,11 @@ void WrappedVulkan::vkCmdBindDescriptorSets( uint32_t dynamicOffsetCount, const uint32_t* pDynamicOffsets) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkDescriptorSet *unwrapped = new VkDescriptorSet[setCount]; - for(uint32_t i=0; i < setCount; i++) - unwrapped[i] = Unwrap(pDescriptorSets[i]); + VkDescriptorSet *unwrapped = GetTempArray(setCount); + for(uint32_t i=0; i < setCount; i++) unwrapped[i] = Unwrap(pDescriptorSets[i]); ObjDisp(cmdBuffer)->CmdBindDescriptorSets(Unwrap(cmdBuffer), pipelineBindPoint, Unwrap(layout), firstSet, setCount, unwrapped, dynamicOffsetCount, pDynamicOffsets); - SAFE_DELETE_ARRAY(unwrapped); - if(m_State >= WRITING) { VkResourceRecord *record = GetRecord(cmdBuffer); @@ -1132,16 +1127,11 @@ void WrappedVulkan::vkCmdBindVertexBuffers( const VkBuffer* pBuffers, const VkDeviceSize* pOffsets) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkBuffer *unwrapped = new VkBuffer[bindingCount]; - for(uint32_t i=0; i < bindingCount; i++) - unwrapped[i] = Unwrap(pBuffers[i]); + VkBuffer *unwrapped = GetTempArray(bindingCount); + for(uint32_t i=0; i < bindingCount; i++) unwrapped[i] = Unwrap(pBuffers[i]); ObjDisp(cmdBuffer)->CmdBindVertexBuffers(Unwrap(cmdBuffer), startBinding, bindingCount, unwrapped, pOffsets); - SAFE_DELETE_ARRAY(unwrapped); - if(m_State >= WRITING) { VkResourceRecord *record = GetRecord(cmdBuffer); @@ -1313,16 +1303,16 @@ void WrappedVulkan::vkCmdPipelineBarrier( { { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - vector im; - vector buf; + // conservatively request memory for worst case to avoid needing to iterate + // twice to count + byte *memory = GetTempMemory( ( sizeof(void*) + sizeof(VkImageMemoryBarrier) + sizeof(VkBufferMemoryBarrier) )*memBarrierCount); - // ensure we don't resize while looping so we can take pointers - im.reserve(memBarrierCount); - buf.reserve(memBarrierCount); + VkImageMemoryBarrier *im = (VkImageMemoryBarrier *)memory; + VkBufferMemoryBarrier *buf = (VkBufferMemoryBarrier *)(im + memBarrierCount); - void **unwrappedBarriers = new void*[memBarrierCount]; + size_t imCount = 0, bufCount = 0; + + void **unwrappedBarriers = (void **)(buf + memBarrierCount); for(uint32_t i=0; i < memBarrierCount; i++) { @@ -1330,17 +1320,21 @@ void WrappedVulkan::vkCmdPipelineBarrier( if(header->sType == VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER) { - VkImageMemoryBarrier barrier = *(VkImageMemoryBarrier *)header; + VkImageMemoryBarrier &barrier = im[imCount]; + barrier = *(VkImageMemoryBarrier *)header; barrier.image = Unwrap(barrier.image); - im.push_back(barrier); - unwrappedBarriers[i] = &im.back(); + unwrappedBarriers[i] = &im[imCount]; + + imCount++; } else if(header->sType == VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER) { - VkBufferMemoryBarrier barrier = *(VkBufferMemoryBarrier *)header; + VkBufferMemoryBarrier &barrier = buf[bufCount]; + barrier = *(VkBufferMemoryBarrier *)header; barrier.buffer = Unwrap(barrier.buffer); - buf.push_back(barrier); - unwrappedBarriers[i] = &buf.back(); + unwrappedBarriers[i] = &buf[bufCount]; + + bufCount++; } else { @@ -1349,8 +1343,6 @@ void WrappedVulkan::vkCmdPipelineBarrier( } ObjDisp(cmdBuffer)->CmdPipelineBarrier(Unwrap(cmdBuffer), srcStageMask, destStageMask, byRegion, memBarrierCount, unwrappedBarriers); - - SAFE_DELETE_ARRAY(unwrappedBarriers); } if(m_State >= WRITING) diff --git a/renderdoc/driver/vulkan/wrappers/vk_descriptor_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_descriptor_funcs.cpp index 895478d0f..d6ba402cf 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_descriptor_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_descriptor_funcs.cpp @@ -139,16 +139,24 @@ VkResult WrappedVulkan::vkCreateDescriptorSetLayout( const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayout* pSetLayout) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkDescriptorSetLayoutBinding *unwrapped = new VkDescriptorSetLayoutBinding[pCreateInfo->count]; + size_t tempmemSize = sizeof(VkDescriptorSetLayoutBinding)*pCreateInfo->count; + + // need to count how many VkSampler arrays to allocate for + for(uint32_t i=0; i < pCreateInfo->count; i++) + if(pCreateInfo->pBinding[i].pImmutableSamplers) tempmemSize += pCreateInfo->pBinding[i].arraySize; + + byte *memory = GetTempMemory(tempmemSize); + + VkDescriptorSetLayoutBinding *unwrapped = (VkDescriptorSetLayoutBinding *)memory; + VkSampler *nextSampler = (VkSampler *)(unwrapped + pCreateInfo->count); + for(uint32_t i=0; i < pCreateInfo->count; i++) { unwrapped[i] = pCreateInfo->pBinding[i]; if(unwrapped[i].pImmutableSamplers) { - VkSampler *unwrappedSamplers = new VkSampler[unwrapped[i].arraySize]; + VkSampler *unwrappedSamplers = nextSampler; nextSampler += unwrapped[i].arraySize; for(uint32_t j=0; j < unwrapped[i].arraySize; j++) unwrappedSamplers[j] = Unwrap(unwrapped[i].pImmutableSamplers[j]); unwrapped[i].pImmutableSamplers = unwrappedSamplers; @@ -158,10 +166,6 @@ VkResult WrappedVulkan::vkCreateDescriptorSetLayout( VkDescriptorSetLayoutCreateInfo unwrappedInfo = *pCreateInfo; unwrappedInfo.pBinding = unwrapped; VkResult ret = ObjDisp(device)->CreateDescriptorSetLayout(Unwrap(device), &unwrappedInfo, pSetLayout); - - for(uint32_t i=0; i < pCreateInfo->count; i++) - delete[] unwrapped[i].pImmutableSamplers; - SAFE_DELETE_ARRAY(unwrapped); if(ret == VK_SUCCESS) { @@ -245,15 +249,10 @@ VkResult WrappedVulkan::vkAllocDescriptorSets( VkDescriptorSet* pDescriptorSets, uint32_t* pCount) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkDescriptorSetLayout *unwrapped = new VkDescriptorSetLayout[count]; - for(uint32_t i=0; i < count; i++) - unwrapped[i] = Unwrap(pSetLayouts[i]); + VkDescriptorSetLayout *unwrapped = GetTempArray(count); + for(uint32_t i=0; i < count; i++) unwrapped[i] = Unwrap(pSetLayouts[i]); VkResult ret = ObjDisp(device)->AllocDescriptorSets(Unwrap(device), Unwrap(descriptorPool), setUsage, count, unwrapped, pDescriptorSets, pCount); - - SAFE_DELETE_ARRAY(unwrapped); RDCASSERT(pCount == NULL || *pCount == count); // VKTODOMED: find out what *pCount < count means @@ -309,11 +308,8 @@ VkResult WrappedVulkan::vkFreeDescriptorSets( uint32_t count, const VkDescriptorSet* pDescriptorSets) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkDescriptorSet *unwrapped = new VkDescriptorSet[count]; - for(uint32_t i=0; i < count; i++) - unwrapped[i] = Unwrap(pDescriptorSets[i]); + VkDescriptorSet *unwrapped = GetTempArray(count); + for(uint32_t i=0; i < count; i++) unwrapped[i] = Unwrap(pDescriptorSets[i]); for(uint32_t i=0; i < count; i++) GetResourceManager()->ReleaseWrappedResource(pDescriptorSets[i]); @@ -451,27 +447,24 @@ VkResult WrappedVulkan::vkUpdateDescriptorSets( VkResult ret = VK_SUCCESS; { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - vector desc; - + // need to count up number of descriptor infos, to be able to alloc enough space uint32_t numInfos = 0; for(uint32_t i=0; i < writeCount; i++) numInfos += pDescriptorWrites[i].count; - - // ensure we don't resize while looping so we can take pointers - desc.resize(numInfos); - - VkWriteDescriptorSet *unwrappedWrites = new VkWriteDescriptorSet[writeCount]; - VkCopyDescriptorSet *unwrappedCopies = new VkCopyDescriptorSet[copyCount]; - uint32_t curInfo = 0; + byte *memory = GetTempMemory(sizeof(VkDescriptorInfo)*numInfos + + sizeof(VkWriteDescriptorSet)*writeCount + sizeof(VkCopyDescriptorSet)*copyCount); + + VkWriteDescriptorSet *unwrappedWrites = (VkWriteDescriptorSet *)memory; + VkCopyDescriptorSet *unwrappedCopies = (VkCopyDescriptorSet *)(unwrappedWrites + writeCount); + VkDescriptorInfo *nextDescriptors = (VkDescriptorInfo *)(unwrappedCopies + copyCount); + for(uint32_t i=0; i < writeCount; i++) { unwrappedWrites[i] = pDescriptorWrites[i]; unwrappedWrites[i].destSet = Unwrap(unwrappedWrites[i].destSet); - VkDescriptorInfo *unwrappedInfos = &desc[curInfo]; - curInfo += pDescriptorWrites[i].count; + VkDescriptorInfo *unwrappedInfos = nextDescriptors; + nextDescriptors += pDescriptorWrites[i].count; for(uint32_t j=0; j < pDescriptorWrites[i].count; j++) { @@ -493,9 +486,6 @@ VkResult WrappedVulkan::vkUpdateDescriptorSets( } ret = ObjDisp(device)->UpdateDescriptorSets(Unwrap(device), writeCount, unwrappedWrites, copyCount, unwrappedCopies); - - SAFE_DELETE_ARRAY(unwrappedWrites); - SAFE_DELETE_ARRAY(unwrappedCopies); } if(ret == VK_SUCCESS) diff --git a/renderdoc/driver/vulkan/wrappers/vk_device_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_device_funcs.cpp index 9f3bfe2ab..d9f506212 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_device_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_device_funcs.cpp @@ -39,10 +39,11 @@ VkResult WrappedVulkan::vkCreateInstance( VkResult ret = GetInstanceDispatchTable(*pInstance)->CreateInstance(pCreateInfo, &inst); - // VKTODOHIGH need to deallocate this + // VKTODOHIGH need to deallocate these threadSerialiserTLSSlot = Threading::AllocateTLSSlot(); + tempMemoryTLSSlot = Threading::AllocateTLSSlot(); - // VKTODOHIGH need to deallocate m_ThreadSerialisers + // VKTODOHIGH need to deallocate m_ThreadSerialisers and m_ThreadTempMem GetResourceManager()->WrapResource(inst, inst); diff --git a/renderdoc/driver/vulkan/wrappers/vk_misc_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_misc_funcs.cpp index 784b09a82..cfd9a04fd 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_misc_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_misc_funcs.cpp @@ -499,9 +499,7 @@ VkResult WrappedVulkan::vkCreateFramebuffer( const VkFramebufferCreateInfo* pCreateInfo, VkFramebuffer* pFramebuffer) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkAttachmentBindInfo *unwrapped = new VkAttachmentBindInfo[pCreateInfo->attachmentCount]; + VkAttachmentBindInfo *unwrapped = GetTempArray(pCreateInfo->attachmentCount); for(uint32_t i=0; i < pCreateInfo->attachmentCount; i++) { unwrapped[i] = pCreateInfo->pAttachments[i]; @@ -514,8 +512,6 @@ VkResult WrappedVulkan::vkCreateFramebuffer( VkResult ret = ObjDisp(device)->CreateFramebuffer(Unwrap(device), &unwrappedInfo, pFramebuffer); - SAFE_DELETE_ARRAY(unwrapped); - if(ret == VK_SUCCESS) { ResourceId id = GetResourceManager()->WrapResource(Unwrap(device), *pFramebuffer); diff --git a/renderdoc/driver/vulkan/wrappers/vk_queue_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_queue_funcs.cpp index f94d1aea9..f74caad10 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_queue_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_queue_funcs.cpp @@ -325,16 +325,11 @@ VkResult WrappedVulkan::vkQueueSubmit( const VkCmdBuffer* pCmdBuffers, VkFence fence) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkCmdBuffer *unwrapped = new VkCmdBuffer[cmdBufferCount]; - for(uint32_t i=0; i < cmdBufferCount; i++) - unwrapped[i] = Unwrap(pCmdBuffers[i]); + VkCmdBuffer *unwrapped = GetTempArray(cmdBufferCount); + for(uint32_t i=0; i < cmdBufferCount; i++) unwrapped[i] = Unwrap(pCmdBuffers[i]); VkResult ret = ObjDisp(queue)->QueueSubmit(Unwrap(queue), cmdBufferCount, unwrapped, Unwrap(fence)); - SAFE_DELETE_ARRAY(unwrapped); - // VKTODOHIGH when maps are intercepted with local buffers, this will have to be // done when not in capframe :(. if(m_State == WRITING_CAPFRAME) diff --git a/renderdoc/driver/vulkan/wrappers/vk_shader_funcs.cpp b/renderdoc/driver/vulkan/wrappers/vk_shader_funcs.cpp index ac8ba8917..a54acd404 100644 --- a/renderdoc/driver/vulkan/wrappers/vk_shader_funcs.cpp +++ b/renderdoc/driver/vulkan/wrappers/vk_shader_funcs.cpp @@ -64,19 +64,14 @@ VkResult WrappedVulkan::vkCreatePipelineLayout( const VkPipelineLayoutCreateInfo* pCreateInfo, VkPipelineLayout* pPipelineLayout) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkDescriptorSetLayout *unwrapped = new VkDescriptorSetLayout[pCreateInfo->descriptorSetCount]; - for(uint32_t i=0; i < pCreateInfo->descriptorSetCount; i++) - unwrapped[i] = Unwrap(pCreateInfo->pSetLayouts[i]); + VkDescriptorSetLayout *unwrapped = GetTempArray(pCreateInfo->descriptorSetCount); + for(uint32_t i=0; i < pCreateInfo->descriptorSetCount; i++) unwrapped[i] = Unwrap(pCreateInfo->pSetLayouts[i]); VkPipelineLayoutCreateInfo unwrappedInfo = *pCreateInfo; unwrappedInfo.pSetLayouts = unwrapped; VkResult ret = ObjDisp(device)->CreatePipelineLayout(Unwrap(device), &unwrappedInfo, pPipelineLayout); - SAFE_DELETE_ARRAY(unwrapped); - if(ret == VK_SUCCESS) { ResourceId id = GetResourceManager()->WrapResource(Unwrap(device), *pPipelineLayout); @@ -378,12 +373,17 @@ VkResult WrappedVulkan::vkCreateGraphicsPipelines( const VkGraphicsPipelineCreateInfo* pCreateInfos, VkPipeline* pPipelines) { - // VKTODOLOW this should be a persistent per-thread array that resizes up - // to a high water mark, so we don't have to allocate - VkGraphicsPipelineCreateInfo *unwrappedInfos = new VkGraphicsPipelineCreateInfo[count]; + // conservatively request memory for 5 stages on each pipeline + // (worst case - can't have compute stage). Avoids needing to count + byte *unwrapped = GetTempMemory(sizeof(VkGraphicsPipelineCreateInfo)*count + sizeof(VkPipelineShaderStageCreateInfo)*count*5); + + // keep pipelines first in the memory, then the stages + VkGraphicsPipelineCreateInfo *unwrappedInfos = (VkGraphicsPipelineCreateInfo *)unwrapped; + VkPipelineShaderStageCreateInfo *nextUnwrappedStages = (VkPipelineShaderStageCreateInfo *)(unwrappedInfos + count); + for(uint32_t i=0; i < count; i++) { - VkPipelineShaderStageCreateInfo *unwrappedStages = new VkPipelineShaderStageCreateInfo[pCreateInfos[i].stageCount]; + VkPipelineShaderStageCreateInfo *unwrappedStages = nextUnwrappedStages; nextUnwrappedStages += pCreateInfos[i].stageCount; for(uint32_t j=0; j < pCreateInfos[i].stageCount; j++) { unwrappedStages[j] = pCreateInfos[i].pStages[j]; @@ -399,10 +399,6 @@ VkResult WrappedVulkan::vkCreateGraphicsPipelines( VkResult ret = ObjDisp(device)->CreateGraphicsPipelines(Unwrap(device), Unwrap(pipelineCache), count, unwrappedInfos, pPipelines); - for(uint32_t i=0; i < count; i++) - delete[] unwrappedInfos[i].pStages; - SAFE_DELETE_ARRAY(unwrappedInfos); - if(ret == VK_SUCCESS) { for(uint32_t i=0; i < count; i++)