Apply workaround for presumed Windows bug with BDA capture/replay

* This replaces the previous workaround disabling BDA capture/replay on
  dedicated image memory, as that is not feasible with future functionality and
  appears not to be the source of the bug seen before.
* Although this is believed to be a Windows bug, it only manifests on NV, so it
  is treated as an NV bug for now. A future NV driver should contain a
  workaround for this, so our workaround can be disabled later.
This commit is contained in:
baldurk
2025-07-29 15:32:33 +01:00
parent 3be544c51d
commit 259225cacb
4 changed files with 51 additions and 40 deletions
+6
View File
@@ -1156,6 +1156,12 @@ VkDriverInfo::VkDriverInfo(const VkPhysicalDeviceProperties &physProps,
RDCLOG("Enabling NV workaround for static pipeline force-bind to preserve state");
nvidiaStaticPipelineRebindStates = true;
}
// this is fixed in a windows version but we can't easily query that, so instead we are waiting
// for a driver-based workaround and apply the workaround ourselves in the meantime
if(active)
RDCLOG("Enabling NV workaround for unaligned BDA memory capture/replay");
nvidiaUnalignedBDAIssue = true;
}
if(driverProps.driverID == VK_DRIVER_ID_AMD_PROPRIETARY ||
+5
View File
@@ -374,6 +374,10 @@ public:
// On Mali there are some known issues regarding acceleration structure serialisation to device
// memory, for the affected driver versions we switch to the host command variants
bool MaliBrokenASDeviceSerialisation() const { return maliBrokenASDeviceSerialisation; }
// on NV BDA capture/replay can sometimes fail for memory on certain windows versions. Although
// this is believed to be a windows bug currently, it only manifests on NV and a workaround will
// be arriving in later NV drivers, so for now we treat this as a driver bug.
bool NVUnalignedBDAIssue() const { return nvidiaUnalignedBDAIssue; }
private:
GPUVendor m_Vendor;
@@ -390,6 +394,7 @@ private:
bool intelBrokenOcclusionQueries = false;
bool nvidiaStaticPipelineRebindStates = false;
bool maliBrokenASDeviceSerialisation = false;
bool nvidiaUnalignedBDAIssue = false;
};
struct DynamicRenderingLocalRead
@@ -348,6 +348,13 @@ bool WrappedVulkan::Serialise_vkAllocateMemory(SerialiserType &ser, VkDevice dev
return false;
}
// apply workaround for presumed windows bug
if(GetDriverInfo().NVUnalignedBDAIssue())
{
// all memory allocations must be 64kB aligned. The rest of the workaround only applies during capture
patched.allocationSize = AlignUp(patched.allocationSize, VkDeviceSize(64 * 1024));
}
VkResult ret = ObjDisp(device)->AllocateMemory(Unwrap(device), &patched, NULL, &mem);
if(ret != VK_SUCCESS)
@@ -526,30 +533,22 @@ VkResult WrappedVulkan::vkAllocateMemory(VkDevice device, const VkMemoryAllocate
// will be bound against since there's no requirement for the buffer to be marked as BDA. This
// means that when RT is enabled ALL MEMORY IN THE ENTIRE PROGRAM must be marked as BDA just in
// case.
//
// we don't force this on for memory allocations that are going to be used for dedicated images
bool forceBDA = false;
if(IsCaptureMode(m_State) && AccelerationStructures())
{
const VkMemoryDedicatedAllocateInfo *dedicated =
(const VkMemoryDedicatedAllocateInfo *)FindNextStruct(
pAllocateInfo, VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO);
if(dedicated == NULL || dedicated->image == VK_NULL_HANDLE)
{
// force BDA flag when creating, by adding the struct if needed
forceBDA = true;
// force BDA flag when creating, by adding the struct if needed
forceBDA = true;
if(memFlags)
{
memFlags->flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
}
else
{
rtForcedFlags.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT |
VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT;
rtForcedFlags.pNext = unwrapped.pNext;
unwrapped.pNext = &rtForcedFlags;
}
if(memFlags)
{
memFlags->flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
}
else
{
rtForcedFlags.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT |
VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT;
rtForcedFlags.pNext = unwrapped.pNext;
unwrapped.pNext = &rtForcedFlags;
}
}
@@ -574,6 +573,27 @@ VkResult WrappedVulkan::vkAllocateMemory(VkDevice device, const VkMemoryAllocate
}
}
// apply workaround for presumed windows bug
if(GetDriverInfo().NVUnalignedBDAIssue())
{
const VkDeviceSize kb64 = 64 * 1024;
const VkDeviceSize mb2 = 2 * 1024 * 1024;
// all memory allocations must be 64kB aligned. We do this silently, not affecting the serialised size
unwrapped.allocationSize = AlignUp(unwrapped.allocationSize, kb64);
// <2 MB allocations must have an extra 64kB
if(unwrapped.allocationSize < mb2)
{
unwrapped.allocationSize += kb64;
}
else
{
// >= 2MB allocations must be aligned to 2MB
unwrapped.allocationSize = AlignUp(unwrapped.allocationSize, mb2);
}
}
VkResult ret;
SERIALISE_TIME_CALL(
ret = ObjDisp(device)->AllocateMemory(Unwrap(device), &unwrapped, NULL, pMemory));
@@ -429,26 +429,6 @@ bool WrappedVulkan::Serialise_vkCreateSwapchainKHR(SerialiserType &ser, VkDevice
GetGPULocalMemoryIndex(mrq.memoryTypeBits),
};
VkMemoryDedicatedAllocateInfo dedicated = {
VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
NULL,
Unwrap(im),
};
// if the acceleration structures feature is enabled, we _must_ be on at least vulkan 1.1 (the
// extension requires it). Vulkan 1.1 unconditionally allows the use of dedicated image
// allocations without any feature bits (the original extension didn't have any either).
//
// we do this because without it, the extra memory allocation may be promoted to BDA by
// self-capturing and cause problems with address space clashes. We don't need the dedicated
// allocation but it avoids that behaviour as it may not be legal to query the address of a
// dedicated image memory allocation and in any case we know that it can't be legally used for
// BDA or AS backing memory
if(AccelerationStructures())
{
allocInfo.pNext = &dedicated;
}
vkr = ObjDisp(device)->AllocateMemory(Unwrap(device), &allocInfo, NULL, &mem);
CHECK_VKR(this, vkr);