DXIL Debugger GroupShared simulation changes

On the active thread
* GSM reads come from the local GSM cache (not the global GSM data)
* GSM writes go to the local GSM cache and the global GSM data
* Workgroup Memory barrier populates the local GSM cache with the data from the global GSM data

On the non-active threads:
* GSM reads/write always operate on the global GSM data
* Workgroup Memory barrier is ignored
This commit is contained in:
Jake Turner
2025-05-19 08:54:20 +01:00
parent 9523f98b8a
commit 4539ff5c43
2 changed files with 172 additions and 40 deletions
+168 -39
View File
@@ -41,6 +41,9 @@ using namespace rdcshaders;
// TODO: Support UAVs with counter
// TODO: Extend debug data parsing: DW_TAG_array_type for the base element type
// TODO: Extend debug data parsing: N-dimensional arrays, mapping covers whole sub-array
// TODO: Load's from Pointers should re-read the memory to get accurate values
// TODO: Change Pointers to be GPUPointers : update pointers aliased to base pointers
// TODO: Unify and share the code for reading and writing thru pointers
// Notes:
// The phi node capture variables are not shown in the UI
@@ -96,6 +99,19 @@ using namespace DXDebug;
const uint32_t POINTER_MAGIC = 0xBEAFDEAF;
static bool IsEncodedPointer(const ShaderVariable &var)
{
if(var.type != VarType::GPUPointer)
{
return false;
}
if(var.value.u32v[1] != POINTER_MAGIC)
{
return false;
}
return true;
}
static void EncodePointer(DXILDebug::Id ptrId, uint64_t offset, uint64_t size, ShaderVariable &var)
{
var.type = VarType::GPUPointer;
@@ -108,16 +124,12 @@ static void EncodePointer(DXILDebug::Id ptrId, uint64_t offset, uint64_t size, S
static bool DecodePointer(DXILDebug::Id &ptrId, uint64_t &offset, uint64_t &size,
const ShaderVariable &var)
{
if(var.type != VarType::GPUPointer)
if(!IsEncodedPointer(var))
{
RDCERR("Calling DecodePointer on non-pointer type %s", ToStr(var.type).c_str());
return false;
}
if(var.value.u32v[1] != POINTER_MAGIC)
{
RDCERR("Calling DecodePointer on non encoded pointer type %u", var.value.u32v[1]);
RDCERR("Calling DecodePointer on non encoded pointer");
return false;
}
ptrId = var.value.u32v[0];
offset = var.value.u64v[1];
size = var.value.u64v[2];
@@ -655,6 +667,35 @@ static uint8_t GetShaderVariableElementByteSize(const ShaderVariable &var)
return GetShaderVariableElementByteSize(var.members[0]);
}
static void UpdateShaderVariableFromBackingMemory(ShaderVariable &var, const void *ptr)
{
// Memory copy from backing memory to base memory variable
size_t elementSize = GetShaderVariableElementByteSize(var);
const uint8_t *src = (const uint8_t *)ptr;
if(var.members.size() == 0)
{
RDCASSERTEQUAL(var.rows, 1);
RDCASSERTEQUAL(var.columns, 1);
if(elementSize <= sizeof(ShaderValue))
memcpy(&var.value, src, elementSize);
else
RDCERR("Updating MemoryVariable elementSize %u too large max %u", elementSize,
sizeof(ShaderValue));
}
else
{
for(uint32_t i = 0; i < var.members.size(); ++i)
{
if(elementSize <= sizeof(ShaderValue))
memcpy(&var.members[i].value, src, elementSize);
else
RDCERR("Updating MemoryVariable member %u elementSize %u too large max %u", i, elementSize,
sizeof(ShaderValue));
src += elementSize;
}
}
}
static DXBC::ResourceRetType ConvertComponentTypeToResourceRetType(const ComponentType compType)
{
switch(compType)
@@ -1601,7 +1642,7 @@ void MemoryTracking::AllocateMemoryForType(const DXIL::Type *type, Id allocId, b
size_t byteSize = ComputeDXILTypeByteSize(type->inner);
void *backingMem = malloc(byteSize);
memset(backingMem, 0, byteSize);
m_Allocations[allocId] = {backingMem, byteSize, global};
m_Allocations[allocId] = {backingMem, byteSize, global, !global};
// Create a pointer to represent this allocation
m_Pointers[allocId] = {allocId, backingMem, byteSize};
@@ -1622,7 +1663,7 @@ ThreadState::~ThreadState()
{
for(auto it : m_Memory.m_Allocations)
{
if(!it.second.global)
if(it.second.localMemory)
free(it.second.backingMemory);
}
}
@@ -3140,6 +3181,8 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
// For thread barriers the threads must be converged
if(barrierMode & BarrierMode::SyncThreadGroup)
RDCASSERT(!WorkgroupIsDiverged(workgroup));
if(barrierMode & BarrierMode::TGSMFence)
ExecuteMemoryBarrier();
break;
}
case DXOp::Discard:
@@ -4811,7 +4854,7 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
// SV_StartInstanceLocation
// StartInstanceLocation from Draw*Instanced
// Needed for debugger support of multi-threaded compute execution
// SM 6.8
case DXOp::BarrierByMemoryType:
case DXOp::BarrierByMemoryHandle:
@@ -5094,7 +5137,7 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
}
const MemoryTracking::Allocation &allocation = itAlloc->second;
ShaderVariable arg;
if(allocation.global && !IsVariableAssigned(ptrId))
if(allocation.globalVarAlloc && !IsVariableAssigned(ptrId))
{
RDCASSERT(IsVariableAssigned(baseMemoryId));
arg = m_Variables[baseMemoryId];
@@ -5151,6 +5194,24 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
const MemoryTracking::Allocation &allocation = itAlloc->second;
UpdateMemoryVariableFromBackingMemory(baseMemoryId, allocation.backingMemory);
// active lane : writes to a GSM variable, write to local and global backing memory
if(m_State)
{
if(m_GlobalState.groupSharedMemoryIds.contains(baseMemoryId))
{
// Compute the local pointer offset and apply it to the global base memory
ptrdiff_t offset = (uintptr_t)memory - (uintptr_t)allocation.backingMemory;
auto globalMem = m_GlobalState.memory.m_Allocations.find(baseMemoryId);
if(globalMem == m_GlobalState.memory.m_Allocations.end())
{
RDCERR("Unknown global memory allocation Id %u", baseMemoryId);
break;
}
void *globalMemory = (void *)((uintptr_t)globalMem->second.backingMemory + offset);
allocSize = ptr.size;
UpdateBackingMemoryFromVariable(globalMemory, allocSize, val);
}
}
// record the change to the base memory variable if it is not the ptrId variable
if(recordBaseMemoryChange)
{
@@ -5241,7 +5302,7 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
// Ensure global variables use global memory
// Ensure non-global variables do not use global memory
if(allocation.global)
if(allocation.globalVarAlloc)
RDCASSERT(cast<GlobalVar>(inst.args[0]));
else
RDCASSERT(!cast<GlobalVar>(inst.args[0]));
@@ -6143,7 +6204,16 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
RDCASSERTNOTEQUAL(resultId, DXILDebug::INVALID_ID);
RDCASSERT(IsVariableAssigned(ptrId));
const ShaderVariable a = m_Variables[ptrId];
ShaderVariable a;
if(allocation.globalVarAlloc && !IsVariableAssigned(ptrId))
{
RDCASSERT(IsVariableAssigned(baseMemoryId));
a = m_Variables[baseMemoryId];
}
else
{
a = m_Variables[ptrId];
}
size_t newValueArgIdx = (opCode == Operation::CompareExchange) ? 2 : 1;
ShaderVariable b;
@@ -6264,6 +6334,25 @@ bool ThreadState::ExecuteInstruction(DebugAPIWrapper *apiWrapper,
UpdateMemoryVariableFromBackingMemory(baseMemoryId, allocMemoryBackingPtr);
// active lane : writes to a GSM variable, write to local and global backing memory
if(m_State)
{
if(m_GlobalState.groupSharedMemoryIds.contains(baseMemoryId))
{
// Compute the local pointer offset and apply it to the global base memory
ptrdiff_t offset = (uintptr_t)memory - (uintptr_t)allocation.backingMemory;
auto globalMem = m_GlobalState.memory.m_Allocations.find(baseMemoryId);
if(globalMem == m_GlobalState.memory.m_Allocations.end())
{
RDCERR("Unknown global memory allocation Id %u", baseMemoryId);
break;
}
void *globalMemory = (void *)((uintptr_t)globalMem->second.backingMemory + offset);
allocSize = ptr.size;
UpdateBackingMemoryFromVariable(globalMemory, allocSize, res);
}
}
// record the change to the base memory variable
if(recordBaseMemoryChange)
{
@@ -6646,31 +6735,7 @@ void ThreadState::UpdateBackingMemoryFromVariable(void *ptr, uint64_t &allocSize
void ThreadState::UpdateMemoryVariableFromBackingMemory(Id memoryId, const void *ptr)
{
ShaderVariable &baseMemory = m_Variables[memoryId];
// Memory copy from backing memory to base memory variable
size_t elementSize = GetShaderVariableElementByteSize(baseMemory);
const uint8_t *src = (const uint8_t *)ptr;
if(baseMemory.members.size() == 0)
{
RDCASSERTEQUAL(baseMemory.rows, 1);
RDCASSERTEQUAL(baseMemory.columns, 1);
if(elementSize <= sizeof(ShaderValue))
memcpy(&baseMemory.value, src, elementSize);
else
RDCERR("Updating MemoryVariable elementSize %u too large max %u", elementSize,
sizeof(ShaderValue));
}
else
{
for(uint32_t i = 0; i < baseMemory.members.size(); ++i)
{
if(elementSize <= sizeof(ShaderValue))
memcpy(&baseMemory.members[i].value, src, elementSize);
else
RDCERR("Updating MemoryVariable member %u elementSize %u too large max %u", i, elementSize,
sizeof(ShaderValue));
src += elementSize;
}
}
UpdateShaderVariableFromBackingMemory(baseMemory, ptr);
}
void ThreadState::PerformGPUResourceOp(const rdcarray<ThreadState> &workgroup, Operation opCode,
@@ -7162,11 +7227,61 @@ ShaderValue ThreadState::DDY(bool fine, Operation opCode, DXOp dxOpCode,
return ret;
}
void ThreadState::ExecuteMemoryBarrier()
{
// ignore if not the active thread
if(!m_State)
return;
// copy the global GSM memory into the local GSM cache
for(Id id : m_GlobalState.groupSharedMemoryIds)
{
auto globalMem = m_GlobalState.memory.m_Allocations.find(id);
if(globalMem == m_GlobalState.memory.m_Allocations.end())
{
RDCERR("Unknown global memory allocation for GSM Id %u", id);
continue;
}
RDCASSERT(globalMem->second.globalVarAlloc);
RDCASSERT(!globalMem->second.localMemory);
auto localMem = m_Memory.m_Allocations.find(id);
if(localMem == m_Memory.m_Allocations.end())
{
RDCERR("Unknown local memory allocation for GSM Id %u", id);
continue;
}
RDCASSERT(localMem->second.globalVarAlloc);
RDCASSERT(localMem->second.localMemory);
auto localVar = m_Variables.find(id);
if(localVar == m_Variables.end())
{
RDCERR("Unknown local memory allocation for GSM Id %u", id);
continue;
}
ShaderVariableChange change;
ShaderVariable &local = localVar->second;
change.before = local;
const void *globalBackingMemory = globalMem->second.backingMemory;
UpdateMemoryVariableFromBackingMemory(id, globalBackingMemory);
change.after = local;
if(!(change.after == change.before))
m_State->changes.push_back(change);
// Update local backing memory from the local variable
RDCASSERTEQUAL(globalMem->second.size, localMem->second.size);
void *localBackingMemory = localMem->second.backingMemory;
const size_t allocSize = (size_t)globalMem->second.size;
memcpy(localBackingMemory, globalBackingMemory, allocSize);
}
}
GlobalState::~GlobalState()
{
for(auto it : memory.m_Allocations)
{
RDCASSERT(it.second.global);
RDCASSERT(!it.second.localMemory);
free(it.second.backingMemory);
}
}
@@ -8620,6 +8735,8 @@ ShaderDebugTrace *Debugger::BeginDebug(uint32_t eventId, const DXBC::DXBCContain
}
}
}
if(gv->type->addrSpace == DXIL::Type::PointerAddrSpace::GroupShared)
m_GlobalState.groupSharedMemoryIds.push_back(globalVar.id);
m_GlobalState.globals.push_back(globalVar);
m_LiveGlobals[globalVar.id] = true;
}
@@ -9408,6 +9525,18 @@ rdcarray<ShaderDebugState> Debugger::ContinueDebug(DebugAPIWrapper *apiWrapper)
}
}
// active lane : needs it own local backing memory, copied from global at the start
for(Id id : m_GlobalState.groupSharedMemoryIds)
{
MemoryTracking::Allocation &globalAlloc = active.m_Memory.m_Allocations[id];
RDCASSERT(globalAlloc.globalVarAlloc);
const size_t allocSize = (size_t)globalAlloc.size;
void *localBackingMem = malloc(allocSize);
memcpy(localBackingMem, globalAlloc.backingMemory, allocSize);
active.m_Memory.m_Allocations[id] = {localBackingMem, globalAlloc.size, true, true};
active.m_Memory.m_Pointers[id] = {id, localBackingMem, globalAlloc.size};
}
// globals won't be filled out by entering the entry point, ensure their change is registered.
for(const GlobalVariable &gv : m_GlobalState.globals)
initial.changes.push_back({ShaderVariable(), gv.var});
+4 -1
View File
@@ -211,7 +211,8 @@ struct MemoryTracking
// the allocated memory
void *backingMemory;
uint64_t size;
bool global;
bool globalVarAlloc;
bool localMemory;
};
// Represents pointers within a Allocation memory allocation (heap)
@@ -292,6 +293,7 @@ struct ThreadState
void ProcessScopeChange(const rdcarray<bool> &oldLive, const rdcarray<bool> &newLive);
void ExecuteMemoryBarrier();
static bool WorkgroupIsDiverged(const rdcarray<ThreadState> &workgroup);
static bool QuadIsDiverged(const rdcarray<ThreadState> &workgroup,
const rdcfixedarray<uint32_t, 4> &quadNeighbours);
@@ -444,6 +446,7 @@ struct GlobalState
rdcarray<ShaderVariable> constantBlocks;
std::map<ConstantBlockReference, bytebuf> constantBlocksDatas;
rdcarray<Id> groupSharedMemoryIds;
// resources may be read-write but the variable itself doesn't change
rdcarray<ShaderVariable> readOnlyResources;
rdcarray<ShaderVariable> readWriteResources;