Fetch and apply pixel inputs and derivatives to quad

This commit is contained in:
baldurk
2020-04-03 17:17:46 +01:00
parent 21f869192c
commit fc2181d625
4 changed files with 477 additions and 24 deletions
+15 -1
View File
@@ -25,6 +25,7 @@
#pragma once
#include "api/replay/rdcarray.h"
#include "maths/vec.h"
#include "spirv_common.h"
#include "spirv_processor.h"
@@ -43,7 +44,17 @@ public:
virtual void ReadConstantBufferValue(uint32_t set, uint32_t bind, uint32_t offset,
uint32_t byteSize, void *dst) = 0;
virtual void FillInputValue(ShaderVariable &var, ShaderBuiltin builtin, uint32_t location,
uint32_t offset) = 0;
uint32_t component) = 0;
// Bundle of the four screen-space derivatives of one shader input, as returned by
// GetDerivative(). Each member is a full Vec4f so it can be added to or subtracted from an
// input value directly.
struct DerivativeDeltas
{
// coarse derivative in X
Vec4f ddxcoarse;
// coarse derivative in Y
Vec4f ddycoarse;
// fine derivative in X
Vec4f ddxfine;
// fine derivative in Y
Vec4f ddyfine;
};
// Returns the coarse and fine X/Y derivatives for the input identified by location and
// component. Implemented by the API-specific wrapper.
virtual DerivativeDeltas GetDerivative(uint32_t location, uint32_t component) = 0;
};
struct GlobalState
@@ -172,6 +183,9 @@ private:
uint32_t AllocateVariable(const Decorations &varDecorations, const Decorations &curDecorations,
DebugVariableType sourceVarType, const rdcstr &sourceName,
uint32_t offset, const DataType &inType, ShaderVariable &outVar);
// Offsets the float values in outVar by the input derivatives so that it holds the value for quad
// lane quadIndex instead of the active lane. Recurses through structs and arrays, and returns the
// number of locations the variable consumed.
uint32_t ApplyDerivatives(uint32_t quadIndex, const Decorations &curDecorations,
uint32_t location, const DataType &inType, ShaderVariable &outVar);
void AddSourceVars(rdcarray<SourceVariableMapping> &sourceVars, const DataType &inType,
const rdcstr &sourceName, const rdcstr &varName, uint32_t &offset);
void MakeSignatureNames(const rdcarray<SPIRVInterfaceAccess> &sigList, rdcarray<rdcstr> &sigNames);
@@ -264,7 +264,7 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const Shader
// fill the interface variable
AllocateVariable(decorations[v.id], decorations[v.id],
isInput ? DebugVariableType::Input : DebugVariableType::Variable, sourceName,
0, dataTypes[type.InnerType()], var);
decorations[v.id].location, dataTypes[type.InnerType()], var);
for(size_t i = oldSize; i < globalSourceVars.size(); i++)
globalSourceVars[i].signatureIndex =
@@ -385,6 +385,33 @@ ShaderDebugTrace *Debugger::BeginDebug(DebugAPIWrapper *apiWrapper, const Shader
workgroup[i].inputs = active.inputs;
workgroup[i].outputs = active.outputs;
workgroup[i].ids = active.ids;
// mark as inactive/helper lane
workgroup[i].done = true;
}
// derivatives are only applied for pixel shaders
if(stage == ShaderStage::Pixel)
{
// apply derivatives to generate the correct inputs for the quad neighbours
for(uint32_t q = 0; q < workgroupSize; q++)
{
// the active lane is the starting point that the other lanes are derived from, so it is
// left untouched
if(q == activeLaneIndex)
continue;
for(size_t i = 0; i < inputIDs.size(); i++)
{
Id id = inputIDs[i];
const DataType &type = dataTypes[idTypes[id]];
// global variables should all be pointers into opaque storage
RDCASSERT(type.type == DataType::PointerType);
const DataType &innertype = dataTypes[type.InnerType()];
// NOTE(review): this assumes workgroup[q].inputs is stored in the same order as inputIDs, and
// that decorations[id].location is meaningful for every input (including builtins) - confirm.
ApplyDerivatives(q, decorations[id], decorations[id].location, innertype,
workgroup[q].inputs[i]);
}
}
}
return ret;
@@ -980,10 +1007,21 @@ uint32_t Debugger::AllocateVariable(const Decorations &varDecorations,
if(sourceVarType == DebugVariableType::Input)
{
uint32_t location = genLocations ? offset : 0;
uint32_t component = 0;
for(const DecorationAndParamData &dec : curDecorations.others)
{
if(dec.value == Decoration::Component)
{
component = dec.component;
break;
}
}
apiWrapper->FillInputValue(
outVar, builtin,
(curDecorations.flags & Decorations::HasLocation) ? curDecorations.location : location,
(curDecorations.flags & Decorations::HasOffset) ? curDecorations.offset : 0);
component);
}
else if(sourceVarType == DebugVariableType::Constant)
{
@@ -1044,6 +1082,226 @@ uint32_t Debugger::AllocateVariable(const Decorations &varDecorations,
return outVar.rows;
}
// Rewrites one input variable of a quad neighbour so that it holds the value that lane would have
// received, by offsetting the value already stored in outVar with the derivatives fetched from the
// API wrapper.
//
// The type is walked recursively: structs visit each member and arrays visit each element, adding
// up the locations consumed so far so that nested leaves query the right location. Only leaves
// whose variable type is float are modified.
//
// quadIndex      - index within the 2x2 quad of the lane being generated.
// curDecorations - decorations at this level. An explicit location overrides 'location', and a
//                  Component decoration selects the component passed to GetDerivative().
// location       - location used when curDecorations carries no explicit location.
// inType         - type of the variable at this level of the walk.
// outVar         - variable modified in place.
//
// Returns the number of locations consumed, or 0 for types that can't be interface variables.
uint32_t Debugger::ApplyDerivatives(uint32_t quadIndex, const Decorations &curDecorations,
                                    uint32_t location, const DataType &inType, ShaderVariable &outVar)
{
  switch(inType.type)
  {
    case DataType::ScalarType:
    case DataType::VectorType:
    case DataType::MatrixType:
      // leaf types, processed after the switch
      break;
    case DataType::StructType:
    {
      // each member starts at the location after everything its predecessors consumed
      uint32_t consumed = 0;
      for(int32_t member = 0; member < inType.children.count(); member++)
      {
        const uint32_t used = ApplyDerivatives(
            quadIndex, inType.children[member].decorations, location + consumed,
            dataTypes[inType.children[member].type], outVar.members[member]);
        consumed += used;
      }
      return consumed;
    }
    case DataType::ArrayType:
    {
      // the array length is an ID, read its value from the active lane
      const ShaderVariable arrayLength = GetActiveLane().ids[inType.length];
      uint32_t consumed = 0;
      for(uint32_t elem = 0; elem < arrayLength.value.u.x; elem++)
      {
        const uint32_t used =
            ApplyDerivatives(quadIndex, curDecorations, location + consumed,
                             dataTypes[inType.InnerType()], outVar.members[elem]);
        consumed += used;
      }
      return consumed;
    }
    case DataType::PointerType:
    {
      RDCERR("Pointers not supported in interface variables");
      return 0;
    }
    case DataType::ImageType:
    case DataType::SamplerType:
    case DataType::SampledImageType:
    case DataType::UnknownType:
    {
      RDCERR("Unexpected variable type %d", inType.type);
      return 0;
    }
  }

  // only floats have derivatives. Everything else is left as-is, but each row still consumes a
  // location
  if(outVar.type != VarType::Float)
    return outVar.rows;

  // an explicit location on this variable or member takes priority over the running location
  const uint32_t derivLocation =
      (curDecorations.flags & Decorations::HasLocation) ? curDecorations.location : location;

  // pick up the component decoration if there is one, otherwise component 0
  uint32_t component = 0;
  for(const DecorationAndParamData &decorationData : curDecorations.others)
  {
    if(decorationData.value == Decoration::Component)
    {
      component = decorationData.component;
      break;
    }
  }

  // Assumption (not spec'd, but we have to pick something): coarse derivatives are always
  // generated from lane 0 of the quad, and fine derivatives are generated between the active lane
  // and its direct neighbour in X or Y. Quad lanes are laid out as:
  //
  // +---+---+
  // | 0 | 1 |
  // +---+---+
  // | 2 | 3 |
  // +---+---+
  //
  // so, writing value(n) for the input at lane n:
  //
  //   ddx_coarse = value(1) - value(0)     ddy_coarse = value(2) - value(0)
  //
  // for every lane in the quad, while ddx_fine / ddy_fine are the difference between the active
  // lane's row / column neighbour pair.
  //
  // Coarse deltas are therefore only valid for moving along the top row (0 <-> 1) and the left
  // column (0 <-> 2). Moving along the bottom row or right column needs the fine deltas, which we
  // only have when the active lane sits on that row or column. Every target lane is reached with
  // one or two such cardinal moves starting at the active lane, e.g. from lane 3:
  //
  //   value(1) = value(3) - ddy_fine
  //   value(2) = value(3) - ddx_fine
  //   value(0) = value(3) - ddy_fine - ddx_coarse
  const DebugAPIWrapper::DerivativeDeltas derivs =
      apiWrapper->GetDerivative(derivLocation, component);

  Vec4f &laneValue = *(Vec4f *)outVar.value.fv;

  RDCASSERTNOTEQUAL(activeLaneIndex, quadIndex);

  // in the diagrams * marks the active lane, and the symbols on the edges are the moves available:
  //
  // V and ^ == coarse ddy
  // , and ` == fine ddy
  // < and > == coarse ddx
  // { and } == fine ddx
  if(activeLaneIndex == 0)
  {
    // +---+---+
    // |*0 > 1 |
    // +-V-+-V-+
    // | 2 | 3 |
    // +---+---+
    if(quadIndex == 1 || quadIndex == 3)
      laneValue += derivs.ddxcoarse;
    if(quadIndex == 2 || quadIndex == 3)
      laneValue += derivs.ddycoarse;
  }
  else if(activeLaneIndex == 1)
  {
    // fine ddy is needed to get from 1 to 3, as coarse only ever involves 0->1 and 0->2
    // +---+---+
    // | 0 < 1*|
    // +-V-+-,-+
    // | 2 | 3 |
    // +---+---+
    if(quadIndex == 0 || quadIndex == 2)
      laneValue -= derivs.ddxcoarse;
    if(quadIndex == 2)
      laneValue += derivs.ddycoarse;
    else if(quadIndex == 3)
      laneValue += derivs.ddyfine;
  }
  else if(activeLaneIndex == 2)
  {
    // +---+---+
    // | 0 > 1 |
    // +-^-+---+
    // |*2 } 3 |
    // +---+---+
    if(quadIndex == 0 || quadIndex == 1)
      laneValue -= derivs.ddycoarse;
    if(quadIndex == 1)
      laneValue += derivs.ddxcoarse;
    else if(quadIndex == 3)
      laneValue += derivs.ddxfine;
  }
  else if(activeLaneIndex == 3)
  {
    // +---+---+
    // | 0 < 1 |
    // +---+-`-+
    // | 2 { 3*|
    // +---+---+
    if(quadIndex == 0 || quadIndex == 1)
      laneValue -= derivs.ddyfine;
    if(quadIndex == 0)
      laneValue -= derivs.ddxcoarse;
    else if(quadIndex == 2)
      laneValue -= derivs.ddxfine;
  }

  // each row consumes a new location
  return outVar.rows;
}
void Debugger::PreParse(uint32_t maxId)
{
Processor::PreParse(maxId);
+170 -21
View File
@@ -61,7 +61,7 @@ public:
}
virtual void FillInputValue(ShaderVariable &var, ShaderBuiltin builtin, uint32_t location,
uint32_t offset) override
uint32_t component) override
{
if(builtin != ShaderBuiltin::Undefined)
{
@@ -76,7 +76,8 @@ public:
return;
}
RDCASSERT(offset == 0);
// TODO handle components
RDCASSERT(component == 0);
if(location < location_inputs.size())
{
@@ -84,13 +85,28 @@ public:
return;
}
RDCERR("Couldn't get input for location=%u, offset=%u", location, offset);
RDCERR("Couldn't get input for %s at location=%u, component=%u", var.name.c_str(), location,
component);
}
// Looks up the derivatives stored for an input location. A location with no stored derivatives
// logs an error and returns default-constructed deltas.
virtual DerivativeDeltas GetDerivative(uint32_t location, uint32_t component) override
{
  // TODO handle components
  RDCASSERT(component == 0);

  const bool haveDerivative = location < derivatives.size();
  if(!haveDerivative)
  {
    RDCERR("Couldn't get derivative for location=%u, component=%u", location, component);
    return DerivativeDeltas();
  }

  return derivatives[location];
}
// constant buffer contents, keyed by a pair of uint32_t
// (presumably descriptor set and binding, matching ReadConstantBufferValue - confirm)
std::map<rdcpair<uint32_t, uint32_t>, bytebuf> cbuffers;
// input values for builtins, keyed by the builtin they provide
std::map<ShaderBuiltin, ShaderVariable> builtin_inputs;
// input values for user inputs, indexed by location
rdcarray<ShaderVariable> location_inputs;
// derivatives of the user inputs, indexed by location like location_inputs
rdcarray<DerivativeDeltas> derivatives;
private:
WrappedVulkan *m_pDriver = NULL;
};
@@ -113,6 +129,16 @@ enum class InputSpecConstant
static const uint32_t validMagicNumber = 12345;
// Fixed-size header of one pixel shader hit record read back from the feedback buffer.
struct PSHit
{
// position of the hit. The z component is used for the depth comparison between hits
Vec4f pos;
// primitive that produced the hit, compared against the requested primitive
uint32_t prim;
// sample of the hit, compared against the requested sample
uint32_t sample;
// equal to validMagicNumber when the record was actually written
uint32_t valid;
uint32_t padding;
// The header is immediately followed in memory by five PSInput structs:
// base value, ddxcoarse, ddycoarse, ddxfine, ddyfine
};
static void CreatePSInputFetcher(rdcarray<uint32_t> &fragspv, uint32_t &structStride,
VulkanCreationInfo::ShaderModuleReflection &shadRefl,
StorageMode storageMode, bool usePrimitiveID, bool useSampleID)
@@ -487,9 +513,6 @@ static void CreatePSInputFetcher(rdcarray<uint32_t> &fragspv, uint32_t &structSt
member++;
}
// we have 5 input structs, and two vectors for our data
structStride = sizeof(Vec4f) + sizeof(Vec4f) + structStride * 5;
rdcspv::Id PSHitRTArray = editor.AddType(rdcspv::OpTypeRuntimeArray(editor.MakeId(), PSHit));
editor.AddDecoration(rdcspv::OpDecorate(
@@ -1145,7 +1168,10 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(graphicsInfo, state.graphics.pipeline);
VkDeviceSize feedbackStorageSize = overdrawLevels * structStride + sizeof(Vec4f) + 1024;
// struct size is PSHit header plus 5x structStride = base, ddxcoarse, ddycoarse, ddxfine, ddyfine
uint32_t structSize = sizeof(PSHit) + structStride * 5;
VkDeviceSize feedbackStorageSize = overdrawLevels * structSize + sizeof(Vec4f) + 1024;
if(feedbackStorageSize > m_BindlessFeedback.FeedbackBuffer.sz)
{
@@ -1387,27 +1413,150 @@ ShaderDebugTrace *VulkanReplay::DebugPixel(uint32_t eventId, uint32_t x, uint32_
base += sizeof(Vec4f);
struct PSHit
{
Vec4f pos;
uint32_t prim;
uint32_t sample;
uint32_t valid;
uint32_t padding;
// PSInput base, ddx, ....
};
PSHit *winner = NULL;
RDCLOG("Got %u hits", numHits);
// if we encounter multiple hits at our destination pixel co-ord (or any other) we
// check to see if a specific primitive was requested (via primitive parameter not
// being set to ~0U). If it was, debug that pixel, otherwise do a best-estimate
// of which fragment was the last to successfully depth test and debug that, just by
// checking if the depth test is ordered and picking the final fragment in the series
// figure out the TL pixel's coords. Assume even top left (towards 0,0)
// this isn't spec'd but is a reasonable assumption.
int xTL = x & (~1);
int yTL = y & (~1);
// get the index of our desired pixel
int destIdx = (x - xTL) + 2 * (y - yTL);
VkCompareOp depthOp = pipe.depthCompareOp;
// depth tests disabled acts the same as always compare mode
if(!pipe.depthTestEnable)
depthOp = VK_COMPARE_OP_ALWAYS;
// Go through every recorded hit and pick the one to debug. The first valid hit is always accepted,
// and each later hit replaces the current winner only if it is a better match: first by requested
// primitive, then by requested sample, and finally by the pipeline's depth comparison.
for(uint32_t i = 0; i < numHits; i++)
{
  // Each record is a PSHit header followed by the five input structs, so consecutive hits are
  // structSize bytes apart. structStride is only the size of one input struct, and stepping by
  // it would read every hit after the first from the middle of the previous record.
  PSHit *hit = (PSHit *)(base + structSize * i);

  RDCLOG("Hit %u at %f, %f, %f, %f", i, hit->pos.x, hit->pos.y, hit->pos.z, hit->pos.w);

  // records that were never written don't carry the magic number, ignore them
  if(hit->valid != validMagicNumber)
  {
    RDCWARN("Hit %u doesn't have valid magic number", i);
    continue;
  }

  // see if this hit is a closer match than the previous winner.

  // if there's no previous winner it's clearly better
  if(winner == NULL)
  {
    winner = hit;
    continue;
  }

  // if we're looking for a specific primitive
  if(primitive != ~0U)
  {
    // and this hit is a match and the winner isn't, it's better
    if(winner->prim != primitive && hit->prim == primitive)
    {
      winner = hit;
      continue;
    }

    // if the winner is a match and we're not, we can't be better so skip this hit
    if(winner->prim == primitive && hit->prim != primitive)
    {
      continue;
    }
  }

  // if we're looking for a particular sample, check that
  if(sample != ~0U)
  {
    if(winner->sample != sample && hit->sample == sample)
    {
      winner = hit;
      continue;
    }

    if(winner->sample == sample && hit->sample != sample)
    {
      continue;
    }
  }

  // otherwise apply depth test
  switch(depthOp)
  {
    case VK_COMPARE_OP_NEVER:
    case VK_COMPARE_OP_EQUAL:
    case VK_COMPARE_OP_NOT_EQUAL:
    case VK_COMPARE_OP_ALWAYS:
    default:
      // don't emulate equal or not equal since we don't know the reference value. Take any hit
      // (thus meaning the last hit)
      winner = hit;
      break;
    case VK_COMPARE_OP_LESS:
      if(hit->pos.z < winner->pos.z)
        winner = hit;
      break;
    case VK_COMPARE_OP_LESS_OR_EQUAL:
      if(hit->pos.z <= winner->pos.z)
        winner = hit;
      break;
    case VK_COMPARE_OP_GREATER:
      if(hit->pos.z > winner->pos.z)
        winner = hit;
      break;
    case VK_COMPARE_OP_GREATER_OR_EQUAL:
      if(hit->pos.z >= winner->pos.z)
        winner = hit;
      break;
  }
}
rdcspv::Debugger *debugger = new rdcspv::Debugger;
debugger->Parse(shader.spirv.GetSPIRV());
ShaderDebugTrace *ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, 0);
// stays NULL when there is no winning hit to debug
ShaderDebugTrace *ret = NULL;
if(winner)
{
rdcspv::Debugger *debugger = new rdcspv::Debugger;
debugger->Parse(shader.spirv.GetSPIRV());
// the data immediately follows the PSHit header. Every piece of data is vec4 aligned, and the
// output is in input signature order.
byte *PSInputs = (byte *)(winner + 1);
// five consecutive blocks of structStride bytes each: the input values themselves, followed by
// the four sets of derivatives
Vec4f *value = (Vec4f *)(PSInputs + 0 * structStride);
Vec4f *ddxcoarse = (Vec4f *)(PSInputs + 1 * structStride);
Vec4f *ddycoarse = (Vec4f *)(PSInputs + 2 * structStride);
Vec4f *ddxfine = (Vec4f *)(PSInputs + 3 * structStride);
Vec4f *ddyfine = (Vec4f *)(PSInputs + 4 * structStride);
rdcarray<ShaderVariable> &locations = apiWrapper->location_inputs;
// the fetched data is indexed by signature order (i), but the wrapper's arrays are indexed by
// the parameter's regIndex, so scatter each entry to its slot
for(size_t i = 0; i < shadRefl.refl.inputSignature.size(); i++)
{
const SigParameter &param = shadRefl.refl.inputSignature[i];
// grow both arrays so that regIndex is a valid index. derivatives is sized from
// locations.size(), which has just been grown, so the two arrays end up the same length
locations.resize(RDCMAX((uint32_t)locations.size(), param.regIndex + 1));
apiWrapper->derivatives.resize(RDCMAX((uint32_t)locations.size(), param.regIndex + 1));
memcpy(&locations[param.regIndex].value.uv, &value[i], sizeof(Vec4f));
memcpy(&apiWrapper->derivatives[param.regIndex].ddxcoarse, &ddxcoarse[i], sizeof(Vec4f));
memcpy(&apiWrapper->derivatives[param.regIndex].ddycoarse, &ddycoarse[i], sizeof(Vec4f));
memcpy(&apiWrapper->derivatives[param.regIndex].ddxfine, &ddxfine[i], sizeof(Vec4f));
memcpy(&apiWrapper->derivatives[param.regIndex].ddyfine, &ddyfine[i], sizeof(Vec4f));
}
// destIdx is the index of the target pixel within its quad
ret = debugger->BeginDebug(apiWrapper, ShaderStage::Pixel, entryPoint, spec,
shadRefl.instructionLines, shadRefl.patchData, destIdx);
}
else
{
RDCLOG("Didn't get any valid hit to debug");
// the wrapper was never handed to a debugger in this branch, so free it here
delete apiWrapper;
}
if(descpool != VK_NULL_HANDLE)
{
+32
View File
@@ -109,6 +109,38 @@ inline Vec3f operator+=(Vec3f &a, const Vec3f &b)
return a;
}
// Arithmetic operators for Vec4f. All of them act on all four components, including w.

// Scales every component of a by b.
inline Vec4f operator*(const Vec4f &a, const float b)
{
  return Vec4f(a.x * b, a.y * b, a.z * b, a.w * b);
}

// Component-wise sum of a and b.
inline Vec4f operator+(const Vec4f &a, const Vec4f &b)
{
  return Vec4f(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}

// Component-wise negation of a.
inline Vec4f operator-(const Vec4f &a)
{
  return Vec4f(-a.x, -a.y, -a.z, -a.w);
}

// Component-wise difference, expressed through addition and negation.
inline Vec4f operator-(const Vec4f &a, const Vec4f &b)
{
  return a + (-b);
}

// Subtracts b from a in place and returns a copy of the result.
inline Vec4f operator-=(Vec4f &a, const Vec4f &b)
{
  a = a - b;
  return a;
}

// Adds b to a in place and returns a copy of the result.
inline Vec4f operator+=(Vec4f &a, const Vec4f &b)
{
  a = a + b;
  return a;
}
struct Vec4u
{
Vec4u(uint32_t X = 0, uint32_t Y = 0, uint32_t Z = 0, uint32_t W = 0)