From 9309b09ab6e5726e58eef2d76c9079c1a950167a Mon Sep 17 00:00:00 2001 From: baldurk Date: Fri, 6 Nov 2020 17:09:47 +0000 Subject: [PATCH] Convert main SPIR-V dispatch loop to handle any bit width --- .../driver/shaders/spirv/spirv_debug.cpp | 1204 +++++++++++------ 1 file changed, 802 insertions(+), 402 deletions(-) diff --git a/renderdoc/driver/shaders/spirv/spirv_debug.cpp b/renderdoc/driver/shaders/spirv/spirv_debug.cpp index 2e32bfe42..e57327c8a 100644 --- a/renderdoc/driver/shaders/spirv/spirv_debug.cpp +++ b/renderdoc/driver/shaders/spirv/spirv_debug.cpp @@ -29,36 +29,51 @@ #include "maths/half_convert.h" #include "os/os_specific.h" #include "spirv_op_helpers.h" +#include "var_dispatch_helpers.h" -static bool ContainsNaNInf(const ShaderVariable &val) +static bool ContainsNaNInf(const ShaderVariable &var) { bool ret = false; - for(const ShaderVariable &member : val.members) + for(const ShaderVariable &member : var.members) ret |= ContainsNaNInf(member); - int count = int(val.rows) * int(val.columns); + int count = int(var.rows) * int(var.columns); - if(val.type == VarType::Float || val.type == VarType::Half) + for(int c = 0; c < count; c++) { - for(int i = 0; i < count; i++) - { - ret |= RDCISINF(val.value.fv[i]); - ret |= RDCISNAN(val.value.fv[i]) != 0; - } - } - else if(val.type == VarType::Double) - { - for(int i = 0; i < count; i++) - { - ret |= RDCISINF(val.value.dv[i]); - ret |= RDCISNAN(val.value.dv[i]) != 0; - } +#undef _IMPL +#define _IMPL(T) RDCISINF(comp(var, c)) || RDCISNAN(comp(var, c)) + + IMPL_FOR_FLOAT_TYPES(_IMPL); } return ret; } +namespace Bits +{ +// add simple overloads that upcast for small types +inline uint16_t CountOnes(uint16_t value) +{ + return Bits::CountOnes((uint32_t)value) & 0xffffu; +} +inline uint8_t CountOnes(uint8_t value) +{ + return Bits::CountOnes((uint32_t)value) & 0xffu; +} + +// on non-64-bit platforms we implement a two-half manual bitcount for 64-bit integers +#if DISABLED(RDOC_X64) +inline uint64_t CountOnes(uint64_t value) +{ + uint32_t words[2]; + memcpy(words, &value, sizeof(value)); + return Bits::CountOnes(words[0]) + Bits::CountOnes(words[1]); +} +#endif +} + namespace rdcspv { const BindpointIndex DebugAPIWrapper::invalidBind = BindpointIndex(-12345, -12345, ~0U); @@ -452,11 +467,17 @@ ShaderVariable ThreadState::CalcDeriv(ThreadState::DerivDir dir, ThreadState::De ShaderVariable aval = a->GetSrc(val); ShaderVariable bval = b->GetSrc(val); + ShaderVariable var = aval; - for(uint8_t c = 0; c < aval.columns; c++) - aval.value.fv[c] = bval.value.fv[c] - aval.value.fv[c]; + for(uint8_t c = 0; c < var.columns; c++) + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = comp(bval, c) - comp(aval, c) - return aval; + IMPL_FOR_FLOAT_TYPES(_IMPL); + } + + return var; } void ThreadState::JumpToLabel(Id target) @@ -645,7 +666,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray // evaluate the indices indices.reserve(chain.indexes.size()); for(Id id : chain.indexes) - indices.push_back(GetSrc(id).value.uv[0]); + indices.push_back(uintComp(GetSrc(id), 0)); SetDst(chain.result, debugger.MakeCompositePointer( ids[chain.base], debugger.GetPointerBaseId(ids[chain.base]), indices)); @@ -670,7 +691,6 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable result; result.rows = result.columns = 1; - result.type = VarType::UInt; BindpointIndex bind = debugger.GetPointerValue(structPointer).GetBinding(); @@ -681,7 +701,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray RDCASSERT(dec.flags & Decorations::HasArrayStride); byteLen /= dec.arrayStride; - result.value.uv[0] = uint32_t(byteLen); + // Result Type must be an OpTypeInt with 32-bit Width and 0 Signedness + result.type = VarType::UInt; + setUintComp(result, 0, uint32_t(byteLen)); SetDst(len.result, result); @@ -702,9 +724,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray var.type = VarType::Bool; if(opdata.op == Op::PtrEqual) - var.value.uv[0] = isEqual ? 1 : 0; + setUintComp(var, 0, isEqual ? 1 : 0); else - var.value.uv[0] = isEqual ? 0 : 1; + setUintComp(var, 0, isEqual ? 0 : 1); SetDst(equal.result, var); break; @@ -756,7 +778,12 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable ddy = CalcDeriv(DDY, type, workgroup, deriv.p); for(uint32_t c = 0; c < var.columns; c++) - var.value.fv[c] = fabsf(var.value.fv[c]) + fabsf(ddy.value.fv[c]); + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = fabs(comp(var, c)) + fabs(comp(ddy, c)) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } SetDst(deriv.result, var); @@ -817,20 +844,12 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray RDCASSERTEQUAL(mod->rows, obj.columns); for(uint32_t row = 0; row < mod->rows; row++) - { - if(VarTypeByteSize(mod->type) == 8) - mod->value.u64v[row * mod->columns + column] = obj.value.u64v[row]; - else - mod->value.uv[row * mod->columns + column] = obj.value.uv[row]; - } + copyComp(*mod, row * mod->columns + column, obj, row); } else { // if it's a vector, replace one scalar - if(VarTypeByteSize(mod->type) == 8) - mod->value.u64v[idx] = obj.value.u64v[0]; - else - mod->value.uv[idx] = obj.value.uv[0]; + copyComp(*mod, idx, obj, 0); } } else if(i + 2 == insert.indexes.size()) @@ -839,10 +858,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray uint32_t column = insert.indexes[i]; uint32_t row = insert.indexes[i + 1]; - if(VarTypeByteSize(mod->type) == 8) - mod->value.u64v[row * mod->columns + column] = obj.value.u64v[0]; - else - mod->value.uv[row * mod->columns + column] = obj.value.uv[0]; + copyComp(*mod, row * mod->columns + column, obj, 0); } // then evaluate it, to get the extracted value @@ -893,20 +909,15 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray // it is possible to construct larger vectors from a collection of scalars and smaller // vectors. - size_t dst = 0; + uint32_t dst = 0; for(size_t i = 0; i < construct.constituents.size(); i++) { ShaderVariable src = GetSrc(construct.constituents[i]); RDCASSERTEQUAL(src.rows, 1); - for(size_t j = 0; j < src.columns; j++) - { - if(VarTypeByteSize(var.type) == 8) - var.value.u64v[dst++] = src.value.u64v[j]; - else - var.value.uv[dst++] = src.value.uv[j]; - } + for(uint32_t j = 0; j < src.columns; j++) + copyComp(var, dst++, src, j); } } else if(type.type == DataType::MatrixType) @@ -923,16 +934,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(size_t i = 0; i < construct.constituents.size(); i++) columns[i] = GetSrc(construct.constituents[i]); - for(size_t r = 0; r < var.rows; r++) - { - for(size_t c = 0; c < var.columns; c++) - { - if(VarTypeByteSize(var.type) == 8) - var.value.u64v[r * var.columns + c] = columns[c].value.u64v[r]; - else - var.value.uv[r * var.columns + c] = columns[c].value.uv[r]; - } - } + for(uint32_t r = 0; r < var.rows; r++) + for(uint32_t c = 0; c < var.columns; c++) + copyComp(var, r * var.columns + c, columns[c], r); } SetDst(construct.result, var); @@ -956,13 +960,13 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray uint32_t vec1Cols = src1.columns; - for(size_t i = 0; i < shuffle.components.size(); i++) + for(uint32_t i = 0; i < shuffle.components.size(); i++) { uint32_t c = shuffle.components[i]; if(c < vec1Cols) - var.value.uv[i] = src1.value.uv[c]; + copyComp(var, i, src1, c); else - var.value.uv[i] = src2.value.uv[c - vec1Cols]; + copyComp(var, i, src2, c - vec1Cols); } SetDst(shuffle.result, var); @@ -976,12 +980,10 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable var = GetSrc(extract.vector); ShaderVariable idx = GetSrc(extract.index); - uint32_t comp = idx.value.uv[0]; + uint32_t comp = uintComp(idx, 0); - if(VarTypeByteSize(var.type) == 8) - var.value.u64v[0] = var.value.u64v[comp]; - else - var.value.uv[0] = var.value.uv[comp]; + if(comp != 0) + copyComp(var, 0, var, comp); // result is now scalar var.columns = 1; @@ -997,12 +999,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable scalar = GetSrc(insert.component); ShaderVariable idx = GetSrc(insert.index); - uint32_t comp = idx.value.uv[0]; + uint32_t comp = uintComp(idx, 0); - if(VarTypeByteSize(var.type) == 8) - var.value.u64v[comp] = scalar.value.u64v[0]; - else - var.value.uv[comp] = scalar.value.uv[0]; + copyComp(var, comp, scalar, 0); SetDst(insert.result, var); break; @@ -1019,20 +1018,15 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable b = GetSrc(select.object2); if(cond.columns == 1) { - if(cond.value.uv[0] == 0) + if(uintComp(cond, 0) == 0) var = b; } else { for(uint8_t c = 0; c < cond.columns; c++) { - if(cond.value.uv[c] == 0) - { - if(VarTypeByteSize(var.type) == 8) - var.value.u64v[c] = b.value.u64v[c]; - else - var.value.uv[c] = b.value.uv[c]; - } + if(uintComp(cond, c) == 0) + copyComp(var, c, b, c); } } @@ -1052,36 +1046,82 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray case Op::ConvertSToF: case Op::ConvertUToF: { - OpConvertFToS conv(it); + OpConvertFToS convert(it); - ShaderVariable var = GetSrc(conv.floatValue); + const ShaderVariable &var = GetSrc(convert.floatValue); + const DataType &resultType = debugger.GetType(convert.resultType); + + ShaderVariable conv = var; + conv.type = resultType.scalar().Type(); if(opdata.op == Op::ConvertFToS) { for(uint8_t c = 0; c < var.columns; c++) - var.value.iv[c] = (int)var.value.fv[c]; - var.type = VarType::SInt; + { + double x = 0.0; + +#undef _IMPL +#define _IMPL(T) x = comp(var, c); + IMPL_FOR_FLOAT_TYPES_FOR_TYPE(_IMPL, var.type); + +#undef _IMPL +#define _IMPL(I, S, U) comp(conv, c) = (S)x; + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, conv.type); + } } else if(opdata.op == Op::ConvertFToU) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.fv[c] > 0.0f ? (uint32_t)var.value.fv[c] : 0U; - var.type = VarType::UInt; + { + double x = 0.0; + +#undef _IMPL +#define _IMPL(T) x = comp(var, c); + IMPL_FOR_FLOAT_TYPES_FOR_TYPE(_IMPL, var.type); + +#undef _IMPL +#define _IMPL(I, S, U) comp(conv, c) = (U)x; + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, conv.type); + } } else if(opdata.op == Op::ConvertSToF) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] = (float)var.value.iv[c]; - var.type = VarType::Float; + { + int64_t x = 0; + +#undef _IMPL +#define _IMPL(I, S, U) x = comp(var, c); + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, var.type); + + if(conv.type == VarType::Float) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Half) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Double) + comp(conv, c) = (double)x; + } } else if(opdata.op == Op::ConvertUToF) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] = (float)var.value.uv[c]; - var.type = VarType::Float; + { + uint64_t x = 0; + +#undef _IMPL +#define _IMPL(I, S, U) x = comp(var, c); + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, var.type); + + if(conv.type == VarType::Float) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Half) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Double) + comp(conv, c) = (double)x; + } } - SetDst(conv.result, var); + SetDst(convert.result, conv); break; } case Op::QuantizeToF16: @@ -1089,31 +1129,74 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray OpQuantizeToF16 quant(it); ShaderVariable var = GetSrc(quant.value); + ShaderVariable conv = var; + + // Result Type must be a scalar or vector of floating-point type. The component width must be + // 32 bits. + conv.type = VarType::Float; for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] = ConvertFromHalf(ConvertToHalf(var.value.fv[c])); + setFloatComp(conv, c, ConvertFromHalf(ConvertToHalf(floatComp(var, c)))); - SetDst(quant.result, var); + SetDst(quant.result, conv); break; } case Op::UConvert: { OpUConvert cast(it); - ShaderVariable var = GetSrc(cast.unsignedValue); + const ShaderVariable &var = GetSrc(cast.unsignedValue); + const DataType &resultType = debugger.GetType(cast.resultType); - // TODO - conversion between bit widths once we support it + ShaderVariable conv = var; + conv.type = resultType.scalar().Type(); - SetDst(cast.result, var); + RDCEraseEl(conv.value); + + // this is a zero-extend or truncate. Column-wise we read the variable out into a u64 then + // cast + for(uint8_t c = 0; c < var.columns; c++) + { + uint64_t x = 0; + +#undef _IMPL +#define _IMPL(I, S, U) x = comp(var, c); + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, var.type); + +#undef _IMPL +#define _IMPL(I, S, U) comp(conv, c) = (U)x; + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, conv.type); + } + + SetDst(cast.result, conv); break; } case Op::SConvert: { OpSConvert cast(it); - ShaderVariable var = GetSrc(cast.signedValue); + const ShaderVariable &var = GetSrc(cast.signedValue); + const DataType &resultType = debugger.GetType(cast.resultType); - // TODO - conversion between bit widths once we support it + ShaderVariable conv = var; + conv.type = resultType.scalar().Type(); + + RDCEraseEl(conv.value); + + // this is a sign-extend or truncate. Column-wise we read the variable out into a u64 then + // cast + for(uint8_t c = 0; c < var.columns; c++) + { + int64_t x = 0; + +#undef _IMPL +#define _IMPL(I, S, U) x = comp(var, c); + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, var.type); + +#undef _IMPL +#define _IMPL(I, S, U) comp(conv, c) = (S)x; + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, conv.type); + } SetDst(cast.result, var); break; @@ -1122,42 +1205,35 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { OpFConvert cast(it); - const DataType &type = debugger.GetType(cast.resultType); + const ShaderVariable &var = GetSrc(cast.floatValue); + const DataType &resultType = debugger.GetType(cast.resultType); - ShaderVariable var = GetSrc(cast.floatValue); - - ShaderVariable result = var; - - uint32_t srcWidth = VarTypeByteSize(var.type); - uint32_t dstWidth = type.scalar().width / 8; + ShaderVariable conv = var; + conv.type = resultType.scalar().Type(); + // we can safely upconvert to double as an intermediary because the IEEE format is the same. + // All we're doing effectively is sign extending the exponent and zero extending the mantissa. for(uint8_t c = 0; c < var.columns; c++) { - if(srcWidth == 8) - { - if(dstWidth == 8) - { - // nop - } - else - { - result.value.fv[c] = (float)var.value.dv[c]; - } - } - else if(srcWidth == 4) - { - if(dstWidth == 8) - { - result.value.dv[c] = (double)var.value.dv[c]; - } - else - { - // nop - } - } + double x = 0.0; + +#undef _IMPL +#define _IMPL(T) x = comp(var, c); + IMPL_FOR_FLOAT_TYPES_FOR_TYPE(_IMPL, var.type); + +#undef _IMPL +#define _IMPL(T) comp(conv, c) = (T)x; + // IMPL_FOR_FLOAT_TYPES_FOR_TYPE(_IMPL, conv.type); + + if(conv.type == VarType::Float) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Half) + comp(conv, c) = (float)x; + else if(conv.type == VarType::Double) + comp(conv, c) = (double)x; } - SetDst(cast.result, result); + SetDst(cast.result, conv); break; } case Op::Bitcast: @@ -1188,30 +1264,13 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray // must be identical bit count RDCASSERT(dstByteCount * type.vector().count == srcByteCount * var.columns); - uint32_t byteSize = VarTypeByteSize(var.type); - - bytebuf bytes; - for(uint32_t c = 0; c < var.columns; c++) - { - if(byteSize == 8) - bytes.append((const byte *)&var.value.u64v[c], byteSize); - else - bytes.append((const byte *)&var.value.uv[c], byteSize); - } + // because this is a bitcast, we leave var.value entirely alone. There is the same number of + // bytes so the union handles it. E.g. uv[0], uv[1] being bitcast to a single 64-bit + // corresponds exactly to the LSB and MSB of u64v[0] var.type = type.scalar().Type(); var.columns = type.vector().count & 0xff; var.value = ShaderValue(); - - byte *b = bytes.data(); - for(uint32_t c = 0; c < var.columns; c++) - { - if(byteSize == 8) - memcpy(&var.value.u64v[c], b, byteSize); - else - memcpy(&var.value.uv[c], b, byteSize); - b += byteSize; - } } SetDst(cast.result, var); @@ -1298,70 +1357,131 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray case Op::FUnordLessThan: case Op::FUnordLessThanEqual: { - OpFMul comp(it); + OpFMul compare(it); - ShaderVariable var = GetSrc(comp.operand1); - ShaderVariable b = GetSrc(comp.operand2); + ShaderVariable a = GetSrc(compare.operand1); + ShaderVariable b = GetSrc(compare.operand2); + ShaderVariable var = a; if(opdata.op == Op::IEqual || opdata.op == Op::LogicalEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] == b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) == comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::INotEqual || opdata.op == Op::LogicalNotEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] != b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) != comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::LogicalAnd) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] & b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) & comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::LogicalOr) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] | b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) | comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::UGreaterThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] > b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) > comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::UGreaterThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] >= b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) >= comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ULessThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] < b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) < comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ULessThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.uv[c] <= b.value.uv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) <= comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::SGreaterThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.iv[c] > b.value.iv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) > comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::SGreaterThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.iv[c] >= b.value.iv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) >= comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::SLessThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.iv[c] < b.value.iv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) < comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::SLessThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.iv[c] <= b.value.iv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(a, c) <= comp(b, c) ? 1 : 0 + + IMPL_FOR_INT_TYPES(_IMPL); + } } // FOrd are all "Floating-point comparison if operands are ordered and Operand 1 is ... than @@ -1380,68 +1500,128 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray if(opdata.op == Op::FOrdEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] == b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) == comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FOrdNotEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] != b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) != comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FOrdGreaterThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] > b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) > comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FOrdGreaterThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] >= b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) >= comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FOrdLessThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] < b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) < comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FOrdLessThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] <= b.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) <= comp(b, c)) ? 1 : 0 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } if(opdata.op == Op::FUnordEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] != b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) != comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FUnordNotEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] == b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) == comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FUnordGreaterThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] <= b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) <= comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FUnordGreaterThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] < b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) < comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FUnordLessThan) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] >= b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) >= comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FUnordLessThanEqual) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = (var.value.fv[c] > b.value.fv[c]) ? 0 : 1; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = (comp(a, c) <= comp(b, c)) ? 0 : 1 + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } var.type = VarType::Bool; - SetDst(comp.result, var); + SetDst(compare.result, var); break; } case Op::LogicalNot: @@ -1451,7 +1631,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable var = GetSrc(negate.operand); for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = 1U - var.value.uv[c]; + setUintComp(var, c, 1U - uintComp(var, c)); var.type = VarType::Bool; @@ -1468,9 +1648,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(uint8_t c = 1; c < var.columns; c++) { if(opdata.op == Op::Any) - var.value.uv[0] |= var.value.uv[c]; + setUintComp(var, 0, uintComp(var, 0) | uintComp(var, c)); else - var.value.uv[0] &= var.value.uv[c]; + setUintComp(var, 0, uintComp(var, 0) & uintComp(var, c)); } var.columns = 1; @@ -1482,10 +1662,16 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { OpIsNan is(it); - ShaderVariable var = GetSrc(is.x); + ShaderVariable x = GetSrc(is.x); + ShaderVariable var = x; for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = RDCISNAN(var.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) setUintComp(var, c, RDCISNAN(comp(x, c)) ? 1 : 0) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } var.type = VarType::Bool; @@ -1496,10 +1682,16 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { OpIsNan is(it); - ShaderVariable var = GetSrc(is.x); + ShaderVariable x = GetSrc(is.x); + ShaderVariable var = x; for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = RDCISINF(var.value.fv[c]) ? 1 : 0; + { +#undef _IMPL +#define _IMPL(T) setUintComp(var, c, RDCISINF(comp(x, c)) ? 1 : 0); + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } var.type = VarType::Bool; @@ -1517,12 +1709,20 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { OpBitCount bitwise(it); + const DataType &type = debugger.GetType(bitwise.resultType); ShaderVariable var = GetSrc(bitwise.base); + ShaderVariable ret = var; + ret.type = type.scalar().Type(); for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = Bits::CountOnes(var.value.uv[c]); + { +#undef _IMPL +#define _IMPL(I, S, U) setUintComp(ret, c, (uint32_t)Bits::CountOnes(comp(var, c))); - SetDst(bitwise.result, var); + IMPL_FOR_INT_TYPES(_IMPL); + } + + SetDst(bitwise.result, ret); break; } case Op::BitReverse: @@ -1533,13 +1733,17 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(uint8_t c = 0; c < var.columns; c++) { - uint32_t u = var.value.uv[c]; - var.value.uv[c] = 0; - for(uint8_t b = 0; b < 32; b++) - { - uint32_t bit = u & (1u << b); - var.value.uv[c] |= bit << (31 - b); - } +#undef _IMPL +#define _IMPL(I, S, U) \ + U v = comp(var, c); \ + comp(var, c) = 0; \ + for(uint8_t b = 0; b < 32; b++) \ + { \ + uint32_t bit = (v >> b) & 0x1; \ + comp(var, c) |= bit << (31 - b); \ + } + + IMPL_FOR_INT_TYPES(_IMPL); } SetDst(bitwise.result, var); @@ -1556,17 +1760,21 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(uint8_t c = 0; c < var.columns; c++) { - const uint32_t mask = (1u << count.value.uv[c]) - 1; +#undef _IMPL +#define _IMPL(I, S, U) \ + const U mask = (U(1) << comp(count, c)) - U(1); \ + \ + comp(var, c) >>= comp(offset, c); \ + comp(var, c) &= mask; \ + \ + if(opdata.op == Op::BitFieldSExtract) \ + { \ + U topbit = (mask + U(1)) >> U(1); \ + if(comp(var, c) & topbit) \ + comp(var, c) |= (~0ULL ^ mask); \ + } - var.value.uv[c] >>= offset.value.uv[c]; - var.value.uv[c] &= (1u << count.value.uv[c]) - 1; - - if(opdata.op == Op::BitFieldSExtract) - { - uint32_t topbit = (mask + 1u) >> 1u; - if(var.value.uv[c] & topbit) - var.value.uv[c] |= (0xffffffffu ^ mask); - } + IMPL_FOR_INT_TYPES(_IMPL); } SetDst(bitwise.result, var); @@ -1583,10 +1791,14 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(uint8_t c = 0; c < var.columns; c++) { - const uint32_t mask = (1u << count.value.uv[c]) - 1; +#undef _IMPL +#define _IMPL(I, S, U) \ + const U mask = (U(1) << comp(count, c)) - U(1); \ + \ + comp(var, c) &= ~(mask << comp(offset, c)); \ + comp(var, c) |= (comp(insert, c) & mask) << comp(offset, c); - var.value.uv[c] &= ~(mask << offset.value.uv[c]); - var.value.uv[c] |= (insert.value.uv[c] & mask) << offset.value.uv[c]; + IMPL_FOR_INT_TYPES(_IMPL); } SetDst(bitwise.result, var); @@ -1607,32 +1819,62 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray if(opdata.op == Op::BitwiseOr) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] | b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) | comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::BitwiseAnd) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] & b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) & comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::BitwiseXor) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] ^ b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) ^ comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ShiftLeftLogical) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] << b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) << comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ShiftRightArithmetic) { for(uint8_t c = 0; c < var.columns; c++) - var.value.iv[c] = var.value.iv[c] >> b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) >> comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ShiftRightLogical) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = var.value.uv[c] >> b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = comp(var, c) >> comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } SetDst(bitwise.result, var); @@ -1645,7 +1887,12 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable var = GetSrc(bitwise.operand); for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] = ~var.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = ~comp(var, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } SetDst(bitwise.result, var); break; @@ -1680,125 +1927,184 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray if(opdata.op == Op::FMul) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] *= b.value.fv[c]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) *= comp(b, c) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FDiv) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] /= b.value.fv[c]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) /= comp(b, c) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FMod) { for(uint8_t c = 0; c < var.columns; c++) { - float af = var.value.fv[c], bf = b.value.fv[c]; - var.value.fv[c] = fmodf(af, bf); - if(var.value.fv[c] < 0.0f && bf >= 0.0f) - var.value.fv[c] += fabsf(bf); - else if(var.value.fv[c] >= 0.0f && bf < 0.0f) - var.value.fv[c] -= fabsf(bf); +#undef _IMPL +#define _IMPL(T) \ + T af = comp(var, c), bf = comp(b, c); \ + comp(var, c) = fmod(af, bf); \ + if(comp(var, c) < 0.0f && bf >= 0.0f) \ + comp(var, c) += fabs(bf); \ + else if(comp(var, c) >= 0.0f && bf < 0.0f) \ + comp(var, c) -= fabs(bf); + + IMPL_FOR_FLOAT_TYPES(_IMPL); } } else if(opdata.op == Op::FRem) { for(uint8_t c = 0; c < var.columns; c++) { - float af = var.value.fv[c], bf = b.value.fv[c]; - var.value.fv[c] = fmodf(af, bf); - if(var.value.fv[c] < 0.0f && af >= 0.0f) - var.value.fv[c] += fabsf(bf); - else if(var.value.fv[c] >= 0.0f && af < 0.0f) - var.value.fv[c] -= fabsf(bf); +#undef _IMPL +#define _IMPL(T) \ + T af = comp(var, c), bf = comp(b, c); \ + comp(var, c) = fmod(af, bf); \ + if(comp(var, c) < 0.0f && af >= 0.0f) \ + comp(var, c) += fabs(bf); \ + else if(comp(var, c) >= 0.0f && af < 0.0f) \ + comp(var, c) -= fabs(bf); + + IMPL_FOR_FLOAT_TYPES(_IMPL); } } else if(opdata.op == Op::FAdd) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] += b.value.fv[c]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) += comp(b, c) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::FSub) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] -= b.value.fv[c]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) -= comp(b, c) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::IMul) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] *= b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) *= comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::SDiv) { for(uint8_t c = 0; c < var.columns; c++) { - if(b.value.iv[c] != 0) - { - var.value.iv[c] /= b.value.iv[c]; - } - else - { - var.value.uv[c] = ~0U; - if(m_State) - m_State->flags |= ShaderEvents::GeneratedNanOrInf; - } +#undef _IMPL +#define _IMPL(I, S, U) \ + if(comp(b, c) != 0) \ + { \ + comp(var, c) /= comp(b, c); \ + } \ + else \ + { \ + comp(var, c) = 0; \ + if(m_State) \ + m_State->flags |= ShaderEvents::GeneratedNanOrInf; \ + } + + IMPL_FOR_INT_TYPES(_IMPL); } } else if(opdata.op == Op::UDiv) { for(uint8_t c = 0; c < var.columns; c++) { - if(b.value.uv[c] != 0) - { - var.value.uv[c] /= b.value.uv[c]; - } - else - { - var.value.uv[c] = ~0U; - if(m_State) - m_State->flags |= ShaderEvents::GeneratedNanOrInf; - } +#undef _IMPL +#define _IMPL(I, S, U) \ + if(comp(b, c) != 0) \ + { \ + comp(var, c) /= comp(b, c); \ + } \ + else \ + { \ + comp(var, c) = 0; \ + if(m_State) \ + m_State->flags |= ShaderEvents::GeneratedNanOrInf; \ + } + + IMPL_FOR_INT_TYPES(_IMPL); } } else if(opdata.op == Op::UMod) { for(uint8_t c = 0; c < var.columns; c++) { - if(b.value.uv[c] != 0) - { - var.value.uv[c] %= b.value.uv[c]; - } - else - { - var.value.uv[c] = ~0U; - if(m_State) - m_State->flags |= ShaderEvents::GeneratedNanOrInf; - } +#undef _IMPL +#define _IMPL(I, S, U) \ + if(comp(b, c) != 0) \ + { \ + comp(var, c) %= comp(b, c); \ + } \ + else \ + { \ + comp(var, c) = 0; \ + if(m_State) \ + m_State->flags |= ShaderEvents::GeneratedNanOrInf; \ + } + + IMPL_FOR_INT_TYPES(_IMPL); } } else if(opdata.op == Op::SRem || opdata.op == Op::SMod) { for(uint8_t c = 0; c < var.columns; c++) { - if(b.value.iv[c] != 0) - { - var.value.iv[c] %= b.value.iv[c]; - } - else - { - var.value.uv[c] = ~0U; - if(m_State) - m_State->flags |= ShaderEvents::GeneratedNanOrInf; - } +#undef _IMPL +#define _IMPL(I, S, U) \ + if(comp(b, c) != 0) \ + { \ + comp(var, c) %= comp(b, c); \ + } \ + else \ + { \ + comp(var, c) = 0; \ + if(m_State) \ + m_State->flags |= ShaderEvents::GeneratedNanOrInf; \ + } + + IMPL_FOR_INT_TYPES(_IMPL); } } else if(opdata.op == Op::IAdd) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] += b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) += comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } else if(opdata.op == Op::ISub) { for(uint8_t c = 0; c < var.columns; c++) - var.value.uv[c] -= b.value.uv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) -= comp(b, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } SetDst(math.result, var); @@ -1818,64 +2124,85 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable lsb = a; ShaderVariable msb = a; + uint32_t elemSize = VarTypeByteSize(a.type); + uint32_t elemBits = elemSize * 8; + if(opdata.op == Op::UMulExtended) { // if this is less than 64-bit precision inputs, we can just upcast, do the mul, and then // mask off the bits we care about - if(VarTypeByteSize(a.type) < 8) + if(elemSize < 8) { + uint32_t mask = 0xFFFFFFFFu >> (32 - elemBits); for(uint8_t c = 0; c < a.columns; c++) { - const uint64_t x = a.value.uv[c]; - const uint64_t y = b.value.uv[c]; + const uint64_t x = uintComp(a, c); + const uint64_t y = uintComp(b, c); const uint64_t res = x * y; - lsb.value.uv[c] = uint32_t(res & 0xFFFFFFFFu); - msb.value.uv[c] = uint32_t(res >> 32); + setUintComp(lsb, c, uint32_t(res & mask)); + setUintComp(msb, c, uint32_t(res >> elemBits)); } } + else + { + RDCERR("Unsupported UMulExtended on 64-bit operands"); + } } else if(opdata.op == Op::SMulExtended) { - if(VarTypeByteSize(a.type) < 8) + if(elemSize < 8) { + uint32_t mask = 0xFFFFFFFFu >> (32 - elemBits); for(uint8_t c = 0; c < a.columns; c++) { - const int64_t x = a.value.iv[c]; - const int64_t y = b.value.iv[c]; + const int64_t x = intComp(a, c); + const int64_t y = intComp(b, c); const int64_t res = x * y; - lsb.value.iv[c] = int32_t(res & 0xFFFFFFFFu); - msb.value.iv[c] = int32_t(res >> 32); + setIntComp(lsb, c, int32_t(res & mask)); + setIntComp(msb, c, int32_t(res >> elemBits)); } } + else + { + RDCERR("Unsupported SMulExtended on 64-bit operands"); + } } else if(opdata.op == Op::IAddCarry) { for(uint8_t c = 0; c < a.columns; c++) { - // unsigned overflow is well-defined to wrap around, giving us the lsb we want. - lsb.value.uv[c] = a.value.uv[c] + b.value.uv[c]; - // if the result is less than one of the operands, we overflowed so set msb - msb.value.uv[c] = (lsb.value.uv[c] < b.value.uv[c]) ? 1 : 0; +// unsigned overflow is well-defined to wrap around, giving us the lsb we want. +// if the result is less than one of the operands, we overflowed so set msb +#undef _IMPL +#define _IMPL(I, S, U) \ + comp(lsb, c) = comp(a, c) + comp(b, c); \ + comp(msb, c) = (comp(lsb, c) < comp(b, c)) ? 1 : 0; + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, a.type); } } else if(opdata.op == Op::ISubBorrow) { for(uint8_t c = 0; c < a.columns; c++) { - // if b <= a we don't need to borrow - if(b.value.uv[c] <= a.value.uv[c]) - { - msb.value.uv[c] = 0; - lsb.value.uv[c] = a.value.uv[c] - b.value.uv[c]; - } - else - { - // otherwise set borrow bit - msb.value.uv[c] = 1; - lsb.value.uv[c] = 0xFFFFFFFFu - (b.value.uv[c] - a.value.uv[c] - 1); - } +// if b <= a we don't need to borrow, otherwise set the borrow bit + +#undef _IMPL +#define _IMPL(I, S, U) \ + if(comp(b, c) <= comp(a, c)) \ + { \ + comp(msb, c) = 0; \ + comp(lsb, c) = comp(a, c) - comp(b, c); \ + } \ + else \ + { \ + comp(msb, c) = 1; \ + comp(lsb, c) = ~0ULL - (comp(b, c) - comp(a, c) - 1U); \ + } + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, a.type); } } @@ -1900,12 +2227,22 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray if(opdata.op == Op::FNegate) { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] = -var.value.fv[c]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) = -comp(var, c) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } } else if(opdata.op == Op::SNegate) { for(uint8_t c = 0; c < var.columns; c++) - var.value.iv[c] = -var.value.iv[c]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(var, c) = -comp(var, c) + + IMPL_FOR_INT_TYPES(_IMPL); + } } SetDst(math.result, var); @@ -1920,12 +2257,16 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray RDCASSERTEQUAL(var.columns, b.columns); - float ret = 0; - for(uint8_t c = 0; c < var.columns; c++) - ret += var.value.fv[c] * b.value.fv[c]; +#undef _IMPL +#define _IMPL(T) \ + T ret(0.0); \ + for(uint8_t c = 0; c < var.columns; c++) \ + ret += comp(var, c) * comp(b, c); \ + comp(var, 0) = ret; + + IMPL_FOR_FLOAT_TYPES(_IMPL); var.columns = 1; - var.value.fv[0] = ret; SetDst(dot.result, var); break; @@ -1938,7 +2279,12 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable scalar = GetSrc(mul.scalar); for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[c] *= scalar.value.fv[0]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) *= comp(scalar, 0) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } SetDst(mul.result, var); break; @@ -1951,7 +2297,12 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable scalar = GetSrc(mul.scalar); for(uint8_t c = 0; c < var.rows * var.columns; c++) - var.value.fv[c] *= scalar.value.fv[0]; + { +#undef _IMPL +#define _IMPL(T) comp(var, c) *= comp(scalar, 0) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } SetDst(mul.result, var); break; @@ -1966,20 +2317,19 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable var = vector; var.columns = matrix.columns; - float *m = matrix.value.fv; - float *v = vector.value.fv; - const DataType &type = debugger.GetType(mul.resultType); RDCASSERTEQUAL(type.vector().count, var.columns); RDCASSERTEQUAL(matrix.rows, vector.columns); for(uint8_t c = 0; c < matrix.columns; c++) { - var.value.fv[c] = 0.0f; - for(uint8_t r = 0; r < matrix.rows; r++) - { - var.value.fv[c] += m[r * matrix.columns + c] * v[r]; - } +#undef _IMPL +#define _IMPL(T) \ + comp(var, c) = 0.0; \ + for(uint8_t r = 0; r < matrix.rows; r++) \ + comp(var, c) += comp(matrix, r * matrix.columns + c) * comp(vector, r); + + IMPL_FOR_FLOAT_TYPES(_IMPL); } SetDst(mul.result, var); @@ -1994,8 +2344,15 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray std::swap(var.rows, var.columns); for(uint8_t r = 0; r < var.rows; r++) + { for(uint8_t c = 0; c < var.columns; c++) - var.value.fv[r * var.columns + c] = matrix.value.fv[c * matrix.columns + r]; + { +#undef _IMPL +#define _IMPL(T) comp(var, r * var.columns + c) = comp(matrix, c * matrix.columns + r) + + IMPL_FOR_FLOAT_TYPES(_IMPL); + } + } SetDst(transpose.result, var); break; @@ -2010,20 +2367,19 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable var = vector; var.columns = matrix.rows; - float *m = matrix.value.fv; - float *v = vector.value.fv; - const DataType &type = debugger.GetType(mul.resultType); RDCASSERTEQUAL(type.vector().count, var.columns); RDCASSERTEQUAL(matrix.columns, vector.columns); for(uint8_t r = 0; r < matrix.rows; r++) { - var.value.fv[r] = 0.0f; - for(uint8_t c = 0; c < matrix.columns; c++) - { - var.value.fv[r] += m[r * matrix.columns + c] * v[c]; - } +#undef _IMPL +#define _IMPL(T) \ + comp(var, r) = 0.0; \ + for(uint8_t c = 0; c < matrix.columns; c++) \ + comp(var, r) += comp(matrix, r * matrix.columns + c) * comp(vector, c); + + IMPL_FOR_FLOAT_TYPES(_IMPL); } SetDst(mul.result, var); @@ -2040,22 +2396,21 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray var.rows = left.rows; var.columns = right.columns; - float *l = left.value.fv; - float *r = right.value.fv; - RDCASSERTEQUAL(left.columns, right.rows); for(uint8_t dstr = 0; dstr < var.rows; dstr++) { for(uint8_t dstc = 0; dstc < var.columns; dstc++) { - float &dstval = var.value.fv[dstr * var.columns + dstc]; - dstval = 0.0f; +#undef _IMPL +#define _IMPL(T) \ + T &dstval = comp(var, dstr * var.columns + dstc); \ + dstval = 0.0; \ + \ + for(uint8_t src = 0; src < right.rows; src++) \ + dstval += comp(left, dstr * left.columns + src) * comp(right, src * right.columns + dstc); - for(uint8_t src = 0; src < right.rows; src++) - { - dstval += l[dstr * left.columns + src] * r[src * right.columns + dstc]; - } + IMPL_FOR_FLOAT_TYPES(_IMPL); } } @@ -2077,7 +2432,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { for(uint8_t c = 0; c < var.columns; c++) { - var.value.fv[r * var.columns + c] = left.value.fv[r] * right.value.fv[c]; +#undef _IMPL +#define _IMPL(T) comp(var, r * var.columns + c) = comp(left, r) * comp(right, c); + IMPL_FOR_FLOAT_TYPES(_IMPL); } } @@ -2164,7 +2521,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray sampler = img = GetSrc(image.sampledImage); uv = GetSrc(image.coordinate); - gather = GatherChannel(GetSrc(image.component).value.uv[0]); + gather = GatherChannel(uintComp(GetSrc(image.component), 0)); operands = image.imageOperands; } else if(opdata.op == Op::ImageDrefGather) @@ -2317,17 +2674,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray compare, gather, operands, result)) { // sample failed. Pretend we got 0 columns back - - result.value.uv[0] = 0; - result.value.uv[1] = 0; - result.value.uv[2] = 0; - - if(result.type == VarType::Float || result.type == VarType::Half) - result.value.fv[3] = 1.0f; - else if(result.type == VarType::Double) - result.value.dv[3] = 1.0; - else - result.value.uv[3] = 1; + set0001(result); } result.rows = 1; @@ -2360,8 +2707,8 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray debugger.GetAPIWrapper()->FillInputValue(curCoord, ShaderBuiltin::Position, 0, 0); // co-ords are relative to the current position - coord.value.uv[0] += curCoord.value.uv[0]; - coord.value.uv[1] += curCoord.value.uv[1]; + setUintComp(coord, 0, uintComp(coord, 0) + uintComp(curCoord, 0)); + setUintComp(coord, 1, uintComp(coord, 1) + uintComp(curCoord, 1)); // do it with samplegather as ImageFetch rather than a Read which caches the whole texture // on the CPU for no reason (since we can't write to it) @@ -2372,37 +2719,19 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ImageOperandsAndParamDatas(), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; - result.value.uv[1] = 0; - result.value.uv[2] = 0; - - if(result.type == VarType::Float || result.type == VarType::Half) - result.value.fv[3] = 1.0f; - else if(result.type == VarType::Double) - result.value.dv[3] = 1.0; - else - result.value.uv[3] = 1; + set0001(result); } } else { if(!debugger.GetAPIWrapper()->ReadTexel(img.GetBinding(), coord, read.imageOperands.flags & ImageOperands::Sample - ? GetSrc(read.imageOperands.sample).value.uv[0] + ? uintComp(GetSrc(read.imageOperands.sample), 0) : 0, result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; - result.value.uv[1] = 0; - result.value.uv[2] = 0; - - if(result.type == VarType::Float || result.type == VarType::Half) - result.value.fv[3] = 1.0f; - else if(result.type == VarType::Double) - result.value.dv[3] = 1.0; - else - result.value.uv[3] = 1; + set0001(result); } } @@ -2425,7 +2754,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray debugger.GetAPIWrapper()->WriteTexel(img.GetBinding(), coord, write.imageOperands.flags & ImageOperands::Sample - ? GetSrc(write.imageOperands.sample).value.uv[0] + ? uintComp(GetSrc(write.imageOperands.sample), 0) : 0, texel); @@ -2462,7 +2791,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray for(const PairLiteralIntegerIdRef &case_ : switch_.target) { - if(selector.value.uv[0] == case_.first) + if(uintComp(selector, 0) == case_.first) { targetLabel = case_.second; break; @@ -2483,7 +2812,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray OpBranchConditional branch(it); Id target = branch.falseLabel; - if(GetSrc(branch.condition).value.uv[0]) + if(uintComp(GetSrc(branch.condition), 0)) target = branch.trueLabel; JumpToLabel(target); @@ -2540,6 +2869,9 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.rows = 1; result.columns = RDCMAX(1U, resultType.vector().count) & 0xff; + // whatever the type is, we just write the full 64-bit value. If it's a 64-bit integer it gets + // it natively, or if it's a 2-vector of uint32_t then it gets the lsb/msb automatically from + // the union. result.value.u64v[0] = global.clock; SetDst(opdata.result, result); @@ -2549,11 +2881,11 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray { ShaderVariable result; - result.type = VarType::UInt; + result.type = VarType::Bool; result.rows = 1; result.columns = 1; - result.value.uv[0] = helperInvocation; + setUintComp(result, 0, helperInvocation ? 1 : 0); SetDst(opdata.result, result); break; @@ -2698,10 +3030,10 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.type = resultType.scalar().Type(); if(!debugger.GetAPIWrapper()->ReadTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result)) + uintComp(ptr.members[2], 0), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; + RDCEraseEl(result.value); } } @@ -2726,7 +3058,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray else { debugger.GetAPIWrapper()->WriteTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], value); + uintComp(ptr.members[2], 0), value); } break; @@ -2756,14 +3088,14 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.type = resultType.scalar().Type(); if(!debugger.GetAPIWrapper()->ReadTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result)) + uintComp(ptr.members[2], 0), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; + RDCEraseEl(result.value); } debugger.GetAPIWrapper()->WriteTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], value); + uintComp(ptr.members[2], 0), value); } SetDst(excg.result, result); @@ -2782,6 +3114,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray ShaderVariable result; const ShaderVariable &ptr = GetSrc(cmpexcg.pointer); const ShaderVariable &value = GetSrc(cmpexcg.value); + const ShaderVariable &comparator = GetSrc(cmpexcg.comparator); if(ptr.members.empty()) { @@ -2795,17 +3128,29 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.type = resultType.scalar().Type(); if(!debugger.GetAPIWrapper()->ReadTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result)) + uintComp(ptr.members[2], 0), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; + RDCEraseEl(result.value); } } SetDst(cmpexcg.result, result); - // write the new value, only if the value is the same as expected - if(result.value.u64v[0] == GetSrc(cmpexcg.comparator).value.u64v[0]) + uint64_t resultVal = 0, compareVal = 0; + +#undef _IMPL +#define _IMPL(I, S, U) resultVal = comp(result, 0); + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, result.type); + +#undef _IMPL +#define _IMPL(I, S, U) compareVal = comp(comparator, 0); + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, comparator.type); + + // write the new value, only if the value is the same as expected. + if(resultVal == compareVal) { if(ptr.members.empty()) { @@ -2814,7 +3159,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray else { debugger.GetAPIWrapper()->WriteTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], value); + uintComp(ptr.members[2], 0), value); } } break; @@ -2843,19 +3188,25 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.type = resultType.scalar().Type(); if(!debugger.GetAPIWrapper()->ReadTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result)) + uintComp(ptr.members[2], 0), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; + RDCEraseEl(result.value); } } SetDst(atomic.result, result); - if(opdata.op == Op::AtomicIIncrement) - result.value.uv[0]++; - else - result.value.uv[0]--; + { +#undef _IMPL +#define _IMPL(I, S, U) \ + if(opdata.op == Op::AtomicIIncrement) \ + comp(result, 0)++; \ + else \ + comp(result, 0)--; + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, result.type); + } // write the new value if(ptr.members.empty()) @@ -2865,7 +3216,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray else { debugger.GetAPIWrapper()->WriteTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result); + uintComp(ptr.members[2], 0), result); } break; } @@ -2902,35 +3253,84 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray result.type = resultType.scalar().Type(); if(!debugger.GetAPIWrapper()->ReadTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result)) + uintComp(ptr.members[2], 0), result)) { // sample failed. Pretend we got 0 columns back - result.value.uv[0] = 0; + RDCEraseEl(result.value); } } SetDst(atomic.result, result); if(opdata.op == Op::AtomicIAdd) - result.value.uv[0] += value.value.uv[0]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) += comp(value, 0) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicISub) - result.value.uv[0] -= value.value.uv[0]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) -= comp(value, 0) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicSMin) - result.value.iv[0] = RDCMIN(result.value.iv[0], value.value.iv[0]); + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) = RDCMIN(comp(result, 0), comp(value, 0)) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicUMin) - result.value.uv[0] = RDCMIN(result.value.uv[0], value.value.uv[0]); + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) = RDCMIN(comp(result, 0), comp(value, 0)) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicSMax) - result.value.iv[0] = RDCMAX(result.value.iv[0], value.value.iv[0]); + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) = RDCMAX(comp(result, 0), comp(value, 0)) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicUMax) - result.value.uv[0] = RDCMAX(result.value.uv[0], value.value.uv[0]); + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) = RDCMAX(comp(result, 0), comp(value, 0)) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicAnd) - result.value.uv[0] &= value.value.uv[0]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) &= comp(value, 0) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicOr) - result.value.uv[0] |= value.value.uv[0]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) |= comp(value, 0) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicXor) - result.value.uv[0] ^= value.value.uv[0]; + { +#undef _IMPL +#define _IMPL(I, S, U) comp(result, 0) ^= comp(value, 0) + + IMPL_FOR_INT_TYPES_FOR_TYPE(_IMPL, value.type); + } else if(opdata.op == Op::AtomicFAddEXT) - result.value.fv[0] += value.value.fv[0]; + { +#undef _IMPL +#define _IMPL(T) comp(result, 0) += comp(value, 0) + IMPL_FOR_FLOAT_TYPES_FOR_TYPE(_IMPL, value.type); + } // write the new value if(ptr.members.empty()) @@ -2940,7 +3340,7 @@ void ThreadState::StepNext(ShaderDebugState *state, const rdcarray else { debugger.GetAPIWrapper()->WriteTexel(ptr.members[0].GetBinding(), ptr.members[1], - ptr.members[2].value.uv[0], result); + uintComp(ptr.members[2], 0), result); } break; }