From 7a052278ef67d33ae97a3814713e8aa7e6ea805e Mon Sep 17 00:00:00 2001 From: Jonathan Glines Date: Mon, 9 Jun 2025 11:25:05 -0700 Subject: [PATCH] Update NVCounterEnumerator to use the latest Nsight PerfSDK APIs This updates NvCounterEnumerator to use the RawCounterConfig and counter availability APIs available in recent Nsight PerfSDK headers. Using these APIs to filter metrics by counter availability prevents crashes and NaN values on some GPUs. --- .../driver/ihv/nv/nv_counter_enumerator.cpp | 133 ++++++++++++++++-- .../driver/ihv/nv/nv_counter_enumerator.h | 12 +- 2 files changed, 129 insertions(+), 16 deletions(-) diff --git a/renderdoc/driver/ihv/nv/nv_counter_enumerator.cpp b/renderdoc/driver/ihv/nv/nv_counter_enumerator.cpp index c54ade803..b61e2f182 100644 --- a/renderdoc/driver/ihv/nv/nv_counter_enumerator.cpp +++ b/renderdoc/driver/ihv/nv/nv_counter_enumerator.cpp @@ -37,12 +37,17 @@ #include "NvPerfCounterConfiguration.h" #include "NvPerfCounterData.h" +#include "NvPerfMetricsConfigBuilder.h" #include "NvPerfMetricsEvaluator.h" +#include <unordered_set> + struct NVCounterEnumerator::Impl { public: nv::perf::MetricsEvaluator Evaluator; + nv::perf::RawCounterConfigBuilder RawCounterConfigBuilder; + bytebuf CounterAvailabilityImage; nv::perf::CounterConfiguration SelectedConfiguration; // configImage etc. 
for the current selection rdcarray SelectedExternalIds; @@ -111,9 +116,13 @@ static CounterUnit ToCounterUnit(const std::vector &dimUnits return CounterUnit::Absolute; } -bool NVCounterEnumerator::Init(nv::perf::MetricsEvaluator &&metricsEvaluator) +bool NVCounterEnumerator::Init(nv::perf::MetricsEvaluator &&metricsEvaluator, + nv::perf::RawCounterConfigBuilder &&rawCounterConfigBuilder, + bytebuf &&counterAvailabilityImage) { m_Impl->Evaluator = std::move(metricsEvaluator); + m_Impl->RawCounterConfigBuilder = std::move(rawCounterConfigBuilder); + m_Impl->CounterAvailabilityImage = std::move(counterAvailabilityImage); return true; } @@ -126,6 +135,72 @@ void NVCounterEnumerator::Impl::InitEnumerateCounters() m_EnumerationDone = true; + if(!CounterAvailabilityImage.empty()) + { + NVPA_Status result; + NVPW_RawCounterConfig_SetCounterAvailability_Params params = {}; + params.structSize = sizeof(params); + params.pRawCounterConfig = RawCounterConfigBuilder; + params.pCounterAvailabilityImage = CounterAvailabilityImage.data(); + result = NVPW_RawCounterConfig_SetCounterAvailability(&params); + if(result != NVPA_STATUS_SUCCESS) + { + NV_PERF_LOG_ERR(50, "NvPerf could not determine counter availability for this GPU"); + return; + } + } + + rdcarray<uint32_t> availableDomains; + { + const std::vector<NVPW_RawCounterDomain> availableSCDs = + RawCounterConfigBuilder.GetAllAvailableSingularCounterDomains(); + for(NVPW_RawCounterDomain scd : availableSCDs) + { + availableDomains.push_back((uint32_t)scd); + } + + std::vector<uint32_t> availableCDGs = + RawCounterConfigBuilder.GetAllAvailableCooperativeDomainGroups(); + for(uint32_t cdg : availableCDGs) + { + availableDomains.push_back(cdg); + } + } + + std::unordered_set<rdcstr> availableCounters; + if(!RawCounterConfigBuilder.BeginPassGroupsAll()) + { + NV_PERF_LOG_ERR(50, "NvPerf failed to begin pass group"); + return; + } + for(uint32_t domain : availableDomains) + { + const size_t numCounters = RawCounterConfigBuilder.GetNumRawCounters(domain); + if(numCounters == SIZE_MAX) + 
{ + NV_PERF_LOG_ERR(50, + "NvPerf failed to determine number of raw counters available for this GPU"); + return; + } + for(size_t counterIdx = 0; counterIdx < numCounters; ++counterIdx) + { + const char *pCounterName = RawCounterConfigBuilder.GetRawCounterName(domain, counterIdx); + if(!pCounterName) + { + NV_PERF_LOG_ERR(50, "NvPerf failed to obtain raw counter name"); + return; + } + availableCounters.insert(pCounterName); + } + } + if(!RawCounterConfigBuilder.EndPassGroupsAll()) + { + NV_PERF_LOG_ERR(50, "NvPerf failed to end pass group"); + return; + } + + std::vector<const char *> rawDependencies, optionalRawDependencies; + struct MetricAttribute { NVPW_MetricType metricType; @@ -190,6 +265,29 @@ void NVCounterEnumerator::Impl::InitEnumerateCounters() continue; } + //----------------- + // Filter out metrics that cannot be scheduled + rawDependencies.clear(); + optionalRawDependencies.clear(); + if(!Evaluator.GetMetricRawCounterDependencies(&evalReq, 1, rawDependencies, + optionalRawDependencies)) + { + NV_PERF_LOG_ERR(50, "NvPerf failed to determine raw counter dependencies for metric \"%s\"", + counterName); + return; + } + bool metricAvailable = true; + for(const char *dependency : rawDependencies) + { + if(availableCounters.find(rdcstr(dependency)) == availableCounters.end()) + { + metricAvailable = false; + break; + } + } + if(!metricAvailable) + continue; + CounterDescription desc = {}; desc.resultType = CompType::Float; desc.resultByteWidth = 8; @@ -288,13 +386,13 @@ bool NVCounterEnumerator::HasCounter(GPUCounter counterID) } bool NVCounterEnumerator::CreateConfig(const char *pChipName, - NVPA_RawMetricsConfig *pRawMetricsConfig, + NVPW_RawCounterConfig *pRawCounterConfig, const rdcarray &counters) { nv::perf::MetricsConfigBuilder metricsConfigBuilder; - if(!metricsConfigBuilder.Initialize(m_Impl->Evaluator, pRawMetricsConfig, pChipName)) + if(!metricsConfigBuilder.Initialize(m_Impl->Evaluator, pRawCounterConfig, pChipName)) { - RDCERR("NvPerf failed to initialize 
config builder"); + NV_PERF_LOG_ERR(50, "NvPerf failed to initialize config builder"); return false; } @@ -315,13 +413,13 @@ bool NVCounterEnumerator::CreateConfig(const char *pChipName, // std::string metricName = nv::perf::ToString(m_Impl->Evaluator, evalReq); const char *metricName = nv::perf::ToCString( m_Impl->Evaluator, (NVPW_MetricType)evalReq.metricType, evalReq.metricIndex); - RDCERR("NvPerf failed to configure metric: %s", metricName); + NV_PERF_LOG_ERR(50, "NvPerf failed to configure metric: %s", metricName); } } if(!metricsConfigBuilder.PrepareConfigImage()) { - RDCERR("NvPerf failed to prepare config image"); + NV_PERF_LOG_ERR(50, "NvPerf failed to prepare config image"); return false; } @@ -369,7 +467,7 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage, m_Impl->Evaluator, counterDataImage, counterDataImageSize); if(!setDeviceSuccess) { - RDCERR("NvPerf failed to determine device attributes from counter data"); + NV_PERF_LOG_ERR(50, "NvPerf failed to determine device attributes from counter data"); return false; } @@ -384,14 +482,14 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage, counterDataImage, rangeIndex, '/', &leafRangeName); if(!leafRangeName) { - RDCERR("Failed to access NvPerf range name"); + NV_PERF_LOG_ERR(50, "Failed to access NvPerf range name"); continue; } errno = 0; uint32_t eid = (uint32_t)strtoul(leafRangeName, NULL, 10); if(errno != 0) { - RDCERR("Failed to parse NvPerf range name: %s", leafRangeName); + NV_PERF_LOG_ERR(50, "Failed to parse NvPerf range name: %s", leafRangeName); continue; } @@ -401,7 +499,7 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage, m_Impl->SelectedEvalRequests.data(), doubleValues.data()); if(!evalSuccess) { - RDCERR("NvPerf failed to evaluate GPU metrics for range: %s", leafRangeName); + NV_PERF_LOG_ERR(50, "NvPerf failed to evaluate GPU metrics for range: %s", leafRangeName); continue; } for(size_t counterIndex = 0; 
counterIndex < m_Impl->SelectedExternalIds.size(); ++counterIndex) @@ -435,7 +533,7 @@ bool NVCounterEnumerator::InitializeNvPerf() return nv::perf::InitializeNvPerf(); } -CounterDescription NVCounterEnumerator::LibraryNotFoundMessage() +static CounterDescription DownloadLibraryMessage(const char *message) { rdcstr pluginPath = FileIO::GetAppFolderFilename( #if ENABLED(RDOC_WIN32) @@ -458,7 +556,7 @@ CounterDescription NVCounterEnumerator::LibraryNotFoundMessage() CounterDescription desc = {}; desc.resultType = CompType::Typeless; desc.resultByteWidth = 0; - desc.name = "ERROR: Could not find Nsight Perf SDK library"; + desc.name = message; desc.description = StringFormat::Fmt( "To use these counters, please:" "
    " @@ -484,3 +582,14 @@ CounterDescription NVCounterEnumerator::LibraryNotFoundMessage() return desc; } + +CounterDescription NVCounterEnumerator::LibraryNotFoundMessage() +{ + return DownloadLibraryMessage("ERROR: Could not find Nsight Perf SDK library"); +} + +CounterDescription NVCounterEnumerator::LibraryNotSupportedMessage() +{ + return DownloadLibraryMessage( + "ERROR: Installed version of Nsight Perf SDK library is not supported"); +} diff --git a/renderdoc/driver/ihv/nv/nv_counter_enumerator.h b/renderdoc/driver/ihv/nv/nv_counter_enumerator.h index 2640afd06..e9ac4b6d9 100644 --- a/renderdoc/driver/ihv/nv/nv_counter_enumerator.h +++ b/renderdoc/driver/ihv/nv/nv_counter_enumerator.h @@ -30,12 +30,13 @@ #include "api/replay/replay_enums.h" #include "common/common.h" -struct NVPA_RawMetricsConfig; +struct NVPW_RawCounterConfig; namespace nv { namespace perf { class MetricsEvaluator; +class RawCounterConfigBuilder; } } @@ -45,14 +46,16 @@ public: NVCounterEnumerator(); ~NVCounterEnumerator(); - // This function takes ownership of metricsEvaluator. - bool Init(nv::perf::MetricsEvaluator &&metricsEvaluator); + // This function takes ownership of metricsEvaluator and rawCounterConfigBuilder. 
+ bool Init(nv::perf::MetricsEvaluator &&metricsEvaluator, + nv::perf::RawCounterConfigBuilder &&rawCounterConfigBuilder, + bytebuf &&counterAvailabilityImage); rdcarray GetPublicCounterIds(); CounterDescription GetCounterDescription(GPUCounter counterID); bool HasCounter(GPUCounter counterID); - bool CreateConfig(const char *pChipName, NVPA_RawMetricsConfig *pRawMetricsConfig, + bool CreateConfig(const char *pChipName, NVPW_RawCounterConfig *pRawCounterConfig, const rdcarray &counters); void GetConfig(const uint8_t *&pConfigImage, size_t &configImageSize, const uint8_t *&pCounterDataPrefix, size_t &counterDataPrefixSize); @@ -64,6 +67,7 @@ public: static bool InitializeNvPerf(); static CounterDescription LibraryNotFoundMessage(); + static CounterDescription LibraryNotSupportedMessage(); private: struct Impl;