Update NVCounterEnumerator to use the latest Nsight PerfSDK APIs

This updates NvCounterEnumerator to use the RawCounterConfig and counter
availability APIs available in recent Nsight PerfSDK headers. Using
these APIs to filter metrics by counter availability prevents crashes
and NaN values on some GPUs.
This commit is contained in:
Jonathan Glines
2025-06-09 11:25:05 -07:00
committed by Baldur Karlsson
parent 4133ee7a43
commit 7a052278ef
2 changed files with 129 additions and 16 deletions
+121 -12
View File
@@ -37,12 +37,17 @@
#include "NvPerfCounterConfiguration.h"
#include "NvPerfCounterData.h"
#include "NvPerfMetricsConfigBuilder.h"
#include "NvPerfMetricsEvaluator.h"
#include <unordered_set>
struct NVCounterEnumerator::Impl
{
public:
nv::perf::MetricsEvaluator Evaluator;
nv::perf::RawCounterConfigBuilder RawCounterConfigBuilder;
bytebuf CounterAvailabilityImage;
nv::perf::CounterConfiguration SelectedConfiguration; // configImage etc. for the current selection
rdcarray<GPUCounter> SelectedExternalIds;
@@ -111,9 +116,13 @@ static CounterUnit ToCounterUnit(const std::vector<NVPW_DimUnitFactor> &dimUnits
return CounterUnit::Absolute;
}
bool NVCounterEnumerator::Init(nv::perf::MetricsEvaluator &&metricsEvaluator)
bool NVCounterEnumerator::Init(nv::perf::MetricsEvaluator &&metricsEvaluator,
nv::perf::RawCounterConfigBuilder &&rawCounterConfigBuilder,
bytebuf &&counterAvailabilityImage)
{
m_Impl->Evaluator = std::move(metricsEvaluator);
m_Impl->RawCounterConfigBuilder = std::move(rawCounterConfigBuilder);
m_Impl->CounterAvailabilityImage = std::move(counterAvailabilityImage);
return true;
}
@@ -126,6 +135,72 @@ void NVCounterEnumerator::Impl::InitEnumerateCounters()
m_EnumerationDone = true;
if(!CounterAvailabilityImage.empty())
{
NVPA_Status result;
NVPW_RawCounterConfig_SetCounterAvailability_Params params = {};
params.structSize = sizeof(params);
params.pRawCounterConfig = RawCounterConfigBuilder;
params.pCounterAvailabilityImage = CounterAvailabilityImage.data();
result = NVPW_RawCounterConfig_SetCounterAvailability(&params);
if(result != NVPA_STATUS_SUCCESS)
{
NV_PERF_LOG_ERR(50, "NvPerf could not determine counter availability for this GPU");
return;
}
}
rdcarray<uint32_t> availableDomains;
{
const std::vector<NVPW_RawCounterDomain> availableSCDs =
RawCounterConfigBuilder.GetAllAvailableSingularCounterDomains();
for(NVPW_RawCounterDomain scd : availableSCDs)
{
availableDomains.push_back((uint32_t)scd);
}
std::vector<uint32_t> availableCDGs =
RawCounterConfigBuilder.GetAllAvailableCooperativeDomainGroups();
for(uint32_t cdg : availableCDGs)
{
availableDomains.push_back(cdg);
}
}
std::unordered_set<rdcstr> availableCounters;
if(!RawCounterConfigBuilder.BeginPassGroupsAll())
{
NV_PERF_LOG_ERR(50, "NvPerf failed to begin pass group");
return;
}
for(uint32_t domain : availableDomains)
{
const size_t numCounters = RawCounterConfigBuilder.GetNumRawCounters(domain);
if(numCounters == SIZE_MAX)
{
NV_PERF_LOG_ERR(50,
"NvPerf failed to determine number of raw counters available for this GPU");
return;
}
for(size_t counterIdx = 0; counterIdx < numCounters; ++counterIdx)
{
const char *pCounterName = RawCounterConfigBuilder.GetRawCounterName(domain, counterIdx);
if(!pCounterName)
{
NV_PERF_LOG_ERR(50, "NvPerf failed to obtain raw counter name");
return;
}
availableCounters.insert(pCounterName);
}
}
if(!RawCounterConfigBuilder.EndPassGroupsAll())
{
NV_PERF_LOG_ERR(50, "NvPerf failed to end pass group");
return;
}
std::vector<const char *> rawDependencies, optionalRawDependencies;
struct MetricAttribute
{
NVPW_MetricType metricType;
@@ -190,6 +265,29 @@ void NVCounterEnumerator::Impl::InitEnumerateCounters()
continue;
}
//-----------------
// Filter out metrics that cannot be scheduled
rawDependencies.clear();
optionalRawDependencies.clear();
if(!Evaluator.GetMetricRawCounterDependencies(&evalReq, 1, rawDependencies,
optionalRawDependencies))
{
NV_PERF_LOG_ERR(50, "NvPerf failed to determine raw counter dependencies for metric \"%s\"",
counterName);
return;
}
bool metricAvailable = true;
for(const char *dependency : rawDependencies)
{
if(availableCounters.find(rdcstr(dependency)) == availableCounters.end())
{
metricAvailable = false;
break;
}
}
if(!metricAvailable)
continue;
CounterDescription desc = {};
desc.resultType = CompType::Float;
desc.resultByteWidth = 8;
@@ -288,13 +386,13 @@ bool NVCounterEnumerator::HasCounter(GPUCounter counterID)
}
bool NVCounterEnumerator::CreateConfig(const char *pChipName,
NVPA_RawMetricsConfig *pRawMetricsConfig,
NVPW_RawCounterConfig *pRawCounterConfig,
const rdcarray<GPUCounter> &counters)
{
nv::perf::MetricsConfigBuilder metricsConfigBuilder;
if(!metricsConfigBuilder.Initialize(m_Impl->Evaluator, pRawMetricsConfig, pChipName))
if(!metricsConfigBuilder.Initialize(m_Impl->Evaluator, pRawCounterConfig, pChipName))
{
RDCERR("NvPerf failed to initialize config builder");
NV_PERF_LOG_ERR(50, "NvPerf failed to initialize config builder");
return false;
}
@@ -315,13 +413,13 @@ bool NVCounterEnumerator::CreateConfig(const char *pChipName,
// std::string metricName = nv::perf::ToString(m_Impl->Evaluator, evalReq);
const char *metricName = nv::perf::ToCString(
m_Impl->Evaluator, (NVPW_MetricType)evalReq.metricType, evalReq.metricIndex);
RDCERR("NvPerf failed to configure metric: %s", metricName);
NV_PERF_LOG_ERR(50, "NvPerf failed to configure metric: %s", metricName);
}
}
if(!metricsConfigBuilder.PrepareConfigImage())
{
RDCERR("NvPerf failed to prepare config image");
NV_PERF_LOG_ERR(50, "NvPerf failed to prepare config image");
return false;
}
@@ -369,7 +467,7 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage,
m_Impl->Evaluator, counterDataImage, counterDataImageSize);
if(!setDeviceSuccess)
{
RDCERR("NvPerf failed to determine device attributes from counter data");
NV_PERF_LOG_ERR(50, "NvPerf failed to determine device attributes from counter data");
return false;
}
@@ -384,14 +482,14 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage,
counterDataImage, rangeIndex, '/', &leafRangeName);
if(!leafRangeName)
{
RDCERR("Failed to access NvPerf range name");
NV_PERF_LOG_ERR(50, "Failed to access NvPerf range name");
continue;
}
errno = 0;
uint32_t eid = (uint32_t)strtoul(leafRangeName, NULL, 10);
if(errno != 0)
{
RDCERR("Failed to parse NvPerf range name: %s", leafRangeName);
NV_PERF_LOG_ERR(50, "Failed to parse NvPerf range name: %s", leafRangeName);
continue;
}
@@ -401,7 +499,7 @@ bool NVCounterEnumerator::EvaluateMetrics(const uint8_t *counterDataImage,
m_Impl->SelectedEvalRequests.data(), doubleValues.data());
if(!evalSuccess)
{
RDCERR("NvPerf failed to evaluate GPU metrics for range: %s", leafRangeName);
NV_PERF_LOG_ERR(50, "NvPerf failed to evaluate GPU metrics for range: %s", leafRangeName);
continue;
}
for(size_t counterIndex = 0; counterIndex < m_Impl->SelectedExternalIds.size(); ++counterIndex)
@@ -435,7 +533,7 @@ bool NVCounterEnumerator::InitializeNvPerf()
return nv::perf::InitializeNvPerf();
}
CounterDescription NVCounterEnumerator::LibraryNotFoundMessage()
static CounterDescription DownloadLibraryMessage(const char *message)
{
rdcstr pluginPath = FileIO::GetAppFolderFilename(
#if ENABLED(RDOC_WIN32)
@@ -458,7 +556,7 @@ CounterDescription NVCounterEnumerator::LibraryNotFoundMessage()
CounterDescription desc = {};
desc.resultType = CompType::Typeless;
desc.resultByteWidth = 0;
desc.name = "ERROR: Could not find Nsight Perf SDK library";
desc.name = message;
desc.description = StringFormat::Fmt(
"To use these counters, please:"
"<ol>"
@@ -484,3 +582,14 @@ CounterDescription NVCounterEnumerator::LibraryNotFoundMessage()
return desc;
}
CounterDescription NVCounterEnumerator::LibraryNotFoundMessage()
{
return DownloadLibraryMessage("ERROR: Could not find Nsight Perf SDK library");
}
CounterDescription NVCounterEnumerator::LibraryNotSupportedMessage()
{
return DownloadLibraryMessage(
"ERROR: Installed version of Nsight Perf SDK library is not supported");
}
@@ -30,12 +30,13 @@
#include "api/replay/replay_enums.h"
#include "common/common.h"
struct NVPA_RawMetricsConfig;
struct NVPW_RawCounterConfig;
namespace nv
{
namespace perf
{
class MetricsEvaluator;
class RawCounterConfigBuilder;
}
}
@@ -45,14 +46,16 @@ public:
NVCounterEnumerator();
~NVCounterEnumerator();
// This function takes ownership of metricsEvaluator.
bool Init(nv::perf::MetricsEvaluator &&metricsEvaluator);
// This function takes ownership of metricsEvaluator and rawCounterConfigBuilder.
bool Init(nv::perf::MetricsEvaluator &&metricsEvaluator,
nv::perf::RawCounterConfigBuilder &&rawCounterConfigBuilder,
bytebuf &&counterAvailabilityImage);
rdcarray<GPUCounter> GetPublicCounterIds();
CounterDescription GetCounterDescription(GPUCounter counterID);
bool HasCounter(GPUCounter counterID);
bool CreateConfig(const char *pChipName, NVPA_RawMetricsConfig *pRawMetricsConfig,
bool CreateConfig(const char *pChipName, NVPW_RawCounterConfig *pRawCounterConfig,
const rdcarray<GPUCounter> &counters);
void GetConfig(const uint8_t *&pConfigImage, size_t &configImageSize,
const uint8_t *&pCounterDataPrefix, size_t &counterDataPrefixSize);
@@ -64,6 +67,7 @@ public:
static bool InitializeNvPerf();
static CounterDescription LibraryNotFoundMessage();
static CounterDescription LibraryNotSupportedMessage();
private:
struct Impl;