Update HWCPipe to support performance counters of Dimensity 9000 SoC

Base HWCPipe commit: https://github.com/ARM-software/HWCPipe/commit/8cc02065b4ef249127aa0164dc0d62d65c0d4203
This commit is contained in:
kmchan
2022-04-27 00:09:12 +08:00
committed by Baldur Karlsson
parent f54d28c2cd
commit 85789e6209
15 changed files with 6681 additions and 3591 deletions
@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2019 Arm Software
Copyright (c) 2019-2022 Arm Limited
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -42,17 +42,41 @@ enum class CpuCounter
BranchInstructions,
BranchMisses,
L1Accesses,
InstrRetired,
L2Accesses,
L3Accesses,
BusReads,
BusWrites,
MemReads,
MemWrites,
ASESpec,
VFPSpec,
CryptoSpec,
MaxValue
};
// Mapping from CPU counter names to enum values. Used for JSON initialization.
const std::unordered_map<std::string, CpuCounter> cpu_counter_names{
{"Cycles", CpuCounter::Cycles},
{"Instructions", CpuCounter::Instructions},
{"CacheReferences", CpuCounter::CacheReferences},
{"CacheMisses", CpuCounter::CacheMisses},
{"BranchInstructions", CpuCounter::BranchInstructions},
{"BranchMisses", CpuCounter::BranchMisses},
const std::unordered_map<std::string, CpuCounter> cpu_counter_names {
{"Cycles", CpuCounter::Cycles},
{"Instructions", CpuCounter::Instructions},
{"CacheReferences", CpuCounter::CacheReferences},
{"CacheMisses", CpuCounter::CacheMisses},
{"BranchInstructions", CpuCounter::BranchInstructions},
{"BranchMisses", CpuCounter::BranchMisses},
{"L1Accesses", CpuCounter::L1Accesses},
{"InstrRetired", CpuCounter::InstrRetired},
{"L2Accesses", CpuCounter::L2Accesses},
{"L3Accesses", CpuCounter::L3Accesses},
{"BusReads", CpuCounter::BusReads},
{"BusWrites", CpuCounter::BusWrites},
{"MemReads", CpuCounter::MemReads},
{"MemWrites", CpuCounter::MemWrites},
{"ASESpec", CpuCounter::ASESpec},
{"VFPSpec", CpuCounter::VFPSpec},
{"CryptoSpec", CpuCounter::CryptoSpec},
};
// A hash function for CpuCounter values
@@ -72,23 +96,35 @@ struct CpuCounterInfo
};
// Mapping from each counter to its corresponding information (description and unit)
const std::unordered_map<CpuCounter, CpuCounterInfo, CpuCounterHash> cpu_counter_info{
{CpuCounter::Cycles, {"Number of CPU cycles", "cycles"}},
{CpuCounter::Instructions, {"Number of CPU instructions", "instructions"}},
{CpuCounter::CacheReferences, {"Number of cache references", "references"}},
{CpuCounter::CacheMisses, {"Number of cache misses", "misses"}},
{CpuCounter::BranchInstructions, {"Number of branch instructions", "instructions"}},
{CpuCounter::BranchMisses, {"Number of branch misses", "misses"}},
const std::unordered_map<CpuCounter, CpuCounterInfo, CpuCounterHash> cpu_counter_info {
{CpuCounter::Cycles, {"Number of CPU cycles", "cycles"}},
{CpuCounter::Instructions, {"Number of CPU instructions", "instructions"}},
{CpuCounter::CacheReferences, {"Number of cache references", "references"}},
{CpuCounter::CacheMisses, {"Number of cache misses", "misses"}},
{CpuCounter::BranchInstructions, {"Number of branch instructions", "instructions"}},
{CpuCounter::BranchMisses, {"Number of branch misses", "misses"}},
{CpuCounter::L1Accesses, {"L1 data cache accesses", "accesses"}},
{CpuCounter::InstrRetired, {"All retired instructions", "instructions"}},
{CpuCounter::L2Accesses, {"L2 data cache accesses", "accesses"}},
{CpuCounter::L3Accesses, {"L3 data cache accesses", "accesses"}},
{CpuCounter::BusReads, {"Bus access reads", "beats"}},
{CpuCounter::BusWrites, {"Bus access writes", "beats"}},
{CpuCounter::MemReads, {"Data memory access, load instructions", "instructions"}},
{CpuCounter::MemWrites, {"Data memory access, store instructions", "instructions"}},
{CpuCounter::ASESpec, {"Speculatively executed SIMD operations", "operations"}},
{CpuCounter::VFPSpec, {"Speculatively executed floating point operations", "operations"}},
{CpuCounter::CryptoSpec, {"Speculatively executed cryptographic operations", "operations"}},
};
typedef std::unordered_set<CpuCounter, CpuCounterHash> CpuCounterSet;
typedef std::unordered_map<CpuCounter, Value, CpuCounterHash>
CpuMeasurements;
CpuMeasurements;
/** An interface for classes that collect CPU performance data. */
class CpuProfiler
{
public:
public:
virtual ~CpuProfiler() = default;
// Returns the enabled counters
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019 ARM Limited.
* Copyright (c) 2019-2022 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,16 +36,25 @@ namespace hwcpipe
enum class GpuCounter
{
GpuCycles,
ComputeCycles,
VertexCycles,
VertexComputeCycles,
FragmentCycles,
TilerCycles,
ComputeJobs,
VertexJobs,
VertexComputeJobs,
FragmentJobs,
Pixels,
CulledPrimitives,
VisiblePrimitives,
InputPrimitives,
Tiles,
TransactionEliminations,
EarlyZTests,
EarlyZKilled,
LateZTests,
@@ -54,13 +63,17 @@ enum class GpuCounter
Instructions,
DivergedInstructions,
ShaderComputeCycles,
ShaderFragmentCycles,
ShaderCycles,
ShaderArithmeticCycles,
ShaderInterpolatorCycles,
ShaderLoadStoreCycles,
ShaderTextureCycles,
CacheReadLookups,
CacheWriteLookups,
ExternalMemoryReadAccesses,
ExternalMemoryWriteAccesses,
ExternalMemoryReadStalls,
@@ -72,39 +85,52 @@ enum class GpuCounter
};
// Mapping from GPU counter names to enum values. Used for JSON initialization.
// Every key is spelled exactly like the enumerator it selects.
const std::unordered_map<std::string, GpuCounter> gpu_counter_names{
{"GpuCycles", GpuCounter::GpuCycles},
{"VertexComputeCycles", GpuCounter::VertexComputeCycles},
{"FragmentCycles", GpuCounter::FragmentCycles},
{"TilerCycles", GpuCounter::TilerCycles},
const std::unordered_map<std::string, GpuCounter> gpu_counter_names {
{"GpuCycles", GpuCounter::GpuCycles},
{"ComputeCycles", GpuCounter::ComputeCycles},
{"VertexCycles", GpuCounter::VertexCycles},
{"VertexComputeCycles", GpuCounter::VertexComputeCycles},
{"FragmentCycles", GpuCounter::FragmentCycles},
{"TilerCycles", GpuCounter::TilerCycles},
{"VertexComputeJobs", GpuCounter::VertexComputeJobs},
{"Tiles", GpuCounter::Tiles},
{"TransactionEliminations", GpuCounter::TransactionEliminations},
{"FragmentJobs", GpuCounter::FragmentJobs},
{"Pixels", GpuCounter::Pixels},
// "ComputeJobs" must select the dedicated ComputeJobs counter, not the combined VertexComputeJobs one.
{"ComputeJobs", GpuCounter::ComputeJobs},
{"VertexJobs", GpuCounter::VertexJobs},
{"VertexComputeJobs", GpuCounter::VertexComputeJobs},
{"FragmentJobs", GpuCounter::FragmentJobs},
{"Pixels", GpuCounter::Pixels},
{"EarlyZTests", GpuCounter::EarlyZTests},
{"EarlyZKilled", GpuCounter::EarlyZKilled},
{"LateZTests", GpuCounter::LateZTests},
{"LateZKilled", GpuCounter::LateZKilled},
{"CulledPrimitives", GpuCounter::CulledPrimitives},
{"VisiblePrimitives", GpuCounter::VisiblePrimitives},
{"InputPrimitives", GpuCounter::InputPrimitives},
{"Instructions", GpuCounter::Instructions},
{"DivergedInstructions", GpuCounter::DivergedInstructions},
{"Tiles", GpuCounter::Tiles},
{"TransactionEliminations", GpuCounter::TransactionEliminations},
{"ShaderCycles", GpuCounter::ShaderCycles},
{"ShaderArithmeticCycles", GpuCounter::ShaderArithmeticCycles},
{"ShaderLoadStoreCycles", GpuCounter::ShaderLoadStoreCycles},
{"ShaderTextureCycles", GpuCounter::ShaderTextureCycles},
{"EarlyZTests", GpuCounter::EarlyZTests},
{"EarlyZKilled", GpuCounter::EarlyZKilled},
{"LateZTests", GpuCounter::LateZTests},
{"LateZKilled", GpuCounter::LateZKilled},
{"CacheReadLookups", GpuCounter::CacheReadLookups},
{"CacheWriteLookups", GpuCounter::CacheWriteLookups},
{"ExternalMemoryReadAccesses", GpuCounter::ExternalMemoryReadAccesses},
{"ExternalMemoryWriteAccesses", GpuCounter::ExternalMemoryWriteAccesses},
{"ExternalMemoryReadStalls", GpuCounter::ExternalMemoryReadStalls},
{"ExternalMemoryWriteStalls", GpuCounter::ExternalMemoryWriteStalls},
{"ExternalMemoryReadBytes", GpuCounter::ExternalMemoryReadBytes},
{"ExternalMemoryWriteBytes", GpuCounter::ExternalMemoryWriteBytes},
{"Instructions", GpuCounter::Instructions},
{"DivergedInstructions", GpuCounter::DivergedInstructions},
{"ShaderComputeCycles", GpuCounter::ShaderComputeCycles},
{"ShaderFragmentCycles", GpuCounter::ShaderFragmentCycles},
{"ShaderCycles", GpuCounter::ShaderCycles},
{"ShaderArithmeticCycles", GpuCounter::ShaderArithmeticCycles},
{"ShaderInterpolatorCycles", GpuCounter::ShaderInterpolatorCycles},
{"ShaderLoadStoreCycles", GpuCounter::ShaderLoadStoreCycles},
{"ShaderTextureCycles", GpuCounter::ShaderTextureCycles},
{"CacheReadLookups", GpuCounter::CacheReadLookups},
{"CacheWriteLookups", GpuCounter::CacheWriteLookups},
{"ExternalMemoryReadAccesses", GpuCounter::ExternalMemoryReadAccesses},
{"ExternalMemoryWriteAccesses", GpuCounter::ExternalMemoryWriteAccesses},
{"ExternalMemoryReadStalls", GpuCounter::ExternalMemoryReadStalls},
{"ExternalMemoryWriteStalls", GpuCounter::ExternalMemoryWriteStalls},
{"ExternalMemoryReadBytes", GpuCounter::ExternalMemoryReadBytes},
{"ExternalMemoryWriteBytes", GpuCounter::ExternalMemoryWriteBytes},
};
// A hash function for GpuCounter values
@@ -124,39 +150,52 @@ struct GpuCounterInfo
};
// Mapping from each counter to its corresponding information (description and unit)
const std::unordered_map<GpuCounter, GpuCounterInfo, GpuCounterHash> gpu_counter_info{
{GpuCounter::GpuCycles, {"Number of GPU cycles", "cycles"}},
{GpuCounter::VertexComputeCycles, {"Number of vertex/compute cycles", "cycles"}},
{GpuCounter::FragmentCycles, {"Number of fragment cycles", "cycles"}},
{GpuCounter::TilerCycles, {"Number of tiler cycles", "cycles"}},
const std::unordered_map<GpuCounter, GpuCounterInfo, GpuCounterHash> gpu_counter_info {
{GpuCounter::GpuCycles, {"Number of GPU cycles", "cycles"}},
{GpuCounter::ComputeCycles, {"Number of compute cycles", "cycles"}},
{GpuCounter::VertexCycles, {"Number of vertex cycles", "cycles"}},
{GpuCounter::VertexComputeCycles, {"Number of vertex/compute cycles", "cycles"}},
{GpuCounter::FragmentCycles, {"Number of fragment cycles", "cycles"}},
{GpuCounter::TilerCycles, {"Number of tiler cycles", "cycles"}},
{GpuCounter::VertexComputeJobs, {"Number of vertex/compute jobs", "jobs"}},
{GpuCounter::Tiles, {"Number of physical tiles written", "tiles"}},
{GpuCounter::TransactionEliminations, {"Number of transaction eliminations", "tiles"}},
{GpuCounter::FragmentJobs, {"Number of fragment jobs", "jobs"}},
{GpuCounter::Pixels, {"Number of pixels shaded", "cycles"}},
{GpuCounter::ComputeJobs, {"Number of compute jobs", "jobs"}},
{GpuCounter::VertexJobs, {"Number of vertex jobs", "jobs"}},
{GpuCounter::VertexComputeJobs, {"Number of vertex/compute jobs", "jobs"}},
{GpuCounter::FragmentJobs, {"Number of fragment jobs", "jobs"}},
// Unit matches the description: this entry counts pixels, not cycles.
{GpuCounter::Pixels, {"Number of pixels shaded", "pixels"}},
{GpuCounter::EarlyZTests, {"Early-Z tests performed", "tests"}},
{GpuCounter::EarlyZKilled, {"Early-Z tests resulting in a kill", "tests"}},
{GpuCounter::LateZTests, {"Late-Z tests performed", "tests"}},
{GpuCounter::LateZKilled, {"Late-Z tests resulting in a kill", "tests"}},
{GpuCounter::CulledPrimitives, {"Number of culled primitives", "triangles"}},
{GpuCounter::VisiblePrimitives, {"Number of visible primitives", "triangles"}},
{GpuCounter::InputPrimitives, {"Number of input primitives", "triangles"}},
{GpuCounter::Instructions, {"Number of shader instructions", "instructions"}},
{GpuCounter::DivergedInstructions, {"Number of diverged shader instructions", "instructions"}},
{GpuCounter::Tiles, {"Number of physical tiles written", "tiles"}},
{GpuCounter::TransactionEliminations, {"Number of transaction eliminations", "tiles"}},
{GpuCounter::ShaderCycles, {"Shader total cycles", "cycles"}},
{GpuCounter::ShaderArithmeticCycles, {"Shader arithmetic cycles", "cycles"}},
{GpuCounter::ShaderLoadStoreCycles, {"Shader load/store cycles", "cycles"}},
{GpuCounter::ShaderTextureCycles, {"Shader texture cycles", "cycles"}},
{GpuCounter::EarlyZTests, {"Number of early-Z tests performed", "tests"}},
{GpuCounter::EarlyZKilled, {"Number of early-Z tests resulting in a kill", "tests"}},
{GpuCounter::LateZTests, {"Number of late-Z tests performed", "tests"}},
{GpuCounter::LateZKilled, {"Number of late-Z tests resulting in a kill", "tests"}},
{GpuCounter::CacheReadLookups, {"Cache read lookups", "lookups"}},
{GpuCounter::CacheWriteLookups, {"Cache write lookups", "lookups"}},
{GpuCounter::ExternalMemoryReadAccesses, {"Reads from external memory", "accesses"}},
{GpuCounter::ExternalMemoryWriteAccesses, {"Writes to external memory", "accesses"}},
{GpuCounter::ExternalMemoryReadStalls, {"Stalls when reading from external memory", "stalls"}},
{GpuCounter::ExternalMemoryWriteStalls, {"Stalls when writing to external memory", "stalls"}},
{GpuCounter::ExternalMemoryReadBytes, {"Bytes read to external memory", "B"}},
{GpuCounter::ExternalMemoryWriteBytes, {"Bytes written to external memory", "B"}},
{GpuCounter::Instructions, {"Number of shader instructions", "instructions"}},
{GpuCounter::DivergedInstructions, {"Number of diverged shader instructions", "instructions"}},
{GpuCounter::ShaderComputeCycles, {"Number of shader vertex/compute cycles", "cycles"}},
{GpuCounter::ShaderFragmentCycles, {"Number of shader fragment cycles", "cycles"}},
{GpuCounter::ShaderCycles, {"Number of shader core cycles", "cycles"}},
{GpuCounter::ShaderArithmeticCycles, {"Number of shader arithmetic cycles", "cycles"}},
{GpuCounter::ShaderInterpolatorCycles, {"Number of shader interpolator cycles", "cycles"}},
{GpuCounter::ShaderLoadStoreCycles, {"Number of shader load/store cycles", "cycles"}},
{GpuCounter::ShaderTextureCycles, {"Number of shader texture cycles", "cycles"}},
{GpuCounter::CacheReadLookups, {"Number of cache read lookups", "lookups"}},
{GpuCounter::CacheWriteLookups, {"Number of cache write lookups", "lookups"}},
{GpuCounter::ExternalMemoryReadAccesses, {"Number of reads from external memory", "accesses"}},
{GpuCounter::ExternalMemoryWriteAccesses, {"Number of writes to external memory", "accesses"}},
{GpuCounter::ExternalMemoryReadStalls, {"Number of stall cycles when reading from external memory", "cycles"}},
{GpuCounter::ExternalMemoryWriteStalls, {"Number of stall cycles when writing to external memory", "cycles"}},
// Reads come from external memory; writes go to it.
{GpuCounter::ExternalMemoryReadBytes, {"Number of bytes read from external memory", "bytes"}},
{GpuCounter::ExternalMemoryWriteBytes, {"Number of bytes written to external memory", "bytes"}},
};
typedef std::unordered_set<GpuCounter, GpuCounterHash> GpuCounterSet;
@@ -165,7 +204,7 @@ typedef std::unordered_map<GpuCounter, Value, GpuCounterHash> GpuMeasurements;
/** An interface for classes that collect GPU performance data. */
class GpuProfiler
{
public:
public:
virtual ~GpuProfiler() = default;
// Returns the enabled counters
@@ -26,12 +26,12 @@
#include "hwcpipe_log.h"
#ifdef __linux__
# include "vendor/arm/pmu/pmu_profiler.h"
# include "vendor/arm/mali/mali_profiler.h"
#include "vendor/arm/pmu/pmu_profiler.h"
#include "vendor/arm/mali/mali_profiler.h"
#endif
#ifndef HWCPIPE_NO_JSON
#include <json.hpp>
#include <json.hpp>
using json = nlohmann::json;
#endif
@@ -44,7 +44,7 @@ HWCPipe::HWCPipe(const char *json_string)
{
auto json = json::parse(json_string);
CpuCounterSet enabled_cpu_counters{};
CpuCounterSet enabled_cpu_counters {};
auto cpu = json.find("cpu");
if (cpu != json.end())
{
@@ -62,7 +62,7 @@ HWCPipe::HWCPipe(const char *json_string)
}
}
GpuCounterSet enabled_gpu_counters{};
GpuCounterSet enabled_gpu_counters {};
auto gpu = json.find("gpu");
if (gpu != json.end())
{
@@ -91,25 +91,29 @@ HWCPipe::HWCPipe(CpuCounterSet enabled_cpu_counters, GpuCounterSet enabled_gpu_c
HWCPipe::HWCPipe()
{
CpuCounterSet enabled_cpu_counters{CpuCounter::Cycles,
CpuCounter::Instructions,
CpuCounter::CacheReferences,
CpuCounter::CacheMisses,
CpuCounter::BranchInstructions,
CpuCounter::BranchMisses};
CpuCounterSet enabled_cpu_counters {
CpuCounter::Cycles,
CpuCounter::Instructions,
CpuCounter::CacheReferences,
CpuCounter::CacheMisses,
CpuCounter::BranchInstructions,
CpuCounter::BranchMisses,
};
GpuCounterSet enabled_gpu_counters{GpuCounter::GpuCycles,
GpuCounter::VertexComputeCycles,
GpuCounter::FragmentCycles,
GpuCounter::TilerCycles,
GpuCounter::CacheReadLookups,
GpuCounter::CacheWriteLookups,
GpuCounter::ExternalMemoryReadAccesses,
GpuCounter::ExternalMemoryWriteAccesses,
GpuCounter::ExternalMemoryReadStalls,
GpuCounter::ExternalMemoryWriteStalls,
GpuCounter::ExternalMemoryReadBytes,
GpuCounter::ExternalMemoryWriteBytes};
GpuCounterSet enabled_gpu_counters {
GpuCounter::GpuCycles,
GpuCounter::VertexComputeCycles,
GpuCounter::FragmentCycles,
GpuCounter::TilerCycles,
GpuCounter::CacheReadLookups,
GpuCounter::CacheWriteLookups,
GpuCounter::ExternalMemoryReadAccesses,
GpuCounter::ExternalMemoryWriteAccesses,
GpuCounter::ExternalMemoryReadStalls,
GpuCounter::ExternalMemoryWriteStalls,
GpuCounter::ExternalMemoryReadBytes,
GpuCounter::ExternalMemoryWriteBytes,
};
create_profilers(std::move(enabled_cpu_counters), std::move(enabled_gpu_counters));
}
@@ -174,7 +178,10 @@ void HWCPipe::create_profilers(CpuCounterSet enabled_cpu_counters, GpuCounterSet
#ifdef __linux__
try
{
cpu_profiler_ = std::unique_ptr<PmuProfiler>(new PmuProfiler(enabled_cpu_counters));
if (enabled_cpu_counters.size() != 0)
{
cpu_profiler_ = std::unique_ptr<PmuProfiler>(new PmuProfiler(enabled_cpu_counters));
}
}
catch (const std::runtime_error &e)
{
@@ -183,7 +190,10 @@ void HWCPipe::create_profilers(CpuCounterSet enabled_cpu_counters, GpuCounterSet
try
{
gpu_profiler_ = std::unique_ptr<MaliProfiler>(new MaliProfiler(enabled_gpu_counters));
if (enabled_gpu_counters.size() != 0)
{
gpu_profiler_ = std::unique_ptr<MaliProfiler>(new MaliProfiler(enabled_gpu_counters));
}
}
catch (const std::runtime_error &e)
{
@@ -37,14 +37,14 @@ namespace hwcpipe
{
// Holds one pointer to a CPU measurement map and one to a GPU measurement map.
// Both pointers are initialised to nullptr.
struct Measurements
{
const CpuMeasurements *cpu{nullptr};
const GpuMeasurements *gpu{nullptr};
const CpuMeasurements *cpu {nullptr};
const GpuMeasurements *gpu {nullptr};
};
/** A class that collects CPU/GPU performance data. */
class HWCPipe
{
public:
public:
#ifndef HWCPIPE_NO_JSON
// Initializes HWCPipe via a JSON configuration string
explicit HWCPipe(const char *json_string);
@@ -83,9 +83,9 @@ class HWCPipe
return gpu_profiler_.get();
}
private:
std::unique_ptr<CpuProfiler> cpu_profiler_{};
std::unique_ptr<GpuProfiler> gpu_profiler_{};
private:
std::unique_ptr<CpuProfiler> cpu_profiler_ {};
std::unique_ptr<GpuProfiler> gpu_profiler_ {};
void create_profilers(CpuCounterSet enabled_cpu_counters, GpuCounterSet enabled_gpu_counters);
};
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019 ARM Limited.
* Copyright (c) 2019-2022 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,11 +27,11 @@
#define HWCPIPE_TAG "HWCPipe"
#if defined(__ANDROID__)
# include <android/log.h>
#include <android/log.h>
# define HWCPIPE_LOG(...) //__android_log_print(ANDROID_LOG_VERBOSE, HWCPIPE_TAG, __VA_ARGS__)
#define HWCPIPE_LOG(...) __android_log_print(ANDROID_LOG_VERBOSE, HWCPIPE_TAG, __VA_ARGS__)
#else
# define HWCPIPE_LOG(...) \
#define HWCPIPE_LOG(...) \
{ \
fprintf(stdout, "%s [INFO] : ", HWCPIPE_TAG); \
fprintf(stdout, __VA_ARGS__); \
@@ -28,19 +28,19 @@ namespace hwcpipe
{
class Value
{
public:
public:
Value() :
is_int_(true),
int_(0),
double_(0.0f)
is_int_(true),
int_(0),
double_(0.0f)
{}
Value(long long value) :
is_int_(true),
int_(value)
is_int_(true),
int_(value)
{}
Value(double value) :
is_int_(false),
double_(value)
is_int_(false),
double_(value)
{}
template <typename T>
@@ -61,9 +61,9 @@ class Value
is_int_ = false;
}
private:
private:
bool is_int_;
long long int_{0};
double double_{0.0};
long long int_ {0};
double double_ {0.0};
};
} // namespace hwcpipe
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2019 ARM Limited.
* Copyright (c) 2017-2022 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,14 @@
#include "hwc_names.hpp"
#ifndef DOXYGEN_SKIP_THIS
# if defined(ANDROID) || defined(__ANDROID__)
/* We use _IOR_BAD/_IOW_BAD rather than _IOR/_IOW otherwise fails to compile with NDK-BUILD because of _IOC_TYPECHECK is defined, not because the paramter is invalid */
# define MALI_IOR(a, b, c) _IOR_BAD(a, b, c)
# define MALI_IOW(a, b, c) _IOW_BAD(a, b, c)
# else
# define MALI_IOR(a, b, c) _IOR(a, b, c)
# define MALI_IOW(a, b, c) _IOW(a, b, c)
# endif
#if defined(ANDROID) || defined(__ANDROID__)
/* We use _IOR_BAD/_IOW_BAD rather than _IOR/_IOW; otherwise this fails to compile with NDK-BUILD because _IOC_TYPECHECK is defined, not because the parameter is invalid */
#define MALI_IOR(a, b, c) _IOR_BAD(a, b, c)
#define MALI_IOW(a, b, c) _IOW_BAD(a, b, c)
#else
#define MALI_IOR(a, b, c) _IOR(a, b, c)
#define MALI_IOW(a, b, c) _IOW(a, b, c)
#endif
namespace mali_userspace
{
@@ -57,8 +55,8 @@ union uk_header
uint64_t sizer;
};
# define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3
# define BASE_MAX_COHERENT_GROUPS 16
#define BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS 3
#define BASE_MAX_COHERENT_GROUPS 16
struct mali_base_gpu_core_props
{
@@ -117,7 +115,7 @@ struct mali_base_gpu_coherent_group_info
mali_base_gpu_coherent_group group[BASE_MAX_COHERENT_GROUPS];
};
# define GPU_MAX_JOB_SLOTS 16
#define GPU_MAX_JOB_SLOTS 16
struct gpu_raw_gpu_props
{
uint64_t shader_present;
@@ -164,35 +162,35 @@ struct kbase_uk_gpuprops
mali_base_gpu_props props;
};
# define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0)
# define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1)
# define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2)
# define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3)
#define KBASE_GPUPROP_VALUE_SIZE_U8 (0x0)
#define KBASE_GPUPROP_VALUE_SIZE_U16 (0x1)
#define KBASE_GPUPROP_VALUE_SIZE_U32 (0x2)
#define KBASE_GPUPROP_VALUE_SIZE_U64 (0x3)
# define KBASE_GPUPROP_PRODUCT_ID 1
# define KBASE_GPUPROP_MINOR_REVISION 3
# define KBASE_GPUPROP_MAJOR_REVISION 4
#define KBASE_GPUPROP_PRODUCT_ID 1
#define KBASE_GPUPROP_MINOR_REVISION 3
#define KBASE_GPUPROP_MAJOR_REVISION 4
# define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61
# define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62
# define KBASE_GPUPROP_COHERENCY_GROUP_0 64
# define KBASE_GPUPROP_COHERENCY_GROUP_1 65
# define KBASE_GPUPROP_COHERENCY_GROUP_2 66
# define KBASE_GPUPROP_COHERENCY_GROUP_3 67
# define KBASE_GPUPROP_COHERENCY_GROUP_4 68
# define KBASE_GPUPROP_COHERENCY_GROUP_5 69
# define KBASE_GPUPROP_COHERENCY_GROUP_6 70
# define KBASE_GPUPROP_COHERENCY_GROUP_7 71
# define KBASE_GPUPROP_COHERENCY_GROUP_8 72
# define KBASE_GPUPROP_COHERENCY_GROUP_9 73
# define KBASE_GPUPROP_COHERENCY_GROUP_10 74
# define KBASE_GPUPROP_COHERENCY_GROUP_11 75
# define KBASE_GPUPROP_COHERENCY_GROUP_12 76
# define KBASE_GPUPROP_COHERENCY_GROUP_13 77
# define KBASE_GPUPROP_COHERENCY_GROUP_14 78
# define KBASE_GPUPROP_COHERENCY_GROUP_15 79
#define KBASE_GPUPROP_COHERENCY_NUM_GROUPS 61
#define KBASE_GPUPROP_COHERENCY_NUM_CORE_GROUPS 62
#define KBASE_GPUPROP_COHERENCY_GROUP_0 64
#define KBASE_GPUPROP_COHERENCY_GROUP_1 65
#define KBASE_GPUPROP_COHERENCY_GROUP_2 66
#define KBASE_GPUPROP_COHERENCY_GROUP_3 67
#define KBASE_GPUPROP_COHERENCY_GROUP_4 68
#define KBASE_GPUPROP_COHERENCY_GROUP_5 69
#define KBASE_GPUPROP_COHERENCY_GROUP_6 70
#define KBASE_GPUPROP_COHERENCY_GROUP_7 71
#define KBASE_GPUPROP_COHERENCY_GROUP_8 72
#define KBASE_GPUPROP_COHERENCY_GROUP_9 73
#define KBASE_GPUPROP_COHERENCY_GROUP_10 74
#define KBASE_GPUPROP_COHERENCY_GROUP_11 75
#define KBASE_GPUPROP_COHERENCY_GROUP_12 76
#define KBASE_GPUPROP_COHERENCY_GROUP_13 77
#define KBASE_GPUPROP_COHERENCY_GROUP_14 78
#define KBASE_GPUPROP_COHERENCY_GROUP_15 79
# define KBASE_GPUPROP_L2_NUM_L2_SLICES 15
#define KBASE_GPUPROP_L2_NUM_L2_SLICES 15
struct gpu_props
{
@@ -212,36 +210,36 @@ static const struct
size_t offset;
int size;
} gpu_property_mapping[] = {
# define PROP(name, member) \
{ \
KBASE_GPUPROP_##name, offsetof(struct gpu_props, member), \
sizeof(((struct gpu_props *) 0)->member) \
}
PROP(PRODUCT_ID, product_id),
PROP(MINOR_REVISION, minor_revision),
PROP(MAJOR_REVISION, major_revision),
PROP(COHERENCY_NUM_GROUPS, num_groups),
PROP(COHERENCY_NUM_CORE_GROUPS, num_core_groups),
PROP(COHERENCY_GROUP_0, core_mask[0]),
PROP(COHERENCY_GROUP_1, core_mask[1]),
PROP(COHERENCY_GROUP_2, core_mask[2]),
PROP(COHERENCY_GROUP_3, core_mask[3]),
PROP(COHERENCY_GROUP_4, core_mask[4]),
PROP(COHERENCY_GROUP_5, core_mask[5]),
PROP(COHERENCY_GROUP_6, core_mask[6]),
PROP(COHERENCY_GROUP_7, core_mask[7]),
PROP(COHERENCY_GROUP_8, core_mask[8]),
PROP(COHERENCY_GROUP_9, core_mask[9]),
PROP(COHERENCY_GROUP_10, core_mask[10]),
PROP(COHERENCY_GROUP_11, core_mask[11]),
PROP(COHERENCY_GROUP_12, core_mask[12]),
PROP(COHERENCY_GROUP_13, core_mask[13]),
PROP(COHERENCY_GROUP_14, core_mask[14]),
PROP(COHERENCY_GROUP_15, core_mask[15]),
#define PROP(name, member) \
{ \
KBASE_GPUPROP_##name, offsetof(struct gpu_props, member), \
sizeof(((struct gpu_props *) 0)->member) \
}
PROP(PRODUCT_ID, product_id),
PROP(MINOR_REVISION, minor_revision),
PROP(MAJOR_REVISION, major_revision),
PROP(COHERENCY_NUM_GROUPS, num_groups),
PROP(COHERENCY_NUM_CORE_GROUPS, num_core_groups),
PROP(COHERENCY_GROUP_0, core_mask[0]),
PROP(COHERENCY_GROUP_1, core_mask[1]),
PROP(COHERENCY_GROUP_2, core_mask[2]),
PROP(COHERENCY_GROUP_3, core_mask[3]),
PROP(COHERENCY_GROUP_4, core_mask[4]),
PROP(COHERENCY_GROUP_5, core_mask[5]),
PROP(COHERENCY_GROUP_6, core_mask[6]),
PROP(COHERENCY_GROUP_7, core_mask[7]),
PROP(COHERENCY_GROUP_8, core_mask[8]),
PROP(COHERENCY_GROUP_9, core_mask[9]),
PROP(COHERENCY_GROUP_10, core_mask[10]),
PROP(COHERENCY_GROUP_11, core_mask[11]),
PROP(COHERENCY_GROUP_12, core_mask[12]),
PROP(COHERENCY_GROUP_13, core_mask[13]),
PROP(COHERENCY_GROUP_14, core_mask[14]),
PROP(COHERENCY_GROUP_15, core_mask[15]),
PROP(L2_NUM_L2_SLICES, l2_slices),
# undef PROP
{0, 0, 0}};
PROP(L2_NUM_L2_SLICES, l2_slices),
#undef PROP
{0, 0, 0}};
struct kbase_hwcnt_reader_metadata
{
@@ -307,11 +305,12 @@ struct kbase_ioctl_hwcnt_reader_setup
uint32_t mmu_l2_bm;
};
# define KBASE_IOCTL_TYPE 0x80
# define KBASE_IOCTL_GET_GPUPROPS MALI_IOW(KBASE_IOCTL_TYPE, 3, struct mali_userspace::kbase_ioctl_get_gpuprops)
# define KBASE_IOCTL_VERSION_CHECK _IOWR(KBASE_IOCTL_TYPE, 0, struct mali_userspace::kbase_ioctl_version_check)
# define KBASE_IOCTL_SET_FLAGS _IOW(KBASE_IOCTL_TYPE, 1, struct mali_userspace::kbase_ioctl_set_flags)
# define KBASE_IOCTL_HWCNT_READER_SETUP _IOW(KBASE_IOCTL_TYPE, 8, struct mali_userspace::kbase_ioctl_hwcnt_reader_setup)
#define KBASE_IOCTL_TYPE 0x80
#define KBASE_IOCTL_GET_GPUPROPS MALI_IOW(KBASE_IOCTL_TYPE, 3, struct mali_userspace::kbase_ioctl_get_gpuprops)
#define KBASE_IOCTL_VERSION_CHECK_JM _IOWR(KBASE_IOCTL_TYPE, 0, struct mali_userspace::kbase_ioctl_version_check)
#define KBASE_IOCTL_VERSION_CHECK_CSF _IOWR(KBASE_IOCTL_TYPE, 52, struct mali_userspace::kbase_ioctl_version_check)
#define KBASE_IOCTL_SET_FLAGS _IOW(KBASE_IOCTL_TYPE, 1, struct mali_userspace::kbase_ioctl_set_flags)
#define KBASE_IOCTL_HWCNT_READER_SETUP _IOW(KBASE_IOCTL_TYPE, 8, struct mali_userspace::kbase_ioctl_hwcnt_reader_setup)
/** IOCTL parameters to set flags */
struct kbase_uk_hwcnt_reader_set_flags
@@ -350,7 +349,9 @@ struct uku_version_check_args
enum
{
UKP_FUNC_ID_CHECK_VERSION = 0,
UKP_FUNC_ID_CHECK_VERSION_JM = 0,
UKP_FUNC_ID_CHECK_VERSION_CSF = 52,
/* Related to mali0 ioctl interface */
LINUX_UK_BASE_MAGIC = 0x80,
BASE_CONTEXT_CREATE_KERNEL_FLAGS = 0x2,
@@ -408,6 +409,5 @@ static inline int mali_ioctl(int fd, T &arg)
return 0;
}
} // namespace mali_userspace
#endif /* DOXYGEN_SKIP_THIS */
} // namespace mali_userspace
File diff suppressed because it is too large Load Diff
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2019 ARM Limited.
* Copyright (c) 2017-2022 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "hwcpipe_log.h"
#include <algorithm>
#include <stdexcept>
using mali_userspace::MALI_NAME_BLOCK_JM;
using mali_userspace::MALI_NAME_BLOCK_MMU;
@@ -56,18 +57,38 @@ MaliHWInfo get_mali_hw_info(const char *path)
}
{
// Try matching Job Manager version IOCTL
bool checked_version = true;
mali_userspace::kbase_uk_hwcnt_reader_version_check_args version_check_args;
version_check_args.header.id = mali_userspace::UKP_FUNC_ID_CHECK_VERSION; // NOLINT
version_check_args.header.id = mali_userspace::UKP_FUNC_ID_CHECK_VERSION_JM;
version_check_args.major = 10;
version_check_args.minor = 2;
if (mali_userspace::mali_ioctl(fd, version_check_args) != 0)
{
mali_userspace::kbase_ioctl_version_check _version_check_args = {0, 0};
if (ioctl(fd, KBASE_IOCTL_VERSION_CHECK, &_version_check_args) < 0)
if (ioctl(fd, KBASE_IOCTL_VERSION_CHECK_JM, &_version_check_args) < 0)
{
close(fd);
throw std::runtime_error("Failed to check version.");
checked_version = false;
}
}
// Try matching CSF version IOCTL
if (!checked_version)
{
mali_userspace::kbase_uk_hwcnt_reader_version_check_args version_check_args;
version_check_args.header.id = mali_userspace::UKP_FUNC_ID_CHECK_VERSION_CSF;
version_check_args.major = 1;
version_check_args.minor = 4;
if (mali_userspace::mali_ioctl(fd, version_check_args) != 0)
{
mali_userspace::kbase_ioctl_version_check _version_check_args = {0, 0};
if (ioctl(fd, KBASE_IOCTL_VERSION_CHECK_CSF, &_version_check_args) < 0)
{
close(fd);
throw std::runtime_error("Failed to check version.");
}
}
}
}
@@ -216,70 +237,176 @@ MaliHWInfo get_mali_hw_info(const char *path)
typedef std::function<uint64_t(void)> MaliValueGetter;
MaliProfiler::MaliProfiler(const GpuCounterSet &enabled_counters) :
enabled_counters_(enabled_counters)
enabled_counters_(enabled_counters)
{
// Throws if setup fails
init();
const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> valhall_csf_mappings = {
{GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
{GpuCounter::ComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_COMP_ACTIVE"); }},
{GpuCounter::VertexCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_TILER_ACTIVE"); }},
{GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_FRAGMENT_ACTIVE"); }},
{GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},
{GpuCounter::ComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_COMP_JOB_COMPLETED"); }},
{GpuCounter::VertexJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_TILER_JOB_COMPLETED"); }},
{GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_FRAG_JOB_COMPLETED"); }},
{GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "ITER_FRAG_TASK_COMPLETED") * 1024; }},
{GpuCounter::CulledPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CULLED") + get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CLIPPED") + get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_SAT_CULLED"); }},
{GpuCounter::VisiblePrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_VISIBLE"); }},
{GpuCounter::InputPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TRIANGLES") + get_counter_value(MALI_NAME_BLOCK_TILER, "LINES") + get_counter_value(MALI_NAME_BLOCK_TILER, "POINTS"); }},
{GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
{GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
{GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
{GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
{GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
{GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},
{GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_MSG"); }},
{GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},
{GpuCounter::ShaderComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "COMPUTE_ACTIVE"); }},
{GpuCounter::ShaderFragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_ACTIVE"); }},
{GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
// The three units run in parallel so we can approximate cycles by taking the largest value. SFU instructions use 4 cycles per warp.
{GpuCounter::ShaderArithmeticCycles, [this] { return std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA"), std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT"), 4 * get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU"))); }},
{GpuCounter::ShaderInterpolatorCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_16") + get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_32"); }},
{GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
{GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},
{GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
{GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
{GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
{GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
{GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
{GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
{GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
{GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
};
const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> valhall_mappings = {
{GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
{GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
{GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
{GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},
{GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
{GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
{GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
{GpuCounter::CulledPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CULLED") + get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CLIPPED") + get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_SAT_CULLED"); }},
{GpuCounter::VisiblePrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_VISIBLE"); }},
{GpuCounter::InputPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TRIANGLES") + get_counter_value(MALI_NAME_BLOCK_TILER, "LINES") + get_counter_value(MALI_NAME_BLOCK_TILER, "POINTS"); }},
{GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
{GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
{GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
{GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
{GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
{GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},
{GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU") + get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_MSG"); }},
{GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},
{GpuCounter::ShaderComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "COMPUTE_ACTIVE"); }},
{GpuCounter::ShaderFragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_ACTIVE"); }},
{GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
// The three units run in parallel so we can approximate cycles by taking the largest value. SFU instructions use 4 cycles per warp.
{GpuCounter::ShaderArithmeticCycles, [this] { return std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_FMA"), std::max(get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_CVT"), 4 * get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_SFU"))); }},
{GpuCounter::ShaderInterpolatorCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_16") + get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_32"); }},
{GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") + get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC"); }},
{GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},
{GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
{GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
{GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
{GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
{GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
{GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
{GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
{GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
};
// Getter table assigned to mappings_ for the product IDs handled by the
// "bifrost" case of the product switch. Each entry pairs a GpuCounter with a
// callable that reads one or more named hardware counters through
// get_counter_value().
//
// Every key is listed exactly once. An unordered_map keeps a single element per
// key, so a repeated key in the initializer list is never stored and only hides
// which getter is actually in effect.
const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> bifrost_mappings = {
    // Job-manager and tiler activity.
    {GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
    {GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
    {GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
    {GpuCounter::TilerCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TILER_ACTIVE"); }},
    {GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
    {GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
    // The task count is scaled by 1024 (presumably pixels per task - TODO confirm).
    {GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},

    // Fragment front-end statistics from the shader block.
    {GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
    {GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
    {GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
    {GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILL"); }},
    {GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_TEST"); }},
    {GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_LZS_KILL"); }},

    // Primitive statistics from the tiler block.
    {GpuCounter::CulledPrimitives,
     [this] {
         return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CULLED") +
                get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CLIPPED") +
                get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_SAT_CULLED");
     }},
    {GpuCounter::VisiblePrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_VISIBLE"); }},
    {GpuCounter::InputPrimitives,
     [this] {
         return get_counter_value(MALI_NAME_BLOCK_TILER, "TRIANGLES") +
                get_counter_value(MALI_NAME_BLOCK_TILER, "LINES") +
                get_counter_value(MALI_NAME_BLOCK_TILER, "POINTS");
     }},

    // Shader-core instruction and cycle counters.
    {GpuCounter::Instructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
    {GpuCounter::DivergedInstructions, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_DIVERGED"); }},
    {GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_CORE_ACTIVE"); }},
    // Reads the same hardware counter as GpuCounter::Instructions.
    {GpuCounter::ShaderArithmeticCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "EXEC_INSTR_COUNT"); }},
    {GpuCounter::ShaderLoadStoreCycles,
     [this] {
         return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_FULL") +
                get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_FULL") +
                get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_READ_SHORT") +
                get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_WRITE_SHORT") +
                get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_MEM_ATOMIC");
     }},
    {GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_FILT_NUM_OPERATIONS"); }},

    // L2 cache and external memory traffic.
    {GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
    {GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
    {GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
    {GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
    {GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
    {GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
    // Beat counts are scaled by 16 (presumably bytes per bus beat - TODO confirm).
    {GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
    {GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},

    // Per-workload shader activity and varying interpolation.
    {GpuCounter::ShaderComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "COMPUTE_ACTIVE"); }},
    {GpuCounter::ShaderFragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_ACTIVE"); }},
    {GpuCounter::ShaderInterpolatorCycles,
     [this] {
         return get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_16") +
                get_counter_value(MALI_NAME_BLOCK_SHADER, "VARY_SLOT_32");
     }},
};
const std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> midgard_mappings = {
{GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
{GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
{GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
{GpuCounter::GpuCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "GPU_ACTIVE"); }},
{GpuCounter::VertexComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_ACTIVE"); }},
{GpuCounter::FragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_ACTIVE"); }},
{GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
{GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
{GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
{GpuCounter::VertexComputeJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS1_JOBS"); }},
{GpuCounter::FragmentJobs, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_JOBS"); }},
{GpuCounter::Pixels, [this] { return get_counter_value(MALI_NAME_BLOCK_JM, "JS0_TASKS") * 1024; }},
{GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
{GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
{GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
{GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILLED"); }},
{GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_TEST"); }},
{GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_KILLED"); }},
{GpuCounter::CulledPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CULLED") + get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_CLIPPED"); }},
{GpuCounter::VisiblePrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "PRIM_VISIBLE"); }},
{GpuCounter::InputPrimitives, [this] { return get_counter_value(MALI_NAME_BLOCK_TILER, "TRIANGLES") + get_counter_value(MALI_NAME_BLOCK_TILER, "LINES") + get_counter_value(MALI_NAME_BLOCK_TILER, "POINTS"); }},
{GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
{GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
{GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
{GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
{GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
{GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
{GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
{GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
{GpuCounter::Tiles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_PTILES"); }},
{GpuCounter::TransactionEliminations, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_TRANS_ELIM"); }},
{GpuCounter::EarlyZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_TEST"); }},
{GpuCounter::EarlyZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_QUADS_EZS_KILLED"); }},
{GpuCounter::LateZTests, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_TEST"); }},
{GpuCounter::LateZKilled, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_THREADS_LZS_KILLED"); }},
{GpuCounter::ShaderComputeCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "COMPUTE_ACTIVE"); }},
{GpuCounter::ShaderFragmentCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "FRAG_ACTIVE"); }},
{GpuCounter::ShaderCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TRIPIPE_ACTIVE"); }},
{GpuCounter::ShaderArithmeticCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "ARITH_WORDS"); }},
{GpuCounter::ShaderLoadStoreCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "LS_ISSUES"); }},
{GpuCounter::ShaderTextureCycles, [this] { return get_counter_value(MALI_NAME_BLOCK_SHADER, "TEX_ISSUES"); }},
{GpuCounter::CacheReadLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_READ_LOOKUP"); }},
{GpuCounter::CacheWriteLookups, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_WRITE_LOOKUP"); }},
{GpuCounter::ExternalMemoryReadAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ"); }},
{GpuCounter::ExternalMemoryWriteAccesses, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE"); }},
{GpuCounter::ExternalMemoryReadStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_AR_STALL"); }},
{GpuCounter::ExternalMemoryWriteStalls, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_W_STALL"); }},
{GpuCounter::ExternalMemoryReadBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_READ_BEATS") * 16; }},
{GpuCounter::ExternalMemoryWriteBytes, [this] { return get_counter_value(MALI_NAME_BLOCK_MMU, "L2_EXT_WRITE_BEATS") * 16; }},
};
auto product = std::find_if(std::begin(mali_userspace::products), std::end(mali_userspace::products), [&](const mali_userspace::CounterMapping &cm) {
@@ -310,9 +437,25 @@ MaliProfiler::MaliProfiler(const GpuCounterSet &enabled_counters) :
break;
case mali_userspace::PRODUCT_ID_TSIX:
case mali_userspace::PRODUCT_ID_TNOX:
default:
case mali_userspace::PRODUCT_ID_TGOX:
case mali_userspace::PRODUCT_ID_TDVX:
mappings_ = bifrost_mappings;
break;
case mali_userspace::PRODUCT_ID_TNAXa:
case mali_userspace::PRODUCT_ID_TNAXb:
case mali_userspace::PRODUCT_ID_TTRX:
case mali_userspace::PRODUCT_ID_TOTX:
case mali_userspace::PRODUCT_ID_TBOX:
case mali_userspace::PRODUCT_ID_TBOXAE:
mappings_ = valhall_mappings;
break;
case mali_userspace::PRODUCT_ID_TODX:
case mali_userspace::PRODUCT_ID_TVIX:
case mali_userspace::PRODUCT_ID_TGRX:
case mali_userspace::PRODUCT_ID_TVAX:
default:
mappings_ = valhall_csf_mappings;
break;
}
}
else
@@ -336,24 +479,44 @@ void MaliProfiler::init()
throw std::runtime_error("Failed to open /dev/mali0.");
}
// Set API version
{
mali_userspace::kbase_uk_hwcnt_reader_version_check_args check; // NOLINT
memset(&check, 0, sizeof(check));
// Try matching Job Manager version IOCTL
bool checked_version = true;
mali_userspace::kbase_uk_hwcnt_reader_version_check_args version_check_args;
version_check_args.header.id = mali_userspace::UKP_FUNC_ID_CHECK_VERSION_JM;
version_check_args.major = 10;
version_check_args.minor = 2;
if (mali_userspace::mali_ioctl(fd_, check) != 0)
if (mali_userspace::mali_ioctl(fd_, version_check_args) != 0)
{
mali_userspace::kbase_ioctl_version_check _check = {0, 0};
if (ioctl(fd_, KBASE_IOCTL_VERSION_CHECK, &_check) < 0)
mali_userspace::kbase_ioctl_version_check _version_check_args = {0, 0};
if (ioctl(fd_, KBASE_IOCTL_VERSION_CHECK_JM, &_version_check_args) < 0)
{
throw std::runtime_error("Failed to get ABI version.");
checked_version = false;
}
}
else if (check.major < 10)
// Try matching CSF version IOCTL
if (!checked_version)
{
throw std::runtime_error("Unsupported ABI version 10.");
mali_userspace::kbase_uk_hwcnt_reader_version_check_args version_check_args;
version_check_args.header.id = mali_userspace::UKP_FUNC_ID_CHECK_VERSION_CSF;
version_check_args.major = 1;
version_check_args.minor = 4;
if (mali_userspace::mali_ioctl(fd_, version_check_args) != 0)
{
mali_userspace::kbase_ioctl_version_check _version_check_args = {0, 0};
if (ioctl(fd_, KBASE_IOCTL_VERSION_CHECK_CSF, &_version_check_args) < 0)
{
close(fd_);
throw std::runtime_error("Failed to check version.");
}
}
}
}
{
mali_userspace::kbase_uk_hwcnt_reader_set_flags flags; // NOLINT
memset(&flags, 0, sizeof(flags));
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019 ARM Limited.
* Copyright (c) 2019-2022 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@ namespace hwcpipe
/** A Gpu profiler that uses Mali counter data. */
class MaliProfiler : public GpuProfiler
{
public:
public:
explicit MaliProfiler(const GpuCounterSet &enabled_counters);
virtual ~MaliProfiler() = default;
@@ -59,59 +59,74 @@ class MaliProfiler : public GpuProfiler
virtual const GpuMeasurements &sample() override;
virtual void stop() override;
private:
GpuCounterSet enabled_counters_{};
private:
GpuCounterSet enabled_counters_ {};
const GpuCounterSet supported_counters_{
GpuCounter::GpuCycles,
GpuCounter::VertexComputeCycles,
GpuCounter::FragmentCycles,
GpuCounter::TilerCycles,
GpuCounter::VertexComputeJobs,
GpuCounter::Tiles,
GpuCounter::TransactionEliminations,
GpuCounter::FragmentJobs,
GpuCounter::Pixels,
GpuCounter::EarlyZTests,
GpuCounter::EarlyZKilled,
GpuCounter::LateZTests,
GpuCounter::LateZKilled,
GpuCounter::Instructions,
GpuCounter::DivergedInstructions,
GpuCounter::ShaderCycles,
GpuCounter::ShaderArithmeticCycles,
GpuCounter::ShaderLoadStoreCycles,
GpuCounter::ShaderTextureCycles,
GpuCounter::CacheReadLookups,
GpuCounter::CacheWriteLookups,
GpuCounter::ExternalMemoryReadAccesses,
GpuCounter::ExternalMemoryWriteAccesses,
GpuCounter::ExternalMemoryReadStalls,
GpuCounter::ExternalMemoryWriteStalls,
GpuCounter::ExternalMemoryReadBytes,
GpuCounter::ExternalMemoryWriteBytes,
const GpuCounterSet supported_counters_ {
GpuCounter::GpuCycles,
GpuCounter::VertexCycles,
GpuCounter::ComputeCycles,
GpuCounter::VertexComputeCycles,
GpuCounter::FragmentCycles,
GpuCounter::TilerCycles,
GpuCounter::VertexJobs,
GpuCounter::ComputeJobs,
GpuCounter::VertexComputeJobs,
GpuCounter::FragmentJobs,
GpuCounter::Pixels,
GpuCounter::CulledPrimitives,
GpuCounter::VisiblePrimitives,
GpuCounter::InputPrimitives,
GpuCounter::Tiles,
GpuCounter::TransactionEliminations,
GpuCounter::EarlyZTests,
GpuCounter::EarlyZKilled,
GpuCounter::LateZTests,
GpuCounter::LateZKilled,
GpuCounter::Instructions,
GpuCounter::DivergedInstructions,
GpuCounter::ShaderFragmentCycles,
GpuCounter::ShaderComputeCycles,
GpuCounter::ShaderCycles,
GpuCounter::ShaderArithmeticCycles,
GpuCounter::ShaderInterpolatorCycles,
GpuCounter::ShaderLoadStoreCycles,
GpuCounter::ShaderTextureCycles,
GpuCounter::CacheReadLookups,
GpuCounter::CacheWriteLookups,
GpuCounter::ExternalMemoryReadAccesses,
GpuCounter::ExternalMemoryWriteAccesses,
GpuCounter::ExternalMemoryReadStalls,
GpuCounter::ExternalMemoryWriteStalls,
GpuCounter::ExternalMemoryReadBytes,
GpuCounter::ExternalMemoryWriteBytes,
};
typedef std::function<double(void)> MaliValueGetter;
std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> mappings_{};
std::unordered_map<GpuCounter, MaliValueGetter, GpuCounterHash> mappings_ {};
const char *const device_{"/dev/mali0"};
int num_cores_{0};
int num_l2_slices_{0};
int gpu_id_{0};
uint32_t hw_ver_{0};
int buffer_count_{16};
size_t buffer_size_{0};
uint8_t * sample_data_{nullptr};
uint64_t timestamp_{0};
const char *const *names_lut_{
nullptr};
std::vector<uint32_t> raw_counter_buffer_{};
std::vector<unsigned int> core_index_remap_{};
int fd_{-1};
int hwc_fd_{-1};
const char *const device_ {"/dev/mali0"};
int num_cores_ {0};
int num_l2_slices_ {0};
int gpu_id_ {0};
uint32_t hw_ver_ {0};
int buffer_count_ {16};
size_t buffer_size_ {0};
uint8_t * sample_data_ {nullptr};
uint64_t timestamp_ {0};
const char *const * names_lut_ {nullptr};
std::vector<uint32_t> raw_counter_buffer_ {};
std::vector<unsigned int> core_index_remap_ {};
int fd_ {-1};
int hwc_fd_ {-1};
GpuMeasurements measurements_{};
GpuMeasurements measurements_ {};
void init();
void sample_counters();
@@ -26,26 +26,13 @@
#include <asm/unistd.h>
#include <cstring>
#include <linux/version.h>
#include <stdexcept>
#include <sys/ioctl.h>
/* Add std_to_string implementation as it is possible that Android does not provide it */
#include <string>
#include <sstream>
/** Format a value as text by streaming it into a string stream.
 *
 * @param[in] value Any value that can be inserted into a std::ostream.
 *
 * @return The characters produced by `operator<<` for @p value.
 */
template <typename T>
std::string std_to_string(T value)
{
    std::ostringstream formatted;
    formatted << value;
    return formatted.str();
}
PmuCounter::PmuCounter() :
_perf_config()
_perf_config()
{
_perf_config.type = PERF_TYPE_HARDWARE;
_perf_config.size = sizeof(perf_event_attr);
// Start disabled
@@ -57,8 +44,8 @@ PmuCounter::PmuCounter() :
_perf_config.inherit_stat = 1;
}
/** Create a PMU counter from a bare config value.
 *
 * Applies the default configuration first and then opens the counter, so the
 * event type set by the default constructor is the one used.
 *
 * @param[in] config Counter identifier.
 */
PmuCounter::PmuCounter(uint64_t config) :
    PmuCounter()
{
    open(config);
}

/** Create a PMU counter from an explicit event type and event number.
 *
 * Applies the default configuration first and then opens the counter described
 * by @p config.
 *
 * @param[in] config Counter info.
 */
PmuCounter::PmuCounter(PmuEventInfo config) :
    PmuCounter()
{
    open(config);
}
@@ -68,9 +55,10 @@ PmuCounter::~PmuCounter()
close();
}
void PmuCounter::open(uint64_t config)
void PmuCounter::open(PmuEventInfo config)
{
_perf_config.config = config;
_perf_config.config = config.event;
_perf_config.type = config.type;
open(_perf_config);
}
@@ -133,12 +121,16 @@ std::string PmuCounter::config_to_str(const perf_event_attr &perf_config)
return "PERF_COUNT_HW_BRANCH_MISSES";
case PERF_COUNT_HW_BUS_CYCLES:
return "PERF_COUNT_HW_BUS_CYCLES";
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND:
return "PERF_COUNT_HW_STALLED_CYCLES_FRONTEND";
case PERF_COUNT_HW_STALLED_CYCLES_BACKEND:
return "PERF_COUNT_HW_STALLED_CYCLES_BACKEND";
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
case PERF_COUNT_HW_REF_CPU_CYCLES:
return "PERF_COUNT_HW_REF_CPU_CYCLES";
#endif
default:
return "UNKNOWN HARDWARE COUNTER";
}
@@ -160,16 +152,50 @@ std::string PmuCounter::config_to_str(const perf_event_attr &perf_config)
return "PERF_COUNT_SW_PAGE_FAULTS_MIN";
case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
return "PERF_COUNT_SW_PAGE_FAULTS_MAJ";
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 33)
case PERF_COUNT_SW_ALIGNMENT_FAULTS:
return "PERF_COUNT_SW_ALIGNMENT_FAULTS";
case PERF_COUNT_SW_EMULATION_FAULTS:
return "PERF_COUNT_SW_EMULATION_FAULTS";
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 12, 0)
case PERF_COUNT_SW_DUMMY:
return "PERF_COUNT_SW_DUMMY";
#endif
default:
return "UNKNOWN SOFTWARE COUNTER";
}
case PERF_TYPE_RAW:
switch (static_cast<PmuImplDefined>(perf_config.config))
{
case PmuImplDefined::L1_ACCESSES:
return "L1_ACCESSES";
case PmuImplDefined::INSTR_RETIRED:
return "INSTR_RETIRED";
case PmuImplDefined::L2_ACCESSES:
return "L2_ACCESSES";
case PmuImplDefined::L3_ACCESSES:
return "L3_ACCESSES";
case PmuImplDefined::BUS_READS:
return "BUS_READS";
case PmuImplDefined::BUS_WRITES:
return "BUS_WRITES";
case PmuImplDefined::MEM_READS:
return "MEM_READS";
case PmuImplDefined::MEM_WRITES:
return "MEM_WRITES";
case PmuImplDefined::ASE_SPEC:
return "ASE_SPEC";
case PmuImplDefined::VFP_SPEC:
return "VFP_SPEC";
case PmuImplDefined::CRYPTO_SPEC:
return "CRYPTO_SPEC";
default:
return "UNKNOWN RAW COUNTER";
}
default:
return std_to_string(perf_config.config);
return std::to_string(perf_config.config);
}
}
@@ -35,41 +35,71 @@
#include "hwcpipe_log.h"
/** Event numbers for the counters opened with PERF_TYPE_RAW.
 *
 * NOTE(review): the values look like Arm PMU event numbers - confirm against
 * the Arm Architecture Reference Manual before adding new entries.
 */
enum class PmuImplDefined : uint64_t
{
    L1_ACCESSES   = 0x04,
    INSTR_RETIRED = 0x08,
    L2_ACCESSES   = 0x16,
    L3_ACCESSES   = 0x2B,
    BUS_READS     = 0x60,
    BUS_WRITES    = 0x61,
    MEM_READS     = 0x66,
    MEM_WRITES    = 0x67,
    ASE_SPEC      = 0x74,
    VFP_SPEC      = 0x75,
    CRYPTO_SPEC   = 0x77,
};
/** Pairs a perf event type with the event number to count. */
struct PmuEventInfo
{
    uint64_t type;  /**< Event type, e.g. PERF_TYPE_HARDWARE or PERF_TYPE_RAW. */
    uint64_t event; /**< Event number within that type. */

    /** Build from a numeric type and a numeric event. */
    PmuEventInfo(uint64_t type, uint64_t event) :
        type{type},
        event{event}
    {
    }

    /** Build from a numeric type and a PmuImplDefined event, stored as its underlying integer. */
    PmuEventInfo(uint64_t type, PmuImplDefined event) :
        type{type},
        event{static_cast<uint64_t>(event)}
    {
    }
};
/** Class provides access to CPU hardware counters. */
class PmuCounter
{
public:
public:
/** Default constructor. */
PmuCounter();
/** Create PMU counter with specified config.
*
* This constructor automatically calls @ref open with the default
* configuration.
*
* @param[in] config Counter identifier.
*/
PmuCounter(uint64_t config);
*
* This constructor automatically calls @ref open with the default
* configuration.
*
* @param[in] config Counter info.
*/
PmuCounter(PmuEventInfo config);
/** Default destructor. */
~PmuCounter();
/** Get the counter value.
*
* @return Counter value casted to the specified type. */
*
* @return Counter value casted to the specified type. */
template <typename T>
T get_value() const;
/** Open the specified counter based on the default configuration.
*
* @param[in] config The default configuration.
*/
void open(uint64_t config);
/** Open the specified counter based on the given configuration.
*
* @param[in] config The configuration.
*/
void open(PmuEventInfo config);
/** Open the specified configuration.
*
* @param[in] perf_config The specified configuration.
*/
*
* @param[in] perf_config The specified configuration.
*/
void open(const perf_event_attr &perf_config);
/** Close the currently open counter. */
@@ -83,15 +113,15 @@ class PmuCounter
/** Print counter config ID. */
std::string config_to_str(const perf_event_attr &perf_config);
private:
private:
perf_event_attr _perf_config;
long _fd{-1};
long _fd {-1};
};
template <typename T>
T PmuCounter::get_value() const
{
long long value{};
long long value {};
const ssize_t result = read(_fd, &value, sizeof(long long));
if (result == -1)
@@ -28,17 +28,29 @@
namespace hwcpipe
{
const std::unordered_map<CpuCounter, uint64_t, CpuCounterHash> pmu_mappings{
{CpuCounter::Cycles, PERF_COUNT_HW_CPU_CYCLES},
{CpuCounter::Instructions, PERF_COUNT_HW_INSTRUCTIONS},
{CpuCounter::CacheReferences, PERF_COUNT_HW_CACHE_REFERENCES},
{CpuCounter::CacheMisses, PERF_COUNT_HW_CACHE_MISSES},
{CpuCounter::BranchInstructions, PERF_COUNT_HW_BRANCH_INSTRUCTIONS},
{CpuCounter::BranchMisses, PERF_COUNT_HW_BRANCH_MISSES},
/** Lookup table from each CpuCounter to the PmuEventInfo describing its perf event.
 *
 * Every entry is a two-element initializer. The first six counters (Cycles
 * through BranchMisses) pair PERF_TYPE_HARDWARE with a generic
 * PERF_COUNT_HW_* id; the remaining counters (L1Accesses through CryptoSpec)
 * pair PERF_TYPE_RAW with a PmuImplDefined event constant.
 *
 * NOTE(review): the element order is presumably (perf event type, event id) -
 * confirm against the PmuEventInfo declaration. The table is presumably
 * consulted by PmuProfiler when opening the enabled counters; that use is not
 * visible in this span.
 */
const std::unordered_map<CpuCounter, PmuEventInfo, CpuCounterHash> pmu_mappings {
{CpuCounter::Cycles, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES}},
{CpuCounter::Instructions, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS}},
{CpuCounter::CacheReferences, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES}},
{CpuCounter::CacheMisses, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES}},
{CpuCounter::BranchInstructions, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS}},
{CpuCounter::BranchMisses, {PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES}},
// Counters below are opened as raw PMU events rather than generic hardware events.
{CpuCounter::L1Accesses, {PERF_TYPE_RAW, PmuImplDefined::L1_ACCESSES}},
{CpuCounter::InstrRetired, {PERF_TYPE_RAW, PmuImplDefined::INSTR_RETIRED}},
{CpuCounter::L2Accesses, {PERF_TYPE_RAW, PmuImplDefined::L2_ACCESSES}},
{CpuCounter::L3Accesses, {PERF_TYPE_RAW, PmuImplDefined::L3_ACCESSES}},
{CpuCounter::BusReads, {PERF_TYPE_RAW, PmuImplDefined::BUS_READS}},
{CpuCounter::BusWrites, {PERF_TYPE_RAW, PmuImplDefined::BUS_WRITES}},
{CpuCounter::MemReads, {PERF_TYPE_RAW, PmuImplDefined::MEM_READS}},
{CpuCounter::MemWrites, {PERF_TYPE_RAW, PmuImplDefined::MEM_WRITES}},
{CpuCounter::ASESpec, {PERF_TYPE_RAW, PmuImplDefined::ASE_SPEC}},
{CpuCounter::VFPSpec, {PERF_TYPE_RAW, PmuImplDefined::VFP_SPEC}},
{CpuCounter::CryptoSpec, {PERF_TYPE_RAW, PmuImplDefined::CRYPTO_SPEC}},
};
PmuProfiler::PmuProfiler(const CpuCounterSet &enabled_counters) :
enabled_counters_(enabled_counters)
enabled_counters_(enabled_counters)
{
// Set up PMU counters
for (const auto &counter : enabled_counters)
@@ -77,7 +89,7 @@ void PmuProfiler::run()
for (auto &pmu_counter : pmu_counters_)
{
pmu_counter.second.reset();
prev_measurements_[pmu_counter.first] = Value{};
prev_measurements_[pmu_counter.first] = Value {};
}
}
@@ -33,7 +33,7 @@ namespace hwcpipe
/** A CPU profiler that uses PMU counter data. */
class PmuProfiler : public CpuProfiler
{
public:
public:
explicit PmuProfiler(const CpuCounterSet &enabled_counters);
virtual ~PmuProfiler() = default;
@@ -56,22 +56,35 @@ class PmuProfiler : public CpuProfiler
virtual const CpuMeasurements &sample() override;
virtual void stop() override;
private:
CpuCounterSet enabled_counters_{};
CpuCounterSet available_counters_{};
private:
CpuCounterSet enabled_counters_ {};
CpuCounterSet available_counters_ {};
const CpuCounterSet supported_counters_{
CpuCounter::Cycles,
CpuCounter::Instructions,
CpuCounter::CacheReferences,
CpuCounter::CacheMisses,
CpuCounter::BranchInstructions,
CpuCounter::BranchMisses};
const CpuCounterSet supported_counters_ {
CpuCounter::Cycles,
CpuCounter::Instructions,
CpuCounter::CacheReferences,
CpuCounter::CacheMisses,
CpuCounter::BranchInstructions,
CpuCounter::BranchMisses,
CpuMeasurements measurements_{};
CpuMeasurements prev_measurements_{};
CpuCounter::L1Accesses,
CpuCounter::InstrRetired,
CpuCounter::L2Accesses,
CpuCounter::L3Accesses,
CpuCounter::BusReads,
CpuCounter::BusWrites,
CpuCounter::MemReads,
CpuCounter::MemWrites,
CpuCounter::ASESpec,
CpuCounter::VFPSpec,
CpuCounter::CryptoSpec,
};
std::unordered_map<CpuCounter, PmuCounter, CpuCounterHash> pmu_counters_{};
CpuMeasurements measurements_ {};
CpuMeasurements prev_measurements_ {};
std::unordered_map<CpuCounter, PmuCounter, CpuCounterHash> pmu_counters_ {};
};
} // namespace hwcpipe