radv: Add a list of performance counters.
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sat, 4 Jun 2022 21:51:38 +0000 (23:51 +0200)
committer Marge Bot <emma+marge@anholt.net>
Sat, 9 Jul 2022 12:29:06 +0000 (12:29 +0000)
Plus helpers that will be shared for listing counters + doing the
actual queries.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16879>

src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_perfcounter.c
src/amd/vulkan/radv_private.h

index 78d89a2..34b9605 100644 (file)
@@ -3569,6 +3569,11 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
          result = VK_ERROR_OUT_OF_HOST_MEMORY;
          goto fail_cache;
       }
+
+      if (!device->physical_device->ac_perfcounters.blocks) {
+         result = VK_ERROR_INITIALIZATION_FAILED;
+         goto fail_cache;
+      }
    }
 
    *pDevice = radv_device_to_handle(device);
index b328551..5da44fc 100644 (file)
@@ -23,6 +23,8 @@
 
 #include <inttypes.h>
 
+#include "ac_perfcounter.h"
+#include "amdgfxregs.h"
 #include "radv_cs.h"
 #include "radv_private.h"
 #include "sid.h"
@@ -78,3 +80,314 @@ radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf
                                                             V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                             V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
 }
+
+/* Selects how the values named by radv_perfcounter_impl::regs are combined into
+ * the value reported for a counter. The combining code is not part of this
+ * section; the per-op notes describe how each op is used by
+ * radv_query_perfcounter_descs().
+ */
+enum radv_perfcounter_op {
+   RADV_PC_OP_SUM, /* used with one hardware counter */
+   RADV_PC_OP_MAX, /* used with one hardware counter */
+   RADV_PC_OP_RATIO_DIVSCALE, /* used with (counter, counter, constant) -- exact formula: TODO confirm */
+   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
+   RADV_PC_OP_SUM_WEIGHTED_4, /* used with four (value, constant) pairs -- presumably a weighted sum, TODO confirm */
+};
+
+/* Packing helpers for 32-bit register descriptors (S_ = set field, G_ = get field).
+ *
+ * Counter descriptor: bits 0..15 hold the selector, bits 16..30 hold the block.
+ */
+#define S_REG_SEL(x)   ((x)&0xFFFF)
+#define G_REG_SEL(x)   ((x)&0xFFFF)
+#define S_REG_BLOCK(x) ((x) << 16)
+#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)
+
+/* Second layout: bits 0..15 hold an offset, bits 16..30 an instance count and
+ * bit 31 flags the word as an immediate constant rather than a hardware counter.
+ */
+#define S_REG_OFFSET(x)    ((x)&0xFFFF)
+#define G_REG_OFFSET(x)    ((x)&0xFFFF)
+#define S_REG_INSTANCES(x) ((x) << 16)
+#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
+/* Shift as uint32_t: shifting a signed int 1 into bit 31 is undefined behaviour. */
+#define S_REG_CONSTANT(x)  ((uint32_t)(x) << 31)
+#define G_REG_CONSTANT(x)  ((x) >> 31)
+
+/* Recipe for computing one exposed counter. */
+struct radv_perfcounter_impl {
+   /* How the entries of regs are combined. */
+   enum radv_perfcounter_op op;
+   /* Operands: each entry is either a CTR(block, selector) descriptor or a
+    * CONSTANT(value) immediate. Unused trailing entries are zero;
+    * radv_get_counter_registers() stops scanning at the first zero entry.
+    */
+   uint32_t regs[8];
+};
+
+/* Only append to this list, never insert into the middle or remove (but can rename).
+ *
+ * The invariant we're trying to get here is counters that have the same meaning, so
+ * these can be shared between counters that have different implementations on different
+ * GPUs, but should be unique within a GPU.
+ *
+ * The enumerators carry no explicit values, so each identifier's numeric value is
+ * defined purely by its position in this list.
+ */
+enum radv_perfcounter_uuid {
+   RADV_PC_UUID_GPU_CYCLES,
+   RADV_PC_UUID_SHADER_WAVES,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
+   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
+   RADV_PC_UUID_SHADER_VALU_BUSY,
+   RADV_PC_UUID_SHADER_SALU_BUSY,
+   RADV_PC_UUID_VRAM_READ_SIZE,
+   RADV_PC_UUID_VRAM_WRITE_SIZE,
+   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
+   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
+   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
+};
+
+/* Full description of one exposed performance counter: how to compute it plus
+ * the metadata describing it.
+ */
+struct radv_perfcounter_desc {
+   /* Operation and operands used to compute the counter value. */
+   struct radv_perfcounter_impl impl;
+
+   /* Vulkan unit of the resulting value. */
+   VkPerformanceCounterUnitKHR unit;
+
+   /* Fixed-size text fields, each VK_MAX_DESCRIPTION_SIZE bytes. */
+   char name[VK_MAX_DESCRIPTION_SIZE];
+   char category[VK_MAX_DESCRIPTION_SIZE];
+   char description[VK_MAX_DESCRIPTION_SIZE];
+   /* Identifier for the counter's meaning; see enum radv_perfcounter_uuid. */
+   enum radv_perfcounter_uuid uuid;
+};
+
+/* Builds a struct radv_perfcounter_desc compound literal. arg_unit and arg_uuid
+ * are pasted onto VK_PERFORMANCE_COUNTER_UNIT_..._KHR and RADV_PC_UUID_...; the
+ * variadic tail initializes impl.regs (entries not supplied are zero).
+ */
+#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)          \
+   (struct radv_perfcounter_desc)                                                                  \
+   {                                                                                               \
+      .impl = {.op = arg_op, .regs = {__VA_ARGS__}},                                               \
+      .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name,                      \
+      .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid    \
+   }
+
+/* Appends one descriptor. Relies on two names in the expanding scope: `descs`
+ * (array to fill, or NULL) and `count` (pointer to the running index). With
+ * descs == NULL nothing is stored and only *count is incremented.
+ */
+#define ADD_PC(op, unit, name, category, description, uuid, ...)                                   \
+   do {                                                                                            \
+      if (descs) {                                                                                 \
+         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);      \
+      }                                                                                            \
+      ++*count;                                                                                    \
+   } while (0)
+/* Encodes a hardware counter operand from a block and a selector. */
+#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
+/* Encodes an immediate operand: bit 31 set, v in the remaining bits (v must not use bit 31). */
+#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))
+
+/* Hardware counter selectors, pre-encoded with CTR() as (block, selector) pairs
+ * and grouped by block. The _GFX101 / _GFX103 variants are alternatives for the
+ * same event: radv_query_perfcounter_descs() uses the _GFX103 ones when
+ * gfx_level >= GFX10_3 and the _GFX101 ones otherwise.
+ */
+enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };
+
+enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };
+
+enum {
+   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
+   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
+};
+
+enum {
+   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),
+
+   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
+   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
+   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
+   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
+   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
+   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
+   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),
+
+   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
+   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
+   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
+   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
+   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
+   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
+   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
+};
+
+enum {
+   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
+   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
+   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
+   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
+   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
+   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
+   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
+   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
+   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
+   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
+};
+
+enum {
+   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
+   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
+};
+
+/* Immediate operands derived from the device: total SIMD count and usable CU
+ * count. Both expand to expressions reading `pdev`, which must be in scope at
+ * the point of use.
+ */
+#define CTR_NUM_SIMD                                                                               \
+   CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_good_compute_units)
+#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_good_compute_units)
+
+/* Enumerates the performance counters exposed for a physical device.
+ *
+ * pdev   device queried for gfx_level and CU/SIMD counts
+ * count  set to the number of counters (always written, reset to 0 first)
+ * descs  NULL to only count, otherwise an array with room for at least the
+ *        number of entries a counting call reports; filled in order
+ *
+ * Calling it once with descs == NULL and once with a sized array yields the
+ * same count, since both passes take the same branches.
+ */
+static void
+radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count,
+                             struct radv_perfcounter_desc *descs)
+{
+   *count = 0;
+
+   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM",
+          "cycles the GPU is active processing a command buffer.", GPU_CYCLES,
+          GRBM_PERF_SEL_GUI_ACTIVE);
+
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES,
+          SQ_PERF_SEL_WAVES);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed",
+          SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders",
+          "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU,
+          SQ_PERF_SEL_INSTS_VALU_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders",
+          "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU,
+          SQ_PERF_SEL_INSTS_SALU_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders",
+          "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD,
+          SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders",
+          "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD,
+          SQ_PERF_SEL_INSTS_SMEM_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders",
+          "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE,
+          SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders",
+          "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS,
+          SQ_PERF_SEL_INSTS_LDS_GFX10);
+   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders",
+          "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS,
+          SQ_PERF_SEL_INSTS_GDS_GFX10);
+
+   /* Utilization counters: (event counter, busy counter, unit-count constant). */
+   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
+          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY,
+          SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
+   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
+          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY,
+          SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);
+
+   /* The GL2C selectors differ between generations, so pick the matching set.
+    * Operands are (counter, weight) pairs; unused pairs are CONSTANT(0).
+    */
+   if (pdev->rad_info.gfx_level >= GFX10_3) {
+      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
+             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103,
+             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64),
+             GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
+             CONSTANT(128));
+      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
+             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103,
+             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0),
+             CONSTANT(0), CONSTANT(0), CONSTANT(0));
+   } else {
+      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
+             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101,
+             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64),
+             GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
+             CONSTANT(128));
+      /* NOTE(review): the 64B write counter is weighted 32 here but 64 in the
+       * GFX10_3 branch above -- confirm this difference is intentional.
+       */
+      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
+             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101,
+             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0),
+             CONSTANT(0), CONSTANT(0), CONSTANT(0));
+   }
+
+   /* Hit ratios: operands are (miss counter, request counter).
+    * NOTE(review): these are declared with unit BYTES although they are
+    * ratios -- confirm whether another unit was intended.
+    */
+   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache",
+          L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
+   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache",
+          L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
+   if (pdev->rad_info.gfx_level >= GFX10_3) {
+      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
+             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103,
+             GL2C_PERF_SEL_REQ);
+   } else {
+      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
+             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101,
+             GL2C_PERF_SEL_REQ);
+   }
+}
+
+/* Lazily builds pdev->perfcounters / pdev->num_perfcounters.
+ *
+ * Returns true when the table is available (already built or built now) and
+ * false when the allocation failed, in which case pdev is left untouched.
+ */
+static bool
+radv_init_perfcounter_descs(struct radv_physical_device *pdev)
+{
+   uint32_t num_descs;
+   struct radv_perfcounter_desc *table;
+
+   /* Nothing to do when a previous call already populated the table. */
+   if (pdev->perfcounters != NULL)
+      return true;
+
+   /* First pass only counts so the array can be sized exactly. */
+   radv_query_perfcounter_descs(pdev, &num_descs, NULL);
+
+   table = malloc(num_descs * sizeof(struct radv_perfcounter_desc));
+   if (table == NULL)
+      return false;
+
+   /* Second pass fills the freshly allocated array. */
+   radv_query_perfcounter_descs(pdev, &num_descs, table);
+
+   pdev->perfcounters = table;
+   pdev->num_perfcounters = num_descs;
+   return true;
+}
+
+/* qsort() comparison callback ordering uint32_t values ascending.
+ * Returns -1, 0 or 1 when *a is below, equal to or above *b.
+ */
+static int
+cmp_uint32_t(const void *a, const void *b)
+{
+   const uint32_t *lhs = a;
+   const uint32_t *rhs = b;
+
+   if (*lhs < *rhs)
+      return -1;
+   if (*lhs > *rhs)
+      return 1;
+   return 0;
+}
+
+/* Collects the distinct hardware counter registers needed by a set of counters.
+ *
+ * pdevice       device whose perfcounters table is indexed
+ * num_indices   number of entries in indices
+ * indices       indices into pdevice->perfcounters, each < num_perfcounters
+ * out_num_regs  receives the number of distinct registers (0 if none are needed)
+ * out_regs      receives a malloc()ed array sorted ascending without duplicates;
+ *               the caller owns it and releases it with free()
+ *
+ * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY with the outputs untouched.
+ */
+static VkResult
+radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices,
+                           const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs)
+{
+   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
+   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;
+
+   /* Worst case: every operand slot of every counter is a hardware register.
+    * Allocate at least one element so an empty request is not mistaken for an
+    * allocation failure (malloc(0) may legitimately return NULL).
+    */
+   size_t full_reg_cnt = (size_t)num_indices * ARRAY_SIZE(descs->impl.regs);
+   uint32_t *regs = malloc(MAX2(full_reg_cnt, 1) * sizeof(uint32_t));
+   if (!regs)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   unsigned reg_cnt = 0;
+   for (unsigned i = 0; i < num_indices; ++i) {
+      uint32_t index = indices[i];
+      assert(index < num_counters);
+      /* A zero entry terminates the operand list; constants need no register. */
+      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j];
+           ++j) {
+         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
+            regs[reg_cnt++] = descs[index].impl.regs[j];
+      }
+   }
+
+   /* Sorting makes duplicates adjacent so they can be dropped in place. */
+   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);
+
+   unsigned deduped_reg_cnt = 0;
+   if (reg_cnt > 0) {
+      /* regs[deduped_reg_cnt] is the last unique value kept so far. */
+      for (unsigned i = 1; i < reg_cnt; ++i) {
+         if (regs[i] != regs[deduped_reg_cnt])
+            regs[++deduped_reg_cnt] = regs[i];
+      }
+      /* Convert "index of last kept element" into a count. */
+      ++deduped_reg_cnt;
+   }
+
+   *out_num_regs = deduped_reg_cnt;
+   *out_regs = regs;
+   return VK_SUCCESS;
+}
+
+/* Returns the instance count of a counter block: ac_block->num_instances,
+ * multiplied by rad_info.max_se when the block carries the AC_PC_BLOCK_SE flag.
+ */
+static unsigned
+radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
+{
+   unsigned se_multiplier = 1;
+
+   if (ac_block->b->b->flags & AC_PC_BLOCK_SE)
+      se_multiplier = pdevice->rad_info.max_se;
+
+   return ac_block->num_instances * se_multiplier;
+}
+
+/* Computes how many passes are needed to sample the given registers.
+ *
+ * Registers are walked in order and consecutive entries with the same block are
+ * counted together, so the result is only meaningful when regs is grouped by
+ * block (e.g. the sorted output of radv_get_counter_registers()). For each
+ * block the requirement is ceil(registers in block / block's num_counters);
+ * the maximum over all blocks is returned, and never less than 1.
+ *
+ * NOTE(review): the result of ac_pc_get_block() is dereferenced unchecked --
+ * confirm it cannot be NULL for the blocks used here.
+ */
+static unsigned
+radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs,
+                            const uint32_t *regs)
+{
+   unsigned max_passes = 1;
+   unsigned regs_in_block = 0;
+   enum ac_pc_gpu_block cur_block = NUM_GPU_BLOCK;
+   struct ac_pc_block *cur_ac_block = NULL;
+
+   for (unsigned idx = 0; idx != num_regs; idx++) {
+      const enum ac_pc_gpu_block reg_block = G_REG_BLOCK(regs[idx]);
+
+      if (reg_block == cur_block) {
+         ++regs_in_block;
+      } else {
+         /* Entering a new block: look it up and restart the per-block tally. */
+         cur_block = reg_block;
+         cur_ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, reg_block);
+         regs_in_block = 1;
+      }
+
+      const unsigned passes = DIV_ROUND_UP(regs_in_block, cur_ac_block->b->b->num_counters);
+      if (passes > max_passes)
+         max_passes = passes;
+   }
+
+   return max_passes;
+}
index 8c54ed9..a45b060 100644 (file)
@@ -260,6 +260,8 @@ enum radv_queue_family {
    RADV_QUEUE_IGNORED,
 };
 
+struct radv_perfcounter_desc;
+
 struct radv_physical_device {
    struct vk_physical_device vk;
 
@@ -336,6 +338,9 @@ struct radv_physical_device {
 
    /* Performance counters. */
    struct ac_perfcounters ac_perfcounters;
+
+   uint32_t num_perfcounters;
+   struct radv_perfcounter_desc *perfcounters;
 };
 
 struct radv_instance {