radv: Add a list of performance counters.
[platform/upstream/mesa.git] src/amd/vulkan/radv_perfcounter.c
/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "ac_perfcounter.h"
#include "amdgfxregs.h"
#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

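/* Select which shader stages feed the SQ performance counters: the low 7 bits
 * of 'shaders' form the stage mask written to SQ_PERFCOUNTER_CTRL, and the
 * following register in the 2-register sequence (presumably the SQ counter
 * mask) is set to all ones.
 */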
void
radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
{
   radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
   radeon_emit(cs, shaders & 0x7f);
   radeon_emit(cs, 0xffffffff);
}

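/* Put both the regular (PERFMON_STATE) and the streaming (SPM_PERFMON_STATE)
 * performance monitor state machines into disable-and-reset via
 * CP_PERFMON_CNTL.
 */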
void
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
{
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET));
}

void
radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Start SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));

   /* Start windowed performance counters. */
   if (family == RADV_QUEUE_GENERAL) {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
   }
   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));
}

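/* Stop the windowed and SPM counters. The never_send_perfcounter_stop and
 * never_stop_sq_perf_counters flags come from the shared AMD GPU info and are
 * presumably hardware workarounds: on affected chips the PERFCOUNTER_STOP
 * event is skipped and SPM is left counting instead of being stopped.
 */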
void
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   /* Stop windowed performance counters. */
   if (family == RADV_QUEUE_GENERAL) {
      if (!device->physical_device->rad_info.never_send_perfcounter_stop) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
      }
   }
   radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));

   /* Stop SPM counters. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                              S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
                              S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ?
                                                            V_036020_STRM_PERFMON_STATE_START_COUNTING :
                                                            V_036020_STRM_PERFMON_STATE_STOP_COUNTING));
}

enum radv_perfcounter_op {
   RADV_PC_OP_SUM,
   RADV_PC_OP_MAX,
   RADV_PC_OP_RATIO_DIVSCALE,
   RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */
   RADV_PC_OP_SUM_WEIGHTED_4,
};

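/* Encoding of the uint32_t entries in radv_perfcounter_impl::regs: either a
 * hardware counter reference (per-block selector in bits 0..15, block id in
 * bits 16..30) or, when bit 31 is set, a literal constant value (see
 * CONSTANT() below). The OFFSET/INSTANCES variant packs a result-buffer
 * offset and an instance count and is presumably used by the readback code
 * elsewhere in this file.
 */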
#define S_REG_SEL(x)   ((x)&0xFFFF)
#define G_REG_SEL(x)   ((x)&0xFFFF)
#define S_REG_BLOCK(x) ((x) << 16)
#define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF)

#define S_REG_OFFSET(x)    ((x)&0xFFFF)
#define G_REG_OFFSET(x)    ((x)&0xFFFF)
#define S_REG_INSTANCES(x) ((x) << 16)
#define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF)
#define S_REG_CONSTANT(x)  ((x) << 31)
#define G_REG_CONSTANT(x)  ((x) >> 31)

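/* A counter implementation: up to 8 source registers (hardware selectors
 * and/or constants, in the encoding above) combined according to op. For
 * RADV_PC_OP_SUM_WEIGHTED_4 the eight entries appear to form four
 * (counter, weight) pairs, as used for the VRAM read/write size counters
 * below.
 */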
struct radv_perfcounter_impl {
   enum radv_perfcounter_op op;
   uint32_t regs[8];
};

/* Only append to this list, never insert into the middle or remove (but renaming is fine).
 *
 * The invariant we're aiming for is that a UUID identifies the meaning of a counter, so
 * the same UUID can be shared between counters that have different implementations on
 * different GPUs, but each UUID should be unique within a GPU.
 */
enum radv_perfcounter_uuid {
   RADV_PC_UUID_GPU_CYCLES,
   RADV_PC_UUID_SHADER_WAVES,
   RADV_PC_UUID_SHADER_INSTRUCTIONS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS,
   RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS,
   RADV_PC_UUID_SHADER_VALU_BUSY,
   RADV_PC_UUID_SHADER_SALU_BUSY,
   RADV_PC_UUID_VRAM_READ_SIZE,
   RADV_PC_UUID_VRAM_WRITE_SIZE,
   RADV_PC_UUID_L0_CACHE_HIT_RATIO,
   RADV_PC_UUID_L1_CACHE_HIT_RATIO,
   RADV_PC_UUID_L2_CACHE_HIT_RATIO,
};

struct radv_perfcounter_desc {
   struct radv_perfcounter_impl impl;

   VkPerformanceCounterUnitKHR unit;

   char name[VK_MAX_DESCRIPTION_SIZE];
   char category[VK_MAX_DESCRIPTION_SIZE];
   char description[VK_MAX_DESCRIPTION_SIZE];
   enum radv_perfcounter_uuid uuid;
};

#define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...)          \
   (struct radv_perfcounter_desc)                                                                  \
   {                                                                                               \
      .impl = {.op = arg_op, .regs = {__VA_ARGS__}},                                               \
      .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name,                      \
      .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid    \
   }

#define ADD_PC(op, unit, name, category, description, uuid, ...)                                   \
   do {                                                                                            \
      if (descs) {                                                                                 \
         descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__);      \
      }                                                                                            \
      ++*count;                                                                                    \
   } while (0)
#define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr))
#define CONSTANT(v)     (S_REG_CONSTANT(1) | (uint32_t)(v))

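/* For example, the "Waves" counter added below via
 *    ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed",
 *           SHADER_WAVES, SQ_PERF_SEL_WAVES);
 * bumps *count and, when a destination array was passed, stores a descriptor
 * with unit VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, UUID
 * RADV_PC_UUID_SHADER_WAVES and an impl that sums the single SQ_PERF_SEL_WAVES
 * selector. The enums that follow define the per-block selectors referenced by
 * those descriptors.
 */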
enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) };

enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) };

enum {
   GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe),
   GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12),
};

enum {
   GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3),

   GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23),
   GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c),

   GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b),
   GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53),
   GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55),
   GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63),
   GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64),
   GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65),
   GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66),
};

enum {
   SQ_PERF_SEL_WAVES = CTR(SQ, 0x4),
   SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31),
   SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37),
   SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b),
   SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c),
   SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d),
   SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40),
   SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45),
   SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46),
   SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75),
};

enum {
   TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9),
   TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12),
};

#define CTR_NUM_SIMD                                                                               \
   CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_good_compute_units)
#define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_good_compute_units)

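/* Builds the list of performance counters exposed for this device. Uses the
 * usual two-call pattern: with descs == NULL only *count is written; call it
 * again with an array of that size to fill in the descriptors (see
 * radv_init_perfcounter_descs()).
 */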
static void
radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count,
                             struct radv_perfcounter_desc *descs)
{
   *count = 0;

   ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM",
          "Cycles the GPU is active processing a command buffer", GPU_CYCLES,
          GRBM_PERF_SEL_GUI_ACTIVE);

   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES,
          SQ_PERF_SEL_WAVES);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of instructions executed",
          SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders",
          "Number of VALU instructions executed", SHADER_INSTRUCTIONS_VALU,
          SQ_PERF_SEL_INSTS_VALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders",
          "Number of SALU instructions executed", SHADER_INSTRUCTIONS_SALU,
          SQ_PERF_SEL_INSTS_SALU_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders",
          "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD,
          SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders",
          "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD,
          SQ_PERF_SEL_INSTS_SMEM_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders",
          "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE,
          SQ_PERF_SEL_INSTS_TEX_STORE_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders",
          "Number of LDS instructions executed", SHADER_INSTRUCTIONS_LDS,
          SQ_PERF_SEL_INSTS_LDS_GFX10);
   ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders",
          "Number of GDS instructions executed", SHADER_INSTRUCTIONS_GDS,
          SQ_PERF_SEL_INSTS_GDS_GFX10);

   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization",
          "Percentage of time the VALU units are busy", SHADER_VALU_BUSY,
          SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD);
   ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization",
          "Percentage of time the SALU units are busy", SHADER_SALU_BUSY,
          SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS);

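   /* VRAM read/write sizes are derived as weighted sums of the GL2C
    * request-size counters. The GL2C selector numbering changed between
    * gfx10.1 and gfx10.3, hence the two variants. On gfx10.1 the 64B write
    * requests appear to add an extra 32 bytes on top of MC_WRREQ, which
    * presumably already counts every write request at 32 bytes.
    */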
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   } else {
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory",
             "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64),
             GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101,
             CONSTANT(128));
      ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory",
             "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101,
             CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0),
             CONSTANT(0), CONSTANT(0), CONSTANT(0));
   }

   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache",
          L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10);
   ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache",
          L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ);
   if (pdev->rad_info.gfx_level >= GFX10_3) {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103,
             GL2C_PERF_SEL_REQ);
   } else {
      ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory",
             "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101,
             GL2C_PERF_SEL_REQ);
   }
}

static bool
radv_init_perfcounter_descs(struct radv_physical_device *pdev)
{
   if (pdev->perfcounters)
      return true;

   uint32_t count;
   radv_query_perfcounter_descs(pdev, &count, NULL);

   struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count);
   if (!descs)
      return false;

   radv_query_perfcounter_descs(pdev, &count, descs);
   pdev->num_perfcounters = count;
   pdev->perfcounters = descs;

   return true;
}

static int
cmp_uint32_t(const void *a, const void *b)
{
   uint32_t l = *(const uint32_t *)a;
   uint32_t r = *(const uint32_t *)b;

   return (l < r) ? -1 : (l > r) ? 1 : 0;
}

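/* Gathers the hardware registers needed for the counters selected by
 * 'indices': constant entries are skipped and the result is sorted and
 * deduplicated. The dedup step assumes at least one non-constant register was
 * collected, which holds because every counter above references a hardware
 * selector. The caller is responsible for freeing *out_regs.
 */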
static VkResult
radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices,
                           const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs)
{
   ASSERTED uint32_t num_counters = pdevice->num_perfcounters;
   const struct radv_perfcounter_desc *descs = pdevice->perfcounters;

   unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs);
   uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t));
   if (!regs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   unsigned reg_cnt = 0;
   for (unsigned i = 0; i < num_indices; ++i) {
      uint32_t index = indices[i];
      assert(index < num_counters);
      for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j];
           ++j) {
         if (!G_REG_CONSTANT(descs[index].impl.regs[j]))
            regs[reg_cnt++] = descs[index].impl.regs[j];
      }
   }

   qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t);

   unsigned deduped_reg_cnt = 0;
   for (unsigned i = 1; i < reg_cnt; ++i) {
      if (regs[i] != regs[deduped_reg_cnt])
         regs[++deduped_reg_cnt] = regs[i];
   }
   ++deduped_reg_cnt;

   *out_num_regs = deduped_reg_cnt;
   *out_regs = regs;
   return VK_SUCCESS;
}

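/* Number of instances of a block that have to be sampled: the block's
 * instance count, multiplied by the number of shader engines when the block
 * is instantiated per shader engine (AC_PC_BLOCK_SE).
 */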
static unsigned
radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block)
{
   return ac_block->num_instances *
          ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1);
}

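/* Number of passes needed to sample the given register list: because the
 * block id sits in the high bits of the sort key, the sorted list groups all
 * selectors of a block together, and each block only has b->b->num_counters
 * counter slots, so a block needing more selectors than slots spreads them
 * over ceil(selectors / slots) passes. The result is the maximum over all
 * blocks.
 */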
static unsigned
radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs,
                            const uint32_t *regs)
{
   enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK;
   unsigned block_reg_count = 0;
   struct ac_pc_block *ac_block = NULL;
   unsigned passes_needed = 1;

   for (unsigned i = 0; i < num_regs; ++i) {
      enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]);

      if (block != prev_block) {
         block_reg_count = 0;
         prev_block = block;
         ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
      }

      ++block_reg_count;

      passes_needed =
         MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters));
   }

   return passes_needed;
}