From 2cc981a0cd548ecce1be16a92275d7a2a111e0fc Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 12 Sep 2023 09:05:48 +0200 Subject: [PATCH] radv: fix capturing RGP on RDNA3 with more than one Shader Engine PKT3_RESET_FILTER_CAM_S seems required on GFX11. Otherwise, capturing with more than one SE can hang. Cc: mesa-stable Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_sqtt.c | 4 ---- src/amd/vulkan/radv_cs.h | 7 +++---- src/amd/vulkan/radv_perfcounter.c | 4 +++- src/amd/vulkan/radv_sqtt.c | 15 +++++++++------ src/gallium/drivers/radeonsi/si_sqtt.c | 18 +++++++++++++++--- 5 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c index c0cab4f..f2bf94f 100644 --- a/src/amd/common/ac_sqtt.c +++ b/src/amd/common/ac_sqtt.c @@ -229,10 +229,6 @@ ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type) bool ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se) { - /* FIXME: SQTT only works on SE0 for some unknown reasons. */ - if (info->gfx_level == GFX11) - return se != 0; - /* No active CU on the SE means it is disabled. 
*/ return info->cu_mask[se][0] == 0; } diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h index 05939dc..8899cf3 100644 --- a/src/amd/vulkan/radv_cs.h +++ b/src/amd/vulkan/radv_cs.h @@ -168,9 +168,9 @@ radeon_set_uconfig_reg_idx(const struct radv_physical_device *pdevice, struct ra } static inline void -radeon_set_perfctr_reg(struct radv_cmd_buffer *cmd_buffer, unsigned reg, unsigned value) +radeon_set_perfctr_reg(enum amd_gfx_level gfx_level, enum radv_queue_family qf, struct radeon_cmdbuf *cs, unsigned reg, + unsigned value) { - struct radeon_cmdbuf *cs = cmd_buffer->cs; assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END); assert(cs->cdw + 3 <= cs->reserved_dw); @@ -179,8 +179,7 @@ radeon_set_perfctr_reg(struct radv_cmd_buffer *cmd_buffer, unsigned reg, unsigne * that means that it can skip register writes due to not taking correctly into account the * fields from the GRBM_GFX_INDEX. With this bit we can force the write. */ - bool filter_cam_workaround = - cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10 && cmd_buffer->qf == RADV_QUEUE_GENERAL; + bool filter_cam_workaround = gfx_level >= GFX10 && qf == RADV_QUEUE_GENERAL; radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, 1, 0) | PKT3_RESET_FILTER_CAM_S(filter_cam_workaround)); radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2); diff --git a/src/amd/vulkan/radv_perfcounter.c b/src/amd/vulkan/radv_perfcounter.c index f7d7c34..c8746ae 100644 --- a/src/amd/vulkan/radv_perfcounter.c +++ b/src/amd/vulkan/radv_perfcounter.c @@ -462,6 +462,8 @@ radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance) static void radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, unsigned *selectors) { + const enum amd_gfx_level gfx_level = cmd_buffer->device->physical_device->rad_info.gfx_level; + const enum radv_queue_family qf = cmd_buffer->qf; struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = cmd_buffer->cs; 
unsigned idx; @@ -473,7 +475,7 @@ radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, return; for (idx = 0; idx < count; ++idx) { - radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx], G_REG_SEL(selectors[idx]) | regs->select_or); + radeon_set_perfctr_reg(gfx_level, qf, cs, regs->select0[idx], G_REG_SEL(selectors[idx]) | regs->select_or); } for (idx = 0; idx < regs->num_spm_counters; idx++) { diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c index 0cf40c2..00a249d 100644 --- a/src/amd/vulkan/radv_sqtt.c +++ b/src/amd/vulkan/radv_sqtt.c @@ -75,6 +75,7 @@ radv_emit_wait_for_idle(const struct radv_device *device, struct radeon_cmdbuf * static void radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf) { + const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; uint32_t shifted_size = device->sqtt.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; const struct radeon_info *rad_info = &device->physical_device->rad_info; unsigned max_se = rad_info->max_se; @@ -94,12 +95,12 @@ radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, if (device->physical_device->rad_info.gfx_level >= GFX11) { /* Order seems important for the following 2 registers. 
*/ - radeon_set_uconfig_reg(cs, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE, + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE, S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32)); - radeon_set_uconfig_reg(cs, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); - radeon_set_uconfig_reg(cs, R_0367B4_SQ_THREAD_TRACE_MASK, + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B4_SQ_THREAD_TRACE_MASK, S_0367B4_WTYPE_INCLUDE(0x7f) | /* all shader stages */ S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(first_active_cu / 2) | S_0367B4_SIMD_SEL(0)); @@ -118,10 +119,11 @@ radv_emit_sqtt_start(const struct radv_device *device, struct radeon_cmdbuf *cs, } sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude); - radeon_set_uconfig_reg(cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask); + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask); /* Should be emitted last (it enables thread traces). */ - radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, true)); + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, true)); + } else if (device->physical_device->rad_info.gfx_level >= GFX10) { /* Order seems important for the following 2 registers. */ radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE, @@ -301,6 +303,7 @@ radv_copy_sqtt_info_regs(const struct radv_device *device, struct radeon_cmdbuf static void radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf) { + const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level; unsigned max_se = device->physical_device->rad_info.max_se; /* Stop the thread trace with a different event based on the queue. 
*/ @@ -338,7 +341,7 @@ radv_emit_sqtt_stop(const struct radv_device *device, struct radeon_cmdbuf *cs, radeon_emit(cs, 4); /* poll interval */ /* Disable the thread trace mode. */ - radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, false)); + radeon_set_perfctr_reg(gfx_level, qf, cs, R_0367B0_SQ_THREAD_TRACE_CTRL, gfx11_get_sqtt_ctrl(device, false)); /* Wait for thread trace completion. */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 5850e5f..63618e6 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -47,6 +47,18 @@ static bool si_sqtt_init_bo(struct si_context *sctx) { return true; } +static bool +si_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se) +{ + /* FIXME: SQTT only works on SE0 for some unknown reasons. See RADV for the + * solution */ + if (info->gfx_level == GFX11) + return se != 0; + + /* No active CU on the SE means it is disabled. */ + return info->cu_mask[se][0] == 0; +} + static void si_emit_sqtt_start(struct si_context *sctx, struct radeon_cmdbuf *cs, uint32_t queue_family_index) { @@ -62,7 +74,7 @@ static void si_emit_sqtt_start(struct si_context *sctx, ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se); uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + if (si_sqtt_se_is_disabled(&sctx->screen->info, se)) continue; /* Target SEx and SH0. 
*/ @@ -333,7 +345,7 @@ static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs, } for (unsigned se = 0; se < max_se; se++) { - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + if (si_sqtt_se_is_disabled(&sctx->screen->info, se)) continue; radeon_begin(cs); @@ -565,7 +577,7 @@ static bool si_get_sqtt_trace(struct si_context *sctx, void *info_ptr = sqtt_ptr + info_offset; struct ac_sqtt_data_info *info = (struct ac_sqtt_data_info *)info_ptr; - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + if (si_sqtt_se_is_disabled(&sctx->screen->info, se)) continue; if (!ac_is_sqtt_complete(&sctx->screen->info, sctx->sqtt, info)) { -- 2.7.4