From 22f3bcfb5a3311a2c61ad26c943976e66b68b09c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 3 Jun 2023 07:19:01 -0400
Subject: [PATCH] radeonsi/gfx11: use SET_*_REG_PAIRS_PACKED packets for pm4 states

It can generate all PACKED packets, but only SET_CONTEXT_REG_PAIRS_PACKED is
generated because register shadowing is required by SET_SH_REG_PAIRS_PACKED*.

Reviewed-by: Pierre-Eric Pelloux-Prayer
Part-of:
---
 src/gallium/drivers/radeonsi/si_pm4.c             | 196 +++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_pm4.h             |   2 +
 src/gallium/drivers/radeonsi/si_state.c           |   7 +
 src/gallium/drivers/radeonsi/si_state_draw.cpp    |   1 +
 src/gallium/drivers/radeonsi/si_state_shaders.cpp |  12 ++
 5 files changed, 217 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index 0c7496c..e297aad 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -9,13 +9,167 @@
 #include "sid.h"
 #include "util/u_memory.h"
 
+static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
+                                  unsigned opcode, unsigned idx);
+
+/* Return true if the opcode is one of the SET_*_REG_PAIRS_PACKED* opcodes. */
+static bool opcode_is_packed(unsigned opcode)
+{
+   return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
+          opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
+          opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
+}
+
+/* Map a packed SET opcode to the plain SET_*_REG opcode for the same register class. */
+static unsigned packed_opcode_to_unpacked(unsigned opcode)
+{
+   switch (opcode) {
+   case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
+      return PKT3_SET_CONTEXT_REG;
+   case PKT3_SET_SH_REG_PAIRS_PACKED:
+      return PKT3_SET_SH_REG;
+   default:
+      unreachable("invalid packed opcode");
+   }
+}
+
+/* Return the packed variant of a plain SET opcode when the chip can use it:
+ * context registers on GFX11+, SH registers on GFX11+ only with register shadowing.
+ * Any other opcode is returned unchanged.
+ */
+static unsigned unpacked_opcode_to_packed(struct si_pm4_state *state, unsigned opcode)
+{
+   switch (opcode) {
+   case PKT3_SET_CONTEXT_REG:
+      if (state->screen->info.gfx_level >= GFX11)
+         return PKT3_SET_CONTEXT_REG_PAIRS_PACKED;
+      break;
+   case PKT3_SET_SH_REG:
+      if (state->screen->info.gfx_level >= GFX11 &&
+          state->screen->info.register_shadowing_required)
+         return PKT3_SET_SH_REG_PAIRS_PACKED;
+      break;
+   }
+
+   return opcode;
+}
+
+/* Layout of a packed SET packet built here, in dwords relative to last_pm4:
+ *    0: packet header
+ *    1: register count
+ *    2+: one 3-dword group per register pair:
+ *        (offset0 | offset1 << 16), value0, value1
+ * The helpers below derive the position inside a group from (ndw - last_pm4) % 3.
+ */
+static bool packed_next_is_reg_offset_pair(struct si_pm4_state *state)
+{
+   return (state->ndw - state->last_pm4) % 3 == 2;
+}
+
+static bool packed_next_is_reg_value1(struct si_pm4_state *state)
+{
+   return (state->ndw - state->last_pm4) % 3 == 1;
+}
+
+static bool packed_prev_is_reg_value0(struct si_pm4_state *state)
+{
+   return packed_next_is_reg_value1(state);
+}
+
+/* Return the 16-bit dword offset of the index-th register in the current packed packet. */
+static unsigned get_packed_reg_dw_offsetN(struct si_pm4_state *state, unsigned index)
+{
+   unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
+   assert(i < state->ndw);
+   return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
+}
+
+/* Return the position in pm4[] that holds the value of the index-th register. */
+static unsigned get_packed_reg_valueN_idx(struct si_pm4_state *state, unsigned index)
+{
+   unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
+   assert(i < state->ndw);
+   return i;
+}
+
+/* Return the value of the index-th register in the current packed packet. */
+static unsigned get_packed_reg_valueN(struct si_pm4_state *state, unsigned index)
+{
+   return state->pm4[get_packed_reg_valueN_idx(state, index)];
+}
+
+/* Return the number of register slots in the current packed packet, including padding. */
+static unsigned get_packed_reg_count(struct si_pm4_state *state)
+{
+   int body_size = state->ndw - state->last_pm4 - 2;
+   assert(body_size > 0 && body_size % 3 == 0);
+   return (body_size / 3) * 2;
+}
+
+/* Finish the last packet of the state. A packed SET packet that only sets consecutive
+ * registers is rewritten as a plain SET packet. Any other packed packet gets
+ * RESET_FILTER_CAM on the gfx queue, and SET_SH packets with at most 14 registers
+ * switch to the *_N opcode. Nothing happens if the last packet is not packed.
+ */
+void si_pm4_finalize(struct si_pm4_state *state)
+{
+   if (opcode_is_packed(state->last_opcode)) {
+      unsigned reg_count = get_packed_reg_count(state);
+      unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
+
+      if (state->packed_is_padded)
+         reg_count--;
+
+      bool all_consecutive = true;
+
+      /* If the whole packed SET packet only sets consecutive registers, rewrite the packet
+       * to be unpacked to make it shorter.
+       *
+       * This also eliminates the invalid scenario when the packed SET packet sets only
+       * 2 registers and the register offsets are equal due to padding.
+       */
+      for (unsigned i = 1; i < reg_count; i++) {
+         if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
+            all_consecutive = false;
+            break;
+         }
+      }
+
+      if (all_consecutive) {
+         assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
+         state->pm4[state->last_pm4] = PKT3(packed_opcode_to_unpacked(state->last_opcode),
+                                            reg_count, 0);
+         state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
+         for (unsigned i = 0; i < reg_count; i++)
+            state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
+         state->ndw = state->last_pm4 + 2 + reg_count;
+         /* Track the opcode that the rewritten packet really has. */
+         state->last_opcode = packed_opcode_to_unpacked(state->last_opcode);
+      } else {
+         /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
+         if (!state->is_compute_queue)
+            state->pm4[state->last_pm4] |= PKT3_RESET_FILTER_CAM_S(1);
+
+         /* If it's a packed SET_SH packet, use the *_N variant when possible. */
+         if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
+            state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
+            state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
+         }
+
+      }
+   }
+}
+
 static void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
 {
+   si_pm4_finalize(state);
+
    assert(state->max_dw);
    assert(state->ndw < state->max_dw);
    assert(opcode <= 254);
    state->last_opcode = opcode;
    state->last_pm4 = state->ndw++;
+   state->packed_is_padded = false;
 }
 
 void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
@@ -31,17 +185,36 @@ static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
    unsigned count;
    count = state->ndw - state->last_pm4 - 2;
    state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);
+
+   if (opcode_is_packed(state->last_opcode)) {
+      if (packed_prev_is_reg_value0(state)) {
+         /* Duplicate the first register at the end to make the number of registers aligned to 2. */
+         si_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
+                               get_packed_reg_valueN(state, 0),
+                               state->last_opcode, 0);
+         state->packed_is_padded = true;
+      }
+
+      state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
+   }
 }
 
 static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
                                   unsigned opcode, unsigned idx)
 {
+   bool is_packed = opcode_is_packed(opcode);
    reg >>= 2;
 
    assert(state->max_dw);
    assert(state->ndw + 2 <= state->max_dw);
 
-   if (opcode != state->last_opcode || reg != (state->last_reg + 1) || idx != state->last_idx) {
+   if (is_packed) {
+      if (opcode != state->last_opcode) {
+         si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
+         state->ndw++; /* reserve space for the register count, it will be set at the end */
+      }
+   } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
+              idx != state->last_idx) {
       si_pm4_cmd_begin(state, opcode);
       state->pm4[state->ndw++] = reg | (idx << 28);
    }
@@ -49,6 +222,25 @@ static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint
    assert(reg <= UINT16_MAX);
    state->last_reg = reg;
    state->last_idx = idx;
+
+   if (is_packed) {
+      if (state->packed_is_padded) {
+         /* The packet is padded, which means the first register is written redundantly again
+          * at the end. Remove it, so that we can replace it with this register.
+          */
+         state->packed_is_padded = false;
+         state->ndw--;
+      }
+
+      if (packed_next_is_reg_offset_pair(state)) {
+         state->pm4[state->ndw++] = reg;
+      } else if (packed_next_is_reg_value1(state)) {
+         /* Set the second register offset in the high 16 bits. */
+         state->pm4[state->ndw - 2] &= 0x0000ffff;
+         state->pm4[state->ndw - 2] |= reg << 16;
+      }
+   }
+
    state->pm4[state->ndw++] = val;
    si_pm4_cmd_end(state, false);
 }
@@ -78,6 +270,8 @@ void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
       return;
    }
 
+   opcode = unpacked_opcode_to_packed(state, opcode);
+
    si_pm4_set_reg_custom(state, reg, val, opcode, 0);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.h b/src/gallium/drivers/radeonsi/si_pm4.h
index f30e8a3..4f91d49 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_pm4.h
@@ -34,6 +34,7 @@ struct si_pm4_state {
    uint8_t last_opcode;
    uint8_t last_idx;
    bool is_compute_queue;
+   bool packed_is_padded; /* whether SET_*_REG_PAIRS_PACKED is padded to an even number of regs */
 
    /* For shader states only */
    bool is_shader;
@@ -53,6 +54,7 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
 void si_pm4_set_reg_va(struct si_pm4_state *state, unsigned reg, uint32_t val);
 void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val);
+void si_pm4_finalize(struct si_pm4_state *state);
 
 void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
                         bool is_compute_queue);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 6ee32a4..2cd7607 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -636,6 +636,7 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
    }
 
    si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
+   si_pm4_finalize(pm4);
    return blend;
 }
 
@@ -1105,6 +1106,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
                      S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA));
    }
 
+   si_pm4_finalize(pm4);
    if (!rs->uses_poly_offset)
       return rs;
 
@@ -1148,6 +1150,7 @@ static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rast
       si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
       si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
       si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
+      si_pm4_finalize(pm4);
    }
 
    return rs;
@@ -1393,6 +1396,7 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
       si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth_bounds_min));
       si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth_bounds_max));
    }
+   si_pm4_finalize(pm4);
 
    dsa->depth_enabled = state->depth_enabled;
    dsa->depth_write_enabled = state->depth_enabled && state->depth_writemask;
@@ -5770,6 +5774,7 @@ static void gfx6_init_gfx_preamble_state(struct si_context *sctx, bool uses_reg_
    }
 
 done:
+   si_pm4_finalize(pm4);
    sctx->cs_preamble_state = pm4;
    sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */
 }
@@ -5815,6 +5820,7 @@ static void cdna_init_compute_preamble_state(struct si_context *sctx)
                      S_030E04_ADDRESS(border_color_va >> 40));
    }
 
+   si_pm4_finalize(pm4);
    sctx->cs_preamble_state = pm4;
    sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */
 }
@@ -6111,6 +6117,7 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx, bool uses_reg
    }
 
 done:
+   si_pm4_finalize(pm4);
    sctx->cs_preamble_state = pm4;
    sctx->cs_preamble_state_tmz = si_pm4_clone(pm4); /* Make a copy of the preamble for TMZ. */
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 0c951a7..bdb75c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -379,6 +379,7 @@ static bool si_update_shaders(struct si_context *sctx)
                   si_pm4_set_reg(&pipeline->pm4, reg, va_low);
                }
             }
+            si_pm4_finalize(&pipeline->pm4);
 
             sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
             _mesa_hash_table_u64_insert(sctx->sqtt->pipeline_bos,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
index 777aa16..3cb663e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -694,6 +694,7 @@ static void si_shader_ls(struct si_screen *sscreen, struct si_shader *shader)
                           S_00B528_FLOAT_MODE(shader->config.float_mode);
    shader->config.rsrc2 = S_00B52C_USER_SGPR(si_get_num_vs_user_sgprs(shader, SI_VS_NUM_USER_SGPR)) |
                           S_00B52C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0);
+   si_pm4_finalize(pm4);
 }
 
 static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
@@ -745,6 +746,8 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
 
    if (sscreen->info.gfx_level <= GFX8)
       si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS, shader->config.rsrc2);
+
+   si_pm4_finalize(pm4);
 }
 
 static void si_emit_shader_es(struct si_context *sctx)
@@ -811,6 +814,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
       si_set_tesseval_regs(sscreen, shader->selector, shader);
 
    polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
+   si_pm4_finalize(pm4);
 }
 
 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
@@ -1124,6 +1128,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
                         S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
                            S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
    }
+   si_pm4_finalize(pm4);
 }
 
 bool gfx10_is_ngg_passthrough(struct si_shader *shader)
@@ -1499,6 +1504,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                            S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) |
                            S_028B54_GS_W32_EN(shader->wave_size == 32) |
                            S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+
+   si_pm4_finalize(pm4);
 }
 
 static void si_emit_shader_vs(struct si_context *sctx)
@@ -1704,6 +1711,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
       si_set_tesseval_regs(sscreen, shader->selector, shader);
 
    polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader);
+   si_pm4_finalize(pm4);
 }
 
 static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
@@ -1949,6 +1957,7 @@ static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
                   S_00B02C_EXTRA_LDS_SIZE(shader->config.lds_size) |
                   S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
                   S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
+   si_pm4_finalize(pm4);
 }
 
 static void si_shader_init_pm4_state(struct si_screen *sscreen, struct si_shader *shader)
@@ -3696,6 +3705,7 @@ static void si_cs_preamble_add_vgt_flush(struct si_context *sctx, bool tmz)
    /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
    si_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
    si_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+   si_pm4_finalize(pm4);
 
    *has_vgt_flush = true;
 }
@@ -3857,6 +3867,7 @@ bool si_update_gs_ring_buffers(struct si_context *sctx)
          si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
                         sctx->gsvs_ring ? sctx->gsvs_ring->width0 / 256 : 0);
    }
+   si_pm4_finalize(pm4);
 
    if (old_ndw) {
       pm4->ndw = old_ndw;
@@ -4126,6 +4137,7 @@ void si_init_tess_factor_ring(struct si_context *sctx)
       si_pm4_set_reg(pm4, R_0089B8_VGT_TF_MEMORY_BASE, factor_va >> 8);
       si_pm4_set_reg(pm4, R_0089B0_VGT_HS_OFFCHIP_PARAM, sctx->screen->hs.hs_offchip_param);
    }
+   si_pm4_finalize(pm4);
 }
 
 /* Flush the context to re-emit the cs_preamble state.
-- 
2.7.4