From: Marek Olšák
Date: Sun, 27 Dec 2020 04:14:01 +0000 (-0500)
Subject: radeonsi: add a specialized function for CP DMA L2 prefetch
X-Git-Tag: upstream/21.2.3~9370
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=961aa67adf651ddb2b035a2ad5608db84fdbd258;p=platform%2Fupstream%2Fmesa.git

radeonsi: add a specialized function for CP DMA L2 prefetch

This radically simplifies the code to decrease CPU overhead in si_draw_vbo.
The generic CP DMA copy function is too complicated.

Acked-by: Pierre-Eric Pelloux-Prayer
Part-of:
---

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 4289e83..f3f948d 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -900,7 +900,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
 
    /* Prefetch the compute shader to L2. */
    if (sctx->chip_class >= GFX7 && prefetch)
-      cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
+      si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
 
    if (program->ir_type != PIPE_SHADER_IR_NATIVE)
       si_setup_nir_user_data(sctx, info);
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index f8e483d..7945143 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -399,6 +399,44 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    }
 }
 
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+                        unsigned offset, unsigned size)
+{
+   uint64_t address = si_resource(buf)->gpu_address + offset;
+
+   assert(sctx->chip_class >= GFX7);
+
+   /* The prefetch address and size must be aligned, so that we don't have to apply
+    * the complicated hw bug workaround.
+    *
+    * The size should also be less than 2 MB, so that we don't have to use a loop.
+    * Callers shouldn't need to prefetch more than 2 MB.
+    */
+   assert(size % SI_CPDMA_ALIGNMENT == 0);
+   assert(address % SI_CPDMA_ALIGNMENT == 0);
+   assert(size < S_414_BYTE_COUNT_GFX6(~0u));
+
+   uint32_t header = S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
+   uint32_t command = S_414_BYTE_COUNT_GFX6(size);
+
+   if (sctx->chip_class >= GFX9) {
+      command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+      header |= S_411_DST_SEL(V_411_NOWHERE);
+   } else {
+      command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+      header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
+   }
+
+   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+   radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+   radeon_emit(cs, header);
+   radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */
+   radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */
+   radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */
+   radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */
+   radeon_emit(cs, command);
+}
+
 void si_test_gds(struct si_context *sctx)
 {
    struct pipe_context *ctx = &sctx->b;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1fc7d0d..0623ad3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1401,6 +1401,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                            struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                            unsigned size, unsigned user_flags, enum si_coherency coher,
                            enum si_cache_policy cache_policy);
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+                        unsigned offset, unsigned size);
 void si_test_gds(struct si_context *sctx);
 void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
                       unsigned size, unsigned dst_sel, unsigned engine, const void *data);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 63ede1d..a293787 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -587,8 +587,6 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs);
 bool si_update_ngg(struct si_context *sctx);
 
 /* si_state_draw.c */
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
-                              unsigned size);
 void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
 void si_trace_emit(struct si_context *sctx);
 void si_init_draw_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c180e1d..9731780 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -59,20 +59,11 @@ static unsigned si_conv_pipe_prim(unsigned mode)
    return prim_conv[mode];
 }
 
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
-                              unsigned size)
-{
-   assert(sctx->chip_class >= GFX7);
-
-   si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL,
-                         SI_COHERENCY_SHADER, L2_LRU);
-}
-
 static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)
 {
    struct pipe_resource *bo = &state->shader->bo->b.b;
 
-   cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+   si_cp_dma_prefetch(sctx, bo, 0, bo->width0);
 }
 
 static void si_prefetch_VBO_descriptors(struct si_context *sctx)
@@ -80,7 +71,7 @@ static void si_prefetch_VBO_descriptors(struct si_context *sctx)
    if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
       return;
 
-   cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+   si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
                             sctx->vertex_elements->vb_desc_list_alloc_size);
 }
 