From: Marek Olšák <marek.olsak@amd.com>
Date: Sun, 27 Dec 2020 04:14:01 +0000 (-0500)
Subject: radeonsi: add a specialized function for CP DMA L2 prefetch
X-Git-Tag: upstream/21.2.3~9370
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=961aa67adf651ddb2b035a2ad5608db84fdbd258;p=platform%2Fupstream%2Fmesa.git

radeonsi: add a specialized function for CP DMA L2 prefetch

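This radically simplifies the code to decrease CPU overhead in si_draw_vbo.
The generic CP DMA copy function (si_cp_dma_copy_buffer) is too complicated
for a plain L2 prefetch, so cik_prefetch_TC_L2_async, which wrapped it, is
replaced by si_cp_dma_prefetch, which emits a single DMA_DATA packet directly.
The new function requires the address and size to be aligned to
SI_CPDMA_ALIGNMENT and the size to be less than 2 MB.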

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8548>
---

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 4289e83..f3f948d 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -900,7 +900,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
 
    /* Prefetch the compute shader to L2. */
    if (sctx->chip_class >= GFX7 && prefetch)
-      cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
+      si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
 
    if (program->ir_type != PIPE_SHADER_IR_NATIVE)
       si_setup_nir_user_data(sctx, info);
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index f8e483d..7945143 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -399,6 +399,44 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    }
 }
 
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+                        unsigned offset, unsigned size)
+{
+   uint64_t address = si_resource(buf)->gpu_address + offset;
+
+   assert(sctx->chip_class >= GFX7);
+
+   /* The prefetch address and size must be aligned, so that we don't have to apply
+    * the complicated hw bug workaround.
+    *
+    * The size should also be less than 2 MB, so that we don't have to use a loop.
+    * Callers shouldn't need to prefetch more than 2 MB.
+    */
+   assert(size % SI_CPDMA_ALIGNMENT == 0);
+   assert(address % SI_CPDMA_ALIGNMENT == 0);
+   assert(size < S_414_BYTE_COUNT_GFX6(~0u));
+
+   uint32_t header = S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);
+   uint32_t command = S_414_BYTE_COUNT_GFX6(size);
+
+   if (sctx->chip_class >= GFX9) {
+      command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+      header |= S_411_DST_SEL(V_411_NOWHERE);
+   } else {
+      command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+      header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
+   }
+
+   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+   radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
+   radeon_emit(cs, header);
+   radeon_emit(cs, address);       /* SRC_ADDR_LO [31:0] */
+   radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */
+   radeon_emit(cs, address);       /* DST_ADDR_LO [31:0] */
+   radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */
+   radeon_emit(cs, command);
+}
+
 void si_test_gds(struct si_context *sctx)
 {
    struct pipe_context *ctx = &sctx->b;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1fc7d0d..0623ad3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1401,6 +1401,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                            struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                            unsigned size, unsigned user_flags, enum si_coherency coher,
                            enum si_cache_policy cache_policy);
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+                        unsigned offset, unsigned size);
 void si_test_gds(struct si_context *sctx);
 void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
                       unsigned size, unsigned dst_sel, unsigned engine, const void *data);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 63ede1d..a293787 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -587,8 +587,6 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs);
 bool si_update_ngg(struct si_context *sctx);
 
 /* si_state_draw.c */
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
-                              unsigned size);
 void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
 void si_trace_emit(struct si_context *sctx);
 void si_init_draw_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c180e1d..9731780 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -59,20 +59,11 @@ static unsigned si_conv_pipe_prim(unsigned mode)
    return prim_conv[mode];
 }
 
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
-                              unsigned size)
-{
-   assert(sctx->chip_class >= GFX7);
-
-   si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL,
-                         SI_COHERENCY_SHADER, L2_LRU);
-}
-
 static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)
 {
    struct pipe_resource *bo = &state->shader->bo->b.b;
 
-   cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+   si_cp_dma_prefetch(sctx, bo, 0, bo->width0);
 }
 
 static void si_prefetch_VBO_descriptors(struct si_context *sctx)
@@ -80,7 +71,7 @@ static void si_prefetch_VBO_descriptors(struct si_context *sctx)
    if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
       return;
 
-   cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+   si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
                             sctx->vertex_elements->vb_desc_list_alloc_size);
 }