radeonsi,radv: do not overallocate the SQTT buffer size
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Fri, 26 Feb 2021 14:19:25 +0000 (15:19 +0100)
committer Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 1 Mar 2021 12:13:36 +0000 (13:13 +0100)
The number of shader engines isn't always 4.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9307>

src/amd/common/ac_sqtt.c
src/amd/common/ac_sqtt.h
src/amd/vulkan/radv_sqtt.c
src/gallium/drivers/radeonsi/si_sqtt.c

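diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c
index 863dfa2..1f8bda5 100644
--- a/src/amd/common/ac_sqtt.c
+++ b/src/amd/common/ac_sqtt.c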
@@ -35,11 +35,13 @@ ac_thread_trace_get_info_offset(unsigned se)
 }
 
 uint64_t
-ac_thread_trace_get_data_offset(struct ac_thread_trace_data *data, unsigned se)
+ac_thread_trace_get_data_offset(const struct radeon_info *rad_info,
+                                const struct ac_thread_trace_data *data, unsigned se)
 {
+   unsigned max_se = rad_info->max_se;
    uint64_t data_offset;
 
-   data_offset = align64(sizeof(struct ac_thread_trace_info) * 4,
+   data_offset = align64(sizeof(struct ac_thread_trace_info) * max_se,
                1 << SQTT_BUFFER_ALIGN_SHIFT);
    data_offset += data->buffer_size * se;
 
@@ -53,9 +55,10 @@ ac_thread_trace_get_info_va(uint64_t va, unsigned se)
 }
 
 uint64_t
-ac_thread_trace_get_data_va(struct ac_thread_trace_data *data, uint64_t va, unsigned se)
+ac_thread_trace_get_data_va(const struct radeon_info *rad_info,
+                            const struct ac_thread_trace_data *data, uint64_t va, unsigned se)
 {
-   return va + ac_thread_trace_get_data_offset(data, se);
+   return va + ac_thread_trace_get_data_offset(rad_info, data, se);
 }
 
 bool
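diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h
index 4b8e282..267b609 100644
--- a/src/amd/common/ac_sqtt.h
+++ b/src/amd/common/ac_sqtt.h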
@@ -78,12 +78,14 @@ uint64_t
 ac_thread_trace_get_info_offset(unsigned se);
 
 uint64_t
-ac_thread_trace_get_data_offset(struct ac_thread_trace_data *data, unsigned se);
+ac_thread_trace_get_data_offset(const struct radeon_info *rad_info,
+                                const struct ac_thread_trace_data *data, unsigned se);
 uint64_t
 ac_thread_trace_get_info_va(uint64_t va, unsigned se);
 
 uint64_t
-ac_thread_trace_get_data_va(struct ac_thread_trace_data *data, uint64_t va, unsigned se);
+ac_thread_trace_get_data_va(const struct radeon_info *rad_info,
+                            const struct ac_thread_trace_data *data, uint64_t va, unsigned se);
 
 bool
 ac_is_thread_trace_complete(struct radeon_info *rad_info, const struct ac_thread_trace_info *info);
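diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c
index ed7e5df..4bfdc4b 100644
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c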
@@ -36,13 +36,14 @@ radv_emit_thread_trace_start(struct radv_device *device,
                             uint32_t queue_family_index)
 {
        uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
-       unsigned max_se = device->physical_device->rad_info.max_se;
+       struct radeon_info *rad_info = &device->physical_device->rad_info;
+       unsigned max_se = rad_info->max_se;
 
        assert(device->physical_device->rad_info.chip_class >= GFX8);
 
        for (unsigned se = 0; se < max_se; se++) {
                uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
-               uint64_t data_va = ac_thread_trace_get_data_va(&device->thread_trace, va, se);
+               uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se);
                uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
                int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);
 
@@ -400,6 +401,7 @@ radv_emit_wait_for_idle(struct radv_device *device,
 static bool
 radv_thread_trace_init_bo(struct radv_device *device)
 {
+       unsigned max_se = device->physical_device->rad_info.max_se;
        struct radeon_winsys *ws = device->ws;
        uint64_t size;
 
@@ -409,10 +411,10 @@ radv_thread_trace_init_bo(struct radv_device *device)
        device->thread_trace.buffer_size = align64(device->thread_trace.buffer_size,
                                                   1u << SQTT_BUFFER_ALIGN_SHIFT);
 
-       /* Compute total size of the thread trace BO for 4 SEs. */
-       size = align64(sizeof(struct ac_thread_trace_info) * 4,
+       /* Compute total size of the thread trace BO for all SEs. */
+       size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                       1 << SQTT_BUFFER_ALIGN_SHIFT);
-       size += device->thread_trace.buffer_size * 4ll;
+       size += device->thread_trace.buffer_size * (uint64_t)max_se;
 
        device->thread_trace.bo = ws->buffer_create(ws, size, 4096,
                                                    RADEON_DOMAIN_VRAM,
@@ -625,7 +627,8 @@ radv_get_thread_trace(struct radv_queue *queue,
                      struct ac_thread_trace *thread_trace)
 {
        struct radv_device *device = queue->device;
-       unsigned max_se = device->physical_device->rad_info.max_se;
+       struct radeon_info *rad_info = &device->physical_device->rad_info;
+       unsigned max_se = rad_info->max_se;
        void *thread_trace_ptr = device->thread_trace.ptr;
 
        memset(thread_trace, 0, sizeof(*thread_trace));
@@ -633,7 +636,7 @@ radv_get_thread_trace(struct radv_queue *queue,
 
        for (unsigned se = 0; se < max_se; se++) {
                uint64_t info_offset = ac_thread_trace_get_info_offset(se);
-               uint64_t data_offset = ac_thread_trace_get_data_offset(&device->thread_trace, se);
+               uint64_t data_offset = ac_thread_trace_get_data_offset(rad_info, &device->thread_trace, se);
                void *info_ptr = (uint8_t *)thread_trace_ptr + info_offset;
                void *data_ptr = (uint8_t *)thread_trace_ptr + data_offset;
                struct ac_thread_trace_info *info =
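diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c
index 8fc751b..3cca359 100644
--- a/src/gallium/drivers/radeonsi/si_sqtt.c
+++ b/src/gallium/drivers/radeonsi/si_sqtt.c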
@@ -38,6 +38,7 @@ si_emit_spi_config_cntl(struct si_context* sctx,
 static bool
 si_thread_trace_init_bo(struct si_context *sctx)
 {
+   unsigned max_se = sctx->screen->info.max_se;
    struct radeon_winsys *ws = sctx->ws;
    uint64_t size;
 
@@ -47,10 +48,10 @@ si_thread_trace_init_bo(struct si_context *sctx)
    sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                              1u << SQTT_BUFFER_ALIGN_SHIFT);
 
-   /* Compute total size of the thread trace BO for 4 SEs. */
-   size = align64(sizeof(struct ac_thread_trace_info) * 4,
+   /* Compute total size of the thread trace BO for all SEs. */
+   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                   1 << SQTT_BUFFER_ALIGN_SHIFT);
-   size += sctx->thread_trace->buffer_size * 4ll;
+   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
 
    sctx->thread_trace->bo =
       ws->buffer_create(ws, size, 4096,
@@ -77,7 +78,7 @@ si_emit_thread_trace_start(struct si_context* sctx,
 
    for (unsigned se = 0; se < max_se; se++) {
       uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
-      uint64_t data_va = ac_thread_trace_get_data_va(sctx->thread_trace, va, se);
+      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
       uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
 
       /* Target SEx and SH0. */
@@ -495,7 +496,7 @@ si_get_thread_trace(struct si_context *sctx,
 
    for (unsigned se = 0; se < max_se; se++) {
       uint64_t info_offset = ac_thread_trace_get_info_offset(se);
-      uint64_t data_offset = ac_thread_trace_get_data_offset(sctx->thread_trace, se);
+      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
       void *info_ptr = thread_trace_ptr + info_offset;
       void *data_ptr = thread_trace_ptr + data_offset;
       struct ac_thread_trace_info *info =