anv: use COMPUTE_WALKER post sync field to track compute work
author Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fri, 19 May 2023 14:01:23 +0000 (17:01 +0300)
committer Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Wed, 24 May 2023 06:09:01 +0000 (09:09 +0300)
This is more accurate than PIPE_CONTROL as it won't introduce stalls
between the compute dispatches.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Felix DeGrood <felix.j.degrood@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131>

src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_genX.h
src/intel/vulkan/anv_measure.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/anv_utrace.c
src/intel/vulkan/genX_cmd_buffer.c

index eb75f0a..ddab251 100644 (file)
@@ -79,6 +79,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
 {
    anv_cmd_state_finish(cmd_buffer);
    anv_cmd_state_init(cmd_buffer);
+
+   cmd_buffer->last_compute_walker = NULL;
 }
 
 static VkResult
@@ -136,6 +138,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool,
    cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
    cmd_buffer->generation_bt_state = ANV_STATE_NULL;
 
+   cmd_buffer->last_compute_walker = NULL;
+
    anv_cmd_state_init(cmd_buffer);
 
    anv_measure_init(cmd_buffer);
index 30a323d..c4bce5e 100644 (file)
@@ -172,7 +172,8 @@ void genX(blorp_exec)(struct blorp_batch *batch,
 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                               struct anv_device *device,
                               struct anv_address addr,
-                              enum anv_timestamp_capture_type);
+                              enum anv_timestamp_capture_type type,
+                              void *data);
 
 void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch,
                                          struct anv_device *device,
index 088306a..10210dc 100644 (file)
@@ -139,7 +139,8 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
                                  (struct anv_address) {
                                     .bo = measure->bo,
                                     .offset = index * sizeof(uint64_t) },
-                                 ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
+                                 ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+                                 NULL);
 
    struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
    memset(snapshot, 0, sizeof(*snapshot));
@@ -183,7 +184,8 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
                                  (struct anv_address) {
                                     .bo = measure->bo,
                                     .offset = index * sizeof(uint64_t) },
-                                 ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
+                                 ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+                                 NULL);
 
    struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
    memset(snapshot, 0, sizeof(*snapshot));
index 3ba559a..080c750 100644 (file)
@@ -843,6 +843,7 @@ enum anv_timestamp_capture_type {
     ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
     ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
     ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+    ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
 };
 
 struct anv_physical_device {
@@ -962,7 +963,8 @@ struct anv_physical_device {
     int64_t                                     master_minor;
     struct intel_query_engine_info *            engine_info;
 
-    void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type);
+    void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
+                               enum anv_timestamp_capture_type, void *);
     struct intel_measure_device                 measure_device;
 };
 
@@ -2847,6 +2849,13 @@ struct anv_cmd_buffer {
     */
    struct u_trace                               trace;
 
+   /** Pointer to the last emitted COMPUTE_WALKER.
+    *
+    * This is used to edit the instruction post emission to replace the "Post
+    * Sync" field for utrace timestamp emission.
+    */
+   void                                        *last_compute_walker;
+
    struct {
       struct anv_video_session *vid;
       struct anv_video_session_params *params;
@@ -4436,6 +4445,11 @@ struct anv_utrace_submit {
    /* Buffer of 64bits timestamps (only used for timestamp copies) */
    struct anv_bo *trace_bo;
 
+   /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
+    * timestamps)
+    */
+   uint64_t last_full_timestamp;
+
    /* Memcpy state tracking (only used for timestamp copies) */
    struct anv_memcpy_state memcpy_state;
 };
index 393cc28..1bf45d0 100644 (file)
 
 #include "vulkan/runtime/vk_common_entrypoints.h"
 
+/** Timestamp structure format: one slot of the utrace timestamp buffer */
+union anv_utrace_timestamp {
+   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
+    * PIPE_CONTROL.
+    */
+   uint64_t timestamp;
+
+   /* Timestamp written by COMPUTE_WALKER::PostSync
+    *
+    * Layout is described in PRMs.
+    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
+    *
+    *    "The timestamp layout :
+    *        [0] = 32b Context Timestamp Start
+    *        [1] = 32b Global Timestamp Start
+    *        [2] = 32b Context Timestamp End
+    *        [3] = 32b Global Timestamp End"
+    */
+   uint32_t compute_walker[4];
+};
+
 static uint32_t
 command_buffers_count_utraces(struct anv_device *device,
                               uint32_t cmd_buffer_count,
@@ -88,7 +109,8 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx,
       .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
 
    anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
-                                           to_addr, from_addr, count * sizeof(uint64_t));
+                                          to_addr, from_addr,
+                                          count * sizeof(union anv_utrace_timestamp));
 }
 
 VkResult
@@ -162,6 +184,7 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
          }
       }
       anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
+
       anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
 
       u_trace_flush(&submit->ds.trace, submit, true);
@@ -203,13 +226,19 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
    struct anv_device *device =
       container_of(utctx, struct anv_device, ds.trace_context);
 
+   uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
+      sizeof(union anv_utrace_timestamp);
+
    struct anv_bo *bo = NULL;
    UNUSED VkResult result =
       anv_bo_pool_alloc(&device->utrace_bo_pool,
-                        align(size_b, 4096),
+                        align(anv_ts_size_b, 4096),
                         &bo);
    assert(result == VK_SUCCESS);
 
+   memset(bo->map, 0, bo->size);
+   intel_clflush_range(bo->map, bo->size);
+
    return bo;
 }
 
@@ -230,19 +259,30 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs,
 {
    struct anv_device *device =
       container_of(ut->utctx, struct anv_device, ds.trace_context);
-   struct anv_batch *batch =
-      cs != NULL ? cs :
-      &container_of(ut, struct anv_cmd_buffer, trace)->batch;
+   struct anv_cmd_buffer *cmd_buffer =
+      container_of(ut, struct anv_cmd_buffer, trace);
+   /* cmd_buffer is only valid if cs == NULL */
+   struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
    struct anv_bo *bo = timestamps;
 
-   enum anv_timestamp_capture_type capture_type =
-      (end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE
-                    : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
-   device->physical->cmd_emit_timestamp(batch, device,
-                                        (struct anv_address) {
-                                           .bo = bo,
-                                           .offset = idx * sizeof(uint64_t) },
-                                        capture_type);
+   struct anv_address ts_address = (struct anv_address) {
+      .bo = bo,
+      .offset = idx * sizeof(union anv_utrace_timestamp)
+   };
+
+   /* Is this an end of compute trace point? */
+   const bool is_end_compute =
+      (cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe);
+
+   enum anv_timestamp_capture_type capture_type = end_of_pipe ?
+      is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
+      ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+   device->physical->cmd_emit_timestamp(batch, device, ts_address,
+                                        capture_type,
+                                        is_end_compute ?
+                                        cmd_buffer->last_compute_walker : NULL);
+   if (is_end_compute)
+         cmd_buffer->last_compute_walker = NULL;
 }
 
 static uint64_t
@@ -265,13 +305,30 @@ anv_utrace_read_ts(struct u_trace_context *utctx,
       assert(result == VK_SUCCESS);
    }
 
-   uint64_t *ts = bo->map;
+   union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
 
    /* Don't translate the no-timestamp marker: */
-   if (ts[idx] == U_TRACE_NO_TIMESTAMP)
+   if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
       return U_TRACE_NO_TIMESTAMP;
 
-   return intel_device_info_timebase_scale(device->info, ts[idx]);
+   /* Detect a 16-byte timestamp write */
+   if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
+      /* The timestamp written by COMPUTE_WALKER::PostSync only has 32bits.
+       * We need to rebuild the full 64bits using the previous timestamp. We
+       * assume that utrace is reading the timestamps in order. Anyway, 32bit
+       * timestamps only roll over every few minutes, so in most cases that
+       * should be correct.
+       */
+      uint64_t timestamp =
+         (submit->last_full_timestamp & 0xffffffff00000000) |
+         (uint64_t) ts[idx].compute_walker[3];
+
+      return intel_device_info_timebase_scale(device->info, timestamp);
+   }
+
+   submit->last_full_timestamp = ts[idx].timestamp;
+
+   return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
 }
 
 void
index 505a7a7..54c9ff1 100644 (file)
@@ -5659,37 +5659,42 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
    const struct brw_cs_dispatch_info dispatch =
       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
 
-   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
-      cw.IndirectParameterEnable        = indirect;
-      cw.PredicateEnable                = predicate;
-      cw.SIMDSize                       = dispatch.simd_size / 16;
-      cw.IndirectDataStartAddress       = comp_state->push_data.offset;
-      cw.IndirectDataLength             = comp_state->push_data.alloc_size;
-      cw.LocalXMaximum                  = prog_data->local_size[0] - 1;
-      cw.LocalYMaximum                  = prog_data->local_size[1] - 1;
-      cw.LocalZMaximum                  = prog_data->local_size[2] - 1;
-      cw.ThreadGroupIDXDimension        = groupCountX;
-      cw.ThreadGroupIDYDimension        = groupCountY;
-      cw.ThreadGroupIDZDimension        = groupCountZ;
-      cw.ExecutionMask                  = dispatch.right_mask;
-      cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);
+   cmd_buffer->last_compute_walker =
+      anv_batch_emitn(
+         &cmd_buffer->batch,
+         GENX(COMPUTE_WALKER_length),
+         GENX(COMPUTE_WALKER),
+         .IndirectParameterEnable        = indirect,
+         .PredicateEnable                = predicate,
+         .SIMDSize                       = dispatch.simd_size / 16,
+         .IndirectDataStartAddress       = comp_state->push_data.offset,
+         .IndirectDataLength             = comp_state->push_data.alloc_size,
+         .LocalXMaximum                  = prog_data->local_size[0] - 1,
+         .LocalYMaximum                  = prog_data->local_size[1] - 1,
+         .LocalZMaximum                  = prog_data->local_size[2] - 1,
+         .ThreadGroupIDXDimension        = groupCountX,
+         .ThreadGroupIDYDimension        = groupCountY,
+         .ThreadGroupIDZDimension        = groupCountZ,
+         .ExecutionMask                  = dispatch.right_mask,
+         .PostSync                       = {
+            .MOCS                        = anv_mocs(pipeline->base.device, NULL, 0),
+         },
 
-      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
-         .KernelStartPointer = cs_bin->kernel.offset,
-         .SamplerStatePointer =
-            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
-         .BindingTablePointer =
-            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
-         /* Typically set to 0 to avoid prefetching on every thread dispatch. */
-         .BindingTableEntryCount = devinfo->verx10 == 125 ?
-            0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
-         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
-         .SharedLocalMemorySize = encode_slm_size(GFX_VER,
-                                                  prog_data->base.total_shared),
-         .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
-         .NumberOfBarriers = prog_data->uses_barrier,
-      };
-   }
+         .InterfaceDescriptor            = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+            .KernelStartPointer                = cs_bin->kernel.offset,
+            .SamplerStatePointer               = cmd_buffer->state.samplers[
+               MESA_SHADER_COMPUTE].offset,
+            .BindingTablePointer               = cmd_buffer->state.binding_tables[
+               MESA_SHADER_COMPUTE].offset,
+            /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+            .BindingTableEntryCount            = devinfo->verx10 == 125 ?
+               0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
+            .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+            .SharedLocalMemorySize             = encode_slm_size(
+               GFX_VER, prog_data->base.total_shared),
+            .PreferredSLMAllocationSize        = preferred_slm_allocation_size(devinfo),
+            .NumberOfBarriers                  = prog_data->uses_barrier,
+         });
 }
 
 #else /* #if GFX_VERx10 >= 125 */
@@ -8067,7 +8072,8 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
                               struct anv_device *device,
                               struct anv_address addr,
-                              enum anv_timestamp_capture_type type) {
+                              enum anv_timestamp_capture_type type,
+                              void *data) {
    switch (type) {
    case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
       struct mi_builder b;
@@ -8077,6 +8083,7 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
    }
 
    case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
+
       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
          pc.PostSyncOperation   = WriteTimestamp;
          pc.Address             = addr;
@@ -8093,6 +8100,24 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch,
       }
       break;
 
+#if GFX_VERx10 >= 125
+   case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
+      uint32_t dwords[GENX(COMPUTE_WALKER_length)];
+
+      GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
+            .PostSync = (struct GENX(POSTSYNC_DATA)) {
+               .Operation = WriteTimestamp,
+               .DestinationAddress = addr,
+               .MOCS = anv_mocs(device, NULL, 0),
+            },
+         });
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+         ((uint32_t *)data)[i] |= dwords[i];
+      break;
+   }
+#endif
+
    default:
       unreachable("invalid");
    }