From 521c216efcc0d0292ceedb3451c5a0a1ef956083 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Fri, 19 May 2023 17:01:23 +0300 Subject: [PATCH] anv: use COMPUTE_WALKER post sync field to track compute work This is more accurate than PIPE_CONTROL as it won't introduce stalls between the compute dispatches. Signed-off-by: Lionel Landwerlin Reviewed-by: Felix DeGrood Part-of: --- src/intel/vulkan/anv_cmd_buffer.c | 4 ++ src/intel/vulkan/anv_genX.h | 3 +- src/intel/vulkan/anv_measure.c | 6 ++- src/intel/vulkan/anv_private.h | 16 ++++++- src/intel/vulkan/anv_utrace.c | 89 +++++++++++++++++++++++++++++++------- src/intel/vulkan/genX_cmd_buffer.c | 87 ++++++++++++++++++++++++------------- 6 files changed, 154 insertions(+), 51 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index eb75f0a..ddab251 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -79,6 +79,8 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer) { anv_cmd_state_finish(cmd_buffer); anv_cmd_state_init(cmd_buffer); + + cmd_buffer->last_compute_walker = NULL; } static VkResult @@ -136,6 +138,8 @@ anv_create_cmd_buffer(struct vk_command_pool *pool, cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS; cmd_buffer->generation_bt_state = ANV_STATE_NULL; + cmd_buffer->last_compute_walker = NULL; + anv_cmd_state_init(cmd_buffer); anv_measure_init(cmd_buffer); diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index 30a323d..c4bce5e 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -172,7 +172,8 @@ void genX(blorp_exec)(struct blorp_batch *batch, void genX(cmd_emit_timestamp)(struct anv_batch *batch, struct anv_device *device, struct anv_address addr, - enum anv_timestamp_capture_type); + enum anv_timestamp_capture_type type, + void *data); void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch, struct anv_device *device, diff --git 
a/src/intel/vulkan/anv_measure.c b/src/intel/vulkan/anv_measure.c index 088306a..10210dc 100644 --- a/src/intel/vulkan/anv_measure.c +++ b/src/intel/vulkan/anv_measure.c @@ -139,7 +139,8 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer, (struct anv_address) { .bo = measure->bo, .offset = index * sizeof(uint64_t) }, - ANV_TIMESTAMP_CAPTURE_AT_CS_STALL); + ANV_TIMESTAMP_CAPTURE_AT_CS_STALL, + NULL); struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); memset(snapshot, 0, sizeof(*snapshot)); @@ -183,7 +184,8 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer, (struct anv_address) { .bo = measure->bo, .offset = index * sizeof(uint64_t) }, - ANV_TIMESTAMP_CAPTURE_AT_CS_STALL); + ANV_TIMESTAMP_CAPTURE_AT_CS_STALL, + NULL); struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]); memset(snapshot, 0, sizeof(*snapshot)); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 3ba559a..080c750 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -843,6 +843,7 @@ enum anv_timestamp_capture_type { ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE, ANV_TIMESTAMP_CAPTURE_END_OF_PIPE, ANV_TIMESTAMP_CAPTURE_AT_CS_STALL, + ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER, }; struct anv_physical_device { @@ -962,7 +963,8 @@ struct anv_physical_device { int64_t master_minor; struct intel_query_engine_info * engine_info; - void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type); + void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, + enum anv_timestamp_capture_type, void *); struct intel_measure_device measure_device; }; @@ -2847,6 +2849,13 @@ struct anv_cmd_buffer { */ struct u_trace trace; + /** Pointer to the last emitted COMPUTE_WALKER. + * + * This is used to edit the instruction post emission to replace the "Post + * Sync" field for utrace timestamp emission. 
+ */ + void *last_compute_walker; + struct { struct anv_video_session *vid; struct anv_video_session_params *params; @@ -4436,6 +4445,11 @@ struct anv_utrace_submit { /* Buffer of 64bits timestamps (only used for timestamp copies) */ struct anv_bo *trace_bo; + /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit + * timestamps) + */ + uint64_t last_full_timestamp; + /* Memcpy state tracking (only used for timestamp copies) */ struct anv_memcpy_state memcpy_state; }; diff --git a/src/intel/vulkan/anv_utrace.c b/src/intel/vulkan/anv_utrace.c index 393cc28..1bf45d0 100644 --- a/src/intel/vulkan/anv_utrace.c +++ b/src/intel/vulkan/anv_utrace.c @@ -29,6 +29,27 @@ #include "vulkan/runtime/vk_common_entrypoints.h" +/** Timestamp structure format */ +union anv_utrace_timestamp { + /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or + * PIPE_CONTROL. + */ + uint64_t timestamp; + + /* Timestamp written by COMPUTE_WALKER::PostSync + * + * Layout is described in PRMs. 
+ * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA: + * + * "The timestamp layout : + * [0] = 32b Context Timestamp Start + * [1] = 32b Global Timestamp Start + * [2] = 32b Context Timestamp End + * [3] = 32b Global Timestamp End" + */ + uint32_t compute_walker[4]; +}; + static uint32_t command_buffers_count_utraces(struct anv_device *device, uint32_t cmd_buffer_count, @@ -88,7 +109,8 @@ anv_device_utrace_emit_copy_ts_buffer(struct u_trace_context *utctx, .bo = ts_to, .offset = to_offset * sizeof(uint64_t) }; anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state, - to_addr, from_addr, count * sizeof(uint64_t)); + to_addr, from_addr, + count * sizeof(union anv_utrace_timestamp)); } VkResult @@ -162,6 +184,7 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, } } anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state); + anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state); u_trace_flush(&submit->ds.trace, submit, true); @@ -203,13 +226,19 @@ anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b) struct anv_device *device = container_of(utctx, struct anv_device, ds.trace_context); + uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) * + sizeof(union anv_utrace_timestamp); + struct anv_bo *bo = NULL; UNUSED VkResult result = anv_bo_pool_alloc(&device->utrace_bo_pool, - align(size_b, 4096), + align(anv_ts_size_b, 4096), &bo); assert(result == VK_SUCCESS); + memset(bo->map, 0, bo->size); + intel_clflush_range(bo->map, bo->size); + return bo; } @@ -230,19 +259,30 @@ anv_utrace_record_ts(struct u_trace *ut, void *cs, { struct anv_device *device = container_of(ut->utctx, struct anv_device, ds.trace_context); - struct anv_batch *batch = - cs != NULL ? cs : - &container_of(ut, struct anv_cmd_buffer, trace)->batch; + struct anv_cmd_buffer *cmd_buffer = + container_of(ut, struct anv_cmd_buffer, trace); + /* cmd_buffer is only valid if cs == NULL */ + struct anv_batch *batch = cs != NULL ? 
cs : &cmd_buffer->batch; struct anv_bo *bo = timestamps; - enum anv_timestamp_capture_type capture_type = - (end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE - : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE; - device->physical->cmd_emit_timestamp(batch, device, - (struct anv_address) { - .bo = bo, - .offset = idx * sizeof(uint64_t) }, - capture_type); + struct anv_address ts_address = (struct anv_address) { + .bo = bo, + .offset = idx * sizeof(union anv_utrace_timestamp) + }; + + /* Is this an end of compute trace point? */ + const bool is_end_compute = + (cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe); + + enum anv_timestamp_capture_type capture_type = end_of_pipe ? + is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER : + ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE; + device->physical->cmd_emit_timestamp(batch, device, ts_address, + capture_type, + is_end_compute ? + cmd_buffer->last_compute_walker : NULL); + if (is_end_compute) + cmd_buffer->last_compute_walker = NULL; } static uint64_t @@ -265,13 +305,30 @@ anv_utrace_read_ts(struct u_trace_context *utctx, assert(result == VK_SUCCESS); } - uint64_t *ts = bo->map; + union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map; /* Don't translate the no-timestamp marker: */ - if (ts[idx] == U_TRACE_NO_TIMESTAMP) + if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP) return U_TRACE_NO_TIMESTAMP; - return intel_device_info_timebase_scale(device->info, ts[idx]); + /* Detect a 16bytes timestamp write */ + if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) { + /* The timestamp written by COMPUTE_WALKER::PostSync only has 32bits. We + * need to rebuild the full 64bits using the previous timestamp. We + * assume that utrace is reading the timestamp in order. Anyway + * 32bits timestamps roll over in a few minutes so in most cases that + * should be correct. 
+ */ + uint64_t timestamp = + (submit->last_full_timestamp & 0xffffffff00000000) | + (uint64_t) ts[idx].compute_walker[3]; + + return intel_device_info_timebase_scale(device->info, timestamp); + } + + submit->last_full_timestamp = ts[idx].timestamp; + + return intel_device_info_timebase_scale(device->info, ts[idx].timestamp); } void diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 505a7a7..54c9ff1 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5659,37 +5659,42 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, const struct brw_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(devinfo, prog_data, NULL); - anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { - cw.IndirectParameterEnable = indirect; - cw.PredicateEnable = predicate; - cw.SIMDSize = dispatch.simd_size / 16; - cw.IndirectDataStartAddress = comp_state->push_data.offset; - cw.IndirectDataLength = comp_state->push_data.alloc_size; - cw.LocalXMaximum = prog_data->local_size[0] - 1; - cw.LocalYMaximum = prog_data->local_size[1] - 1; - cw.LocalZMaximum = prog_data->local_size[2] - 1; - cw.ThreadGroupIDXDimension = groupCountX; - cw.ThreadGroupIDYDimension = groupCountY; - cw.ThreadGroupIDZDimension = groupCountZ; - cw.ExecutionMask = dispatch.right_mask; - cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + cmd_buffer->last_compute_walker = + anv_batch_emitn( + &cmd_buffer->batch, + GENX(COMPUTE_WALKER_length), + GENX(COMPUTE_WALKER), + .IndirectParameterEnable = indirect, + .PredicateEnable = predicate, + .SIMDSize = dispatch.simd_size / 16, + .IndirectDataStartAddress = comp_state->push_data.offset, + .IndirectDataLength = comp_state->push_data.alloc_size, + .LocalXMaximum = prog_data->local_size[0] - 1, + .LocalYMaximum = prog_data->local_size[1] - 1, + .LocalZMaximum = prog_data->local_size[2] - 1, + .ThreadGroupIDXDimension = groupCountX, + .ThreadGroupIDYDimension = groupCountY, + 
.ThreadGroupIDZDimension = groupCountZ, + .ExecutionMask = dispatch.right_mask, + .PostSync = { + .MOCS = anv_mocs(pipeline->base.device, NULL, 0), + }, - cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { - .KernelStartPointer = cs_bin->kernel.offset, - .SamplerStatePointer = - cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset, - .BindingTablePointer = - cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset, - /* Typically set to 0 to avoid prefetching on every thread dispatch. */ - .BindingTableEntryCount = devinfo->verx10 == 125 ? - 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), - .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, - .SharedLocalMemorySize = encode_slm_size(GFX_VER, - prog_data->base.total_shared), - .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), - .NumberOfBarriers = prog_data->uses_barrier, - }; - } + .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .KernelStartPointer = cs_bin->kernel.offset, + .SamplerStatePointer = cmd_buffer->state.samplers[ + MESA_SHADER_COMPUTE].offset, + .BindingTablePointer = cmd_buffer->state.binding_tables[ + MESA_SHADER_COMPUTE].offset, + /* Typically set to 0 to avoid prefetching on every thread dispatch. */ + .BindingTableEntryCount = devinfo->verx10 == 125 ? 
+ 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), + .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, + .SharedLocalMemorySize = encode_slm_size( + GFX_VER, prog_data->base.total_shared), + .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), + .NumberOfBarriers = prog_data->uses_barrier, + }); } #else /* #if GFX_VERx10 >= 125 */ @@ -8067,7 +8072,8 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( void genX(cmd_emit_timestamp)(struct anv_batch *batch, struct anv_device *device, struct anv_address addr, - enum anv_timestamp_capture_type type) { + enum anv_timestamp_capture_type type, + void *data) { switch (type) { case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: { struct mi_builder b; @@ -8077,6 +8083,7 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch, } case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: + anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { pc.PostSyncOperation = WriteTimestamp; pc.Address = addr; @@ -8093,6 +8100,24 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch, } break; +#if GFX_VERx10 >= 125 + case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: { + uint32_t dwords[GENX(COMPUTE_WALKER_length)]; + + GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) { + .PostSync = (struct GENX(POSTSYNC_DATA)) { + .Operation = WriteTimestamp, + .DestinationAddress = addr, + .MOCS = anv_mocs(device, NULL, 0), + }, + }); + + for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) + ((uint32_t *)data)[i] |= dwords[i]; + break; + } +#endif + default: unreachable("invalid"); } -- 2.7.4