{
anv_cmd_state_finish(cmd_buffer);
anv_cmd_state_init(cmd_buffer);
+
+ cmd_buffer->last_compute_walker = NULL;
}
static VkResult
cmd_buffer->generation_return_addr = ANV_NULL_ADDRESS;
cmd_buffer->generation_bt_state = ANV_STATE_NULL;
+ cmd_buffer->last_compute_walker = NULL;
+
anv_cmd_state_init(cmd_buffer);
anv_measure_init(cmd_buffer);
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device,
struct anv_address addr,
- enum anv_timestamp_capture_type);
+ enum anv_timestamp_capture_type type,
+ void *data);
void genX(batch_emit_dummy_post_sync_op)(struct anv_batch *batch,
struct anv_device *device,
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
- ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
+ ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
(struct anv_address) {
.bo = measure->bo,
.offset = index * sizeof(uint64_t) },
- ANV_TIMESTAMP_CAPTURE_AT_CS_STALL);
+ ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+ ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
};
struct anv_physical_device {
int64_t master_minor;
struct intel_query_engine_info * engine_info;
- void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address, enum anv_timestamp_capture_type);
+ void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
+ enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device;
};
*/
struct u_trace trace;
+ /** Pointer to the last emitted COMPUTE_WALKER.
+ *
+ * This is used to edit the instruction post emission to replace the "Post
+ * Sync" field for utrace timestamp emission.
+ */
+ void *last_compute_walker;
+
struct {
struct anv_video_session *vid;
struct anv_video_session_params *params;
/* Buffer of 64bits timestamps (only used for timestamp copies) */
struct anv_bo *trace_bo;
+ /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
+ * timestamps)
+ */
+ uint64_t last_full_timestamp;
+
/* Memcpy state tracking (only used for timestamp copies) */
struct anv_memcpy_state memcpy_state;
};
#include "vulkan/runtime/vk_common_entrypoints.h"
+/** Timestamp structure format */
+union anv_utrace_timestamp {
+ /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
+ * PIPE_CONTROL.
+ */
+ uint64_t timestamp;
+
+ /* Timestamp written by COMPUTE_WALKER::PostSync
+ *
+ * Layout is described in PRMs.
+ * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
+ *
+ * "The timestamp layout :
+ * [0] = 32b Context Timestamp Start
+ * [1] = 32b Global Timestamp Start
+ * [2] = 32b Context Timestamp End
+ * [3] = 32b Global Timestamp End"
+ */
+ uint32_t compute_walker[4];
+};
+
static uint32_t
command_buffers_count_utraces(struct anv_device *device,
uint32_t cmd_buffer_count,
.bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
- to_addr, from_addr, count * sizeof(uint64_t));
+ to_addr, from_addr,
+ count * sizeof(union anv_utrace_timestamp));
}
VkResult
}
}
anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
+
anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
u_trace_flush(&submit->ds.trace, submit, true);
struct anv_device *device =
container_of(utctx, struct anv_device, ds.trace_context);
+ uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
+ sizeof(union anv_utrace_timestamp);
+
struct anv_bo *bo = NULL;
UNUSED VkResult result =
anv_bo_pool_alloc(&device->utrace_bo_pool,
- align(size_b, 4096),
+ align(anv_ts_size_b, 4096),
&bo);
assert(result == VK_SUCCESS);
+ memset(bo->map, 0, bo->size);
+ intel_clflush_range(bo->map, bo->size);
+
return bo;
}
{
struct anv_device *device =
container_of(ut->utctx, struct anv_device, ds.trace_context);
- struct anv_batch *batch =
- cs != NULL ? cs :
- &container_of(ut, struct anv_cmd_buffer, trace)->batch;
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(ut, struct anv_cmd_buffer, trace);
+ /* cmd_buffer is only valid if cs == NULL */
+ struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
struct anv_bo *bo = timestamps;
- enum anv_timestamp_capture_type capture_type =
- (end_of_pipe) ? ANV_TIMESTAMP_CAPTURE_END_OF_PIPE
- : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
- device->physical->cmd_emit_timestamp(batch, device,
- (struct anv_address) {
- .bo = bo,
- .offset = idx * sizeof(uint64_t) },
- capture_type);
+ struct anv_address ts_address = (struct anv_address) {
+ .bo = bo,
+ .offset = idx * sizeof(union anv_utrace_timestamp)
+ };
+
+ /* Is this an end-of-compute trace point? */
+ const bool is_end_compute =
+ (cs == NULL && cmd_buffer->last_compute_walker != NULL && end_of_pipe);
+
+ enum anv_timestamp_capture_type capture_type = end_of_pipe ?
+ is_end_compute ? ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER :
+ ANV_TIMESTAMP_CAPTURE_END_OF_PIPE : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+ device->physical->cmd_emit_timestamp(batch, device, ts_address,
+ capture_type,
+ is_end_compute ?
+ cmd_buffer->last_compute_walker : NULL);
+ if (is_end_compute)
+ cmd_buffer->last_compute_walker = NULL;
}
static uint64_t
assert(result == VK_SUCCESS);
}
- uint64_t *ts = bo->map;
+ union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
/* Don't translate the no-timestamp marker: */
- if (ts[idx] == U_TRACE_NO_TIMESTAMP)
+ if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
return U_TRACE_NO_TIMESTAMP;
- return intel_device_info_timebase_scale(device->info, ts[idx]);
+ /* Detect a 16bytes timestamp write */
+ if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
+ /* The timestamp written by COMPUTE_WALKER::PostSync only has 32bits. We
+ * need to rebuild the full 64bits using the previous timestamp. We
+ * assume that utrace is reading the timestamps in order. Anyway, a 32bit
+ * timestamp only rolls over every few minutes, so in most cases that
+ * should be correct.
+ */
+ uint64_t timestamp =
+ (submit->last_full_timestamp & 0xffffffff00000000) |
+ (uint64_t) ts[idx].compute_walker[3];
+
+ return intel_device_info_timebase_scale(device->info, timestamp);
+ }
+
+ submit->last_full_timestamp = ts[idx].timestamp;
+
+ return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
}
void
const struct brw_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
- anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
- cw.IndirectParameterEnable = indirect;
- cw.PredicateEnable = predicate;
- cw.SIMDSize = dispatch.simd_size / 16;
- cw.IndirectDataStartAddress = comp_state->push_data.offset;
- cw.IndirectDataLength = comp_state->push_data.alloc_size;
- cw.LocalXMaximum = prog_data->local_size[0] - 1;
- cw.LocalYMaximum = prog_data->local_size[1] - 1;
- cw.LocalZMaximum = prog_data->local_size[2] - 1;
- cw.ThreadGroupIDXDimension = groupCountX;
- cw.ThreadGroupIDYDimension = groupCountY;
- cw.ThreadGroupIDZDimension = groupCountZ;
- cw.ExecutionMask = dispatch.right_mask;
- cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
+ cmd_buffer->last_compute_walker =
+ anv_batch_emitn(
+ &cmd_buffer->batch,
+ GENX(COMPUTE_WALKER_length),
+ GENX(COMPUTE_WALKER),
+ .IndirectParameterEnable = indirect,
+ .PredicateEnable = predicate,
+ .SIMDSize = dispatch.simd_size / 16,
+ .IndirectDataStartAddress = comp_state->push_data.offset,
+ .IndirectDataLength = comp_state->push_data.alloc_size,
+ .LocalXMaximum = prog_data->local_size[0] - 1,
+ .LocalYMaximum = prog_data->local_size[1] - 1,
+ .LocalZMaximum = prog_data->local_size[2] - 1,
+ .ThreadGroupIDXDimension = groupCountX,
+ .ThreadGroupIDYDimension = groupCountY,
+ .ThreadGroupIDZDimension = groupCountZ,
+ .ExecutionMask = dispatch.right_mask,
+ .PostSync = {
+ .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+ },
- cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
- .KernelStartPointer = cs_bin->kernel.offset,
- .SamplerStatePointer =
- cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
- .BindingTablePointer =
- cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
- /* Typically set to 0 to avoid prefetching on every thread dispatch. */
- .BindingTableEntryCount = devinfo->verx10 == 125 ?
- 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
- .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
- .SharedLocalMemorySize = encode_slm_size(GFX_VER,
- prog_data->base.total_shared),
- .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
- .NumberOfBarriers = prog_data->uses_barrier,
- };
- }
+ .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = cs_bin->kernel.offset,
+ .SamplerStatePointer = cmd_buffer->state.samplers[
+ MESA_SHADER_COMPUTE].offset,
+ .BindingTablePointer = cmd_buffer->state.binding_tables[
+ MESA_SHADER_COMPUTE].offset,
+ /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+ .BindingTableEntryCount = devinfo->verx10 == 125 ?
+ 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
+ .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+ .SharedLocalMemorySize = encode_slm_size(
+ GFX_VER, prog_data->base.total_shared),
+ .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
+ .NumberOfBarriers = prog_data->uses_barrier,
+ });
}
#else /* #if GFX_VERx10 >= 125 */
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
struct anv_device *device,
struct anv_address addr,
- enum anv_timestamp_capture_type type) {
+ enum anv_timestamp_capture_type type,
+ void *data) {
switch (type) {
case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
struct mi_builder b;
}
case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
+
anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
pc.PostSyncOperation = WriteTimestamp;
pc.Address = addr;
}
break;
+#if GFX_VERx10 >= 125
+ case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
+ uint32_t dwords[GENX(COMPUTE_WALKER_length)];
+
+ GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
+ .PostSync = (struct GENX(POSTSYNC_DATA)) {
+ .Operation = WriteTimestamp,
+ .DestinationAddress = addr,
+ .MOCS = anv_mocs(device, NULL, 0),
+ },
+ });
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+ ((uint32_t *)data)[i] |= dwords[i];
+ break;
+ }
+#endif
+
default:
unreachable("invalid");
}