From 559d3b0f9ab79ffeff1aaddb3dd6b9f313d71ff6 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 26 Apr 2023 17:02:38 +0200 Subject: [PATCH] ac,radv,radeonsi: rename thread_trace to sqtt everywhere SQTT stands for SQ Thread Trace but it's shorter. Note that environment variables aren't renamed because this might break external applications. This renames: - ac_thread_trace_data to ac_sqtt (this is the main struct) - ac_thread_trace_info to ac_sqtt_data_info - ac_thread_trace_se to ac_sqtt_data_se - ac_thread_trace to ac_sqtt_trace (this contains trace only) Signed-off-by: Samuel Pitoiset Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_rgp.c | 35 +- src/amd/common/ac_rgp.h | 10 +- src/amd/common/ac_sqtt.c | 92 +- src/amd/common/ac_sqtt.h | 74 +- src/amd/vulkan/layers/radv_sqtt_layer.c | 97 +- src/amd/vulkan/radv_cmd_buffer.c | 2 +- src/amd/vulkan/radv_device.c | 20 +- src/amd/vulkan/radv_physical_device.c | 8 +- src/amd/vulkan/radv_private.h | 22 +- src/amd/vulkan/radv_sqtt.c | 230 ++-- src/gallium/drivers/radeonsi/si_blit.c | 6 +- src/gallium/drivers/radeonsi/si_clear.c | 2 +- src/gallium/drivers/radeonsi/si_compute.c | 9 +- src/gallium/drivers/radeonsi/si_fence.c | 5 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 17 +- src/gallium/drivers/radeonsi/si_pipe.c | 8 +- src/gallium/drivers/radeonsi/si_pipe.h | 12 +- src/gallium/drivers/radeonsi/si_sqtt.c | 1478 ++++++++++++------------ src/gallium/drivers/radeonsi/si_state_draw.cpp | 31 +- 19 files changed, 1041 insertions(+), 1117 deletions(-) diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index 69f98b5..86706db 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -991,22 +991,21 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace, } #if defined(USE_LIBELF) -static void ac_sqtt_dump_data(struct radeon_info *rad_info, - struct ac_thread_trace *thread_trace, - const struct ac_spm_trace *spm_trace, - FILE *output) +static void +ac_sqtt_dump_data(struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace, + const struct ac_spm_trace *spm_trace, FILE *output) { struct sqtt_file_chunk_asic_info asic_info = {0}; struct sqtt_file_chunk_cpu_info cpu_info = {0}; struct sqtt_file_chunk_api_info api_info = {0}; struct sqtt_file_header header = {0}; size_t file_offset = 0; - const struct rgp_code_object *rgp_code_object = thread_trace->rgp_code_object; - const struct rgp_loader_events *rgp_loader_events = thread_trace->rgp_loader_events; - const struct rgp_pso_correlation *rgp_pso_correlation = thread_trace->rgp_pso_correlation; - const struct rgp_queue_info *rgp_queue_info = thread_trace->rgp_queue_info; - const struct rgp_queue_event *rgp_queue_event = thread_trace->rgp_queue_event; - const struct rgp_clock_calibration *rgp_clock_calibration = thread_trace->rgp_clock_calibration; + const struct rgp_code_object *rgp_code_object = sqtt_trace->rgp_code_object; + const struct rgp_loader_events *rgp_loader_events = sqtt_trace->rgp_loader_events; + const struct rgp_pso_correlation *rgp_pso_correlation = sqtt_trace->rgp_pso_correlation; + const struct rgp_queue_info *rgp_queue_info = sqtt_trace->rgp_queue_info; + const struct rgp_queue_event *rgp_queue_event = sqtt_trace->rgp_queue_event; + const struct rgp_clock_calibration *rgp_clock_calibration = sqtt_trace->rgp_clock_calibration; /* SQTT header file. */ ac_sqtt_fill_header(&header); @@ -1145,10 +1144,10 @@ static void ac_sqtt_dump_data(struct radeon_info *rad_info, } } - if (thread_trace) { - for (unsigned i = 0; i < thread_trace->num_traces; i++) { - const struct ac_thread_trace_se *se = &thread_trace->traces[i]; - const struct ac_thread_trace_info *info = &se->info; + if (sqtt_trace) { + for (unsigned i = 0; i < sqtt_trace->num_traces; i++) { + const struct ac_sqtt_data_se *se = &sqtt_trace->traces[i]; + const struct ac_sqtt_data_info *info = &se->info; struct sqtt_file_chunk_sqtt_desc desc = {0}; struct sqtt_file_chunk_sqtt_data data = {0}; uint64_t size = info->cur_offset * 32; /* unit of 32 bytes */ @@ -1175,9 +1174,9 @@ static void ac_sqtt_dump_data(struct radeon_info *rad_info, } #endif -int ac_dump_rgp_capture(struct radeon_info *info, - struct ac_thread_trace *thread_trace, - const struct ac_spm_trace *spm_trace) +int +ac_dump_rgp_capture(struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, + const struct ac_spm_trace *spm_trace) { #if !defined(USE_LIBELF) return -1; @@ -1198,7 +1197,7 @@ int ac_dump_rgp_capture(struct radeon_info *info, if (!f) return -1; - ac_sqtt_dump_data(info, thread_trace, spm_trace, f); + ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f); fprintf(stderr, "RGP capture saved to '%s'\n", filename); diff --git a/src/amd/common/ac_rgp.h b/src/amd/common/ac_rgp.h index c33129d..b8eda7a 100644 --- a/src/amd/common/ac_rgp.h +++ b/src/amd/common/ac_rgp.h @@ -32,8 +32,8 @@ #include "util/simple_mtx.h" struct radeon_info; -struct ac_thread_trace; -struct ac_thread_trace_data; +struct ac_sqtt_trace; +struct ac_sqtt; struct ac_spm_trace; enum rgp_hardware_stages { @@ -188,10 +188,8 @@ struct rgp_clock_calibration { simple_mtx_t lock; }; -int -ac_dump_rgp_capture(struct radeon_info *info, - struct ac_thread_trace *thread_trace, - const struct ac_spm_trace *spm_trace); +int ac_dump_rgp_capture(struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, + const struct ac_spm_trace *spm_trace); void ac_rgp_file_write_elf_object(FILE *output, size_t file_elf_start, diff --git a/src/amd/common/ac_sqtt.c b/src/amd/common/ac_sqtt.c index dec9f06..3684838 100644 --- a/src/amd/common/ac_sqtt.c +++ b/src/amd/common/ac_sqtt.c @@ -30,40 +30,38 @@ #include "util/os_time.h" uint64_t -ac_thread_trace_get_info_offset(unsigned se) +ac_sqtt_get_info_offset(unsigned se) { - return sizeof(struct ac_thread_trace_info) * se; + return sizeof(struct ac_sqtt_data_info) * se; } uint64_t -ac_thread_trace_get_data_offset(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, unsigned se) +ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se) { unsigned max_se = rad_info->max_se; uint64_t data_offset; - data_offset = align64(sizeof(struct ac_thread_trace_info) * max_se, - 1 << SQTT_BUFFER_ALIGN_SHIFT); + data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT); data_offset += data->buffer_size * se; return data_offset; } uint64_t -ac_thread_trace_get_info_va(uint64_t va, unsigned se) +ac_sqtt_get_info_va(uint64_t va, unsigned se) { - return va + ac_thread_trace_get_info_offset(se); + return va + ac_sqtt_get_info_offset(se); } uint64_t -ac_thread_trace_get_data_va(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, uint64_t va, unsigned se) +ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data, uint64_t va, + unsigned se) { - return va + ac_thread_trace_get_data_offset(rad_info, data, se); + return va + ac_sqtt_get_data_offset(rad_info, data, se); } void -ac_thread_trace_init(struct ac_thread_trace_data *data) +ac_sqtt_init(struct ac_sqtt *data) { list_inithead(&data->rgp_pso_correlation.record); simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain); @@ -85,7 +83,7 @@ ac_thread_trace_init(struct ac_thread_trace_data *data) } void -ac_thread_trace_finish(struct ac_thread_trace_data *data) +ac_sqtt_finish(struct ac_sqtt *data) { assert(data->rgp_pso_correlation.record_count == 0); simple_mtx_destroy(&data->rgp_pso_correlation.lock); @@ -107,9 +105,8 @@ ac_thread_trace_finish(struct ac_thread_trace_data *data) } bool -ac_is_thread_trace_complete(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, - const struct ac_thread_trace_info *info) +ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data, + const struct ac_sqtt_data_info *info) { if (rad_info->gfx_level >= GFX10) { /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of @@ -131,8 +128,7 @@ ac_is_thread_trace_complete(const struct radeon_info *rad_info, } uint32_t -ac_get_expected_buffer_size(struct radeon_info *rad_info, - const struct ac_thread_trace_info *info) +ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info) { if (rad_info->gfx_level >= GFX10) { uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se; @@ -143,10 +139,9 @@ ac_get_expected_buffer_size(struct radeon_info *rad_info, } bool -ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data, - uint64_t pipeline_hash) +ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash) { - struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation; + struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation; struct rgp_pso_correlation_record *record; record = malloc(sizeof(struct rgp_pso_correlation_record)); @@ -167,11 +162,10 @@ ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data, } bool -ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_data, - uint64_t pipeline_hash, +ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t base_address) { - struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events; + struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events; struct rgp_loader_events_record *record; record = malloc(sizeof(struct rgp_loader_events_record)); @@ -194,10 +188,9 @@ ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_d } bool -ac_sqtt_add_clock_calibration(struct ac_thread_trace_data *thread_trace_data, - uint64_t cpu_timestamp, uint64_t gpu_timestamp) +ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp) { - struct rgp_clock_calibration *clock_calibration = &thread_trace_data->rgp_clock_calibration; + struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration; struct rgp_clock_calibration_record *record; record = malloc(sizeof(struct rgp_clock_calibration_record)); @@ -241,8 +234,7 @@ ac_check_profile_state(const struct radeon_info *info) } union rgp_sqtt_marker_cb_id -ac_sqtt_get_next_cmdbuf_id(struct ac_thread_trace_data *data, - enum amd_ip_type ip_type) +ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type) { union rgp_sqtt_marker_cb_id cb_id = {0}; @@ -264,48 +256,46 @@ ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se) } bool -ac_sqtt_get_trace(struct ac_thread_trace_data *data, - const struct radeon_info *info, - struct ac_thread_trace *thread_trace) +ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info, + struct ac_sqtt_trace *sqtt_trace) { unsigned max_se = info->max_se; void *ptr = data->ptr; - memset(thread_trace, 0, sizeof(*thread_trace)); + memset(sqtt_trace, 0, sizeof(*sqtt_trace)); for (unsigned se = 0; se < max_se; se++) { - uint64_t info_offset = ac_thread_trace_get_info_offset(se); - uint64_t data_offset = ac_thread_trace_get_data_offset(info, data, se); + uint64_t info_offset = ac_sqtt_get_info_offset(se); + uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se); void *info_ptr = (uint8_t *)ptr + info_offset; void *data_ptr = (uint8_t *)ptr + data_offset; - struct ac_thread_trace_info *trace_info = (struct ac_thread_trace_info *)info_ptr; - struct ac_thread_trace_se thread_trace_se = {0}; + struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr; + struct ac_sqtt_data_se data_se = {0}; int first_active_cu = ffs(info->cu_mask[se][0]); if (ac_sqtt_se_is_disabled(info, se)) continue; - if (!ac_is_thread_trace_complete(info, data, trace_info)) + if (!ac_is_sqtt_complete(info, data, trace_info)) return false; - thread_trace_se.data_ptr = data_ptr; - thread_trace_se.info = *trace_info; - thread_trace_se.shader_engine = se; + data_se.data_ptr = data_ptr; + data_se.info = *trace_info; + data_se.shader_engine = se; /* RGP seems to expect units of WGP on GFX10+. */ - thread_trace_se.compute_unit = - info->gfx_level >= GFX10 ? (first_active_cu / 2) : first_active_cu; + data_se.compute_unit = info->gfx_level >= GFX10 ? (first_active_cu / 2) : first_active_cu; - thread_trace->traces[thread_trace->num_traces] = thread_trace_se; - thread_trace->num_traces++; + sqtt_trace->traces[sqtt_trace->num_traces] = data_se; + sqtt_trace->num_traces++; } - thread_trace->rgp_code_object = &data->rgp_code_object; - thread_trace->rgp_loader_events = &data->rgp_loader_events; - thread_trace->rgp_pso_correlation = &data->rgp_pso_correlation; - thread_trace->rgp_queue_info = &data->rgp_queue_info; - thread_trace->rgp_queue_event = &data->rgp_queue_event; - thread_trace->rgp_clock_calibration = &data->rgp_clock_calibration; + sqtt_trace->rgp_code_object = &data->rgp_code_object; + sqtt_trace->rgp_loader_events = &data->rgp_loader_events; + sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation; + sqtt_trace->rgp_queue_info = &data->rgp_queue_info; + sqtt_trace->rgp_queue_event = &data->rgp_queue_event; + sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration; return true; } diff --git a/src/amd/common/ac_sqtt.h b/src/amd/common/ac_sqtt.h index 272fa5f..a9613d8 100644 --- a/src/amd/common/ac_sqtt.h +++ b/src/amd/common/ac_sqtt.h @@ -36,7 +36,19 @@ struct radeon_cmdbuf; struct radeon_info; -struct ac_thread_trace_data { +/** + * SQ Thread tracing is a tracing mechanism that allows taking a detailed look + * at what the shader cores are doing. + * + * Among the things recorded are: + * - draws/dispatches + state + * - when each wave starts and stops. + * - for one SIMD per SE all instructions executed on that SIMD. + * + * The hardware stores all these as events in a buffer, no manual barrier + * around each command needed. The primary user of this is RGP. + */ +struct ac_sqtt { struct radeon_cmdbuf *start_cs[2]; struct radeon_cmdbuf *stop_cs[2]; /* struct radeon_winsys_bo or struct pb_buffer */ @@ -62,7 +74,7 @@ struct ac_thread_trace_data { #define SQTT_BUFFER_ALIGN_SHIFT 12 -struct ac_thread_trace_info { +struct ac_sqtt_data_info { uint32_t cur_offset; uint32_t trace_status; union { @@ -71,8 +83,8 @@ struct ac_thread_trace_info { }; }; -struct ac_thread_trace_se { - struct ac_thread_trace_info info; +struct ac_sqtt_data_se { + struct ac_sqtt_data_info info; void *data_ptr; uint32_t shader_engine; uint32_t compute_unit; @@ -80,7 +92,7 @@ struct ac_thread_trace_se { #define SQTT_MAX_TRACES 6 -struct ac_thread_trace { +struct ac_sqtt_trace { const struct rgp_code_object *rgp_code_object; const struct rgp_loader_events *rgp_loader_events; const struct rgp_pso_correlation *rgp_pso_correlation; @@ -89,36 +101,27 @@ struct ac_thread_trace { const struct rgp_clock_calibration *rgp_clock_calibration; uint32_t num_traces; - struct ac_thread_trace_se traces[SQTT_MAX_TRACES]; + struct ac_sqtt_data_se traces[SQTT_MAX_TRACES]; }; -uint64_t -ac_thread_trace_get_info_offset(unsigned se); +uint64_t ac_sqtt_get_info_offset(unsigned se); -uint64_t -ac_thread_trace_get_data_offset(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, unsigned se); -uint64_t -ac_thread_trace_get_info_va(uint64_t va, unsigned se); +uint64_t ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt, + unsigned se); +uint64_t ac_sqtt_get_info_va(uint64_t va, unsigned se); -uint64_t -ac_thread_trace_get_data_va(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, uint64_t va, unsigned se); +uint64_t ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt, + uint64_t va, unsigned se); -void -ac_thread_trace_init(struct ac_thread_trace_data *data); +void ac_sqtt_init(struct ac_sqtt *data); -void -ac_thread_trace_finish(struct ac_thread_trace_data *data); +void ac_sqtt_finish(struct ac_sqtt *data); -bool -ac_is_thread_trace_complete(const struct radeon_info *rad_info, - const struct ac_thread_trace_data *data, - const struct ac_thread_trace_info *info); +bool ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt, + const struct ac_sqtt_data_info *info); -uint32_t -ac_get_expected_buffer_size(struct radeon_info *rad_info, - const struct ac_thread_trace_info *info); +uint32_t ac_get_expected_buffer_size(struct radeon_info *rad_info, + const struct ac_sqtt_data_info *info); /** * Identifiers for RGP SQ thread-tracing markers (Table 1) @@ -549,27 +552,22 @@ struct rgp_sqtt_marker_pipeline_bind { static_assert(sizeof(struct rgp_sqtt_marker_pipeline_bind) == 12, "rgp_sqtt_marker_pipeline_bind doesn't match RGP spec"); +bool ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash); -bool ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data, - uint64_t pipeline_hash); - -bool ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_data, - uint64_t pipeline_hash, +bool ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t base_address); -bool ac_sqtt_add_clock_calibration(struct ac_thread_trace_data *thread_trace_data, - uint64_t cpu_timestamp, +bool ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp); bool ac_check_profile_state(const struct radeon_info *info); -union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_thread_trace_data *data, +union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *sqtt, enum amd_ip_type ip_type); bool ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se); -bool ac_sqtt_get_trace(struct ac_thread_trace_data *data, - const struct radeon_info *info, - struct ac_thread_trace *thread_trace); +bool ac_sqtt_get_trace(struct ac_sqtt *sqtt, const struct radeon_info *info, + struct ac_sqtt_trace *sqtt_trace); #endif diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c index 01eae49..2895c8b 100644 --- a/src/amd/vulkan/layers/radv_sqtt_layer.c +++ b/src/amd/vulkan/layers/radv_sqtt_layer.c @@ -219,7 +219,7 @@ radv_write_begin_general_api_marker(struct radv_cmd_buffer *cmd_buffer, marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API; marker.api_type = api_type; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } static void @@ -232,7 +232,7 @@ radv_write_end_general_api_marker(struct radv_cmd_buffer *cmd_buffer, marker.api_type = api_type; marker.is_end = 1; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } static void @@ -259,7 +259,7 @@ radv_write_event_marker(struct radv_cmd_buffer *cmd_buffer, marker.instance_offset_reg_idx = instance_offset_user_data; marker.draw_index_reg_idx = draw_index_user_data; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } static void @@ -279,7 +279,7 @@ radv_write_event_with_dims_marker(struct radv_cmd_buffer *cmd_buffer, marker.thread_y = y; marker.thread_z = z; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } static void @@ -292,7 +292,7 @@ radv_write_user_event_marker(struct radv_cmd_buffer *cmd_buffer, marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT; marker.data_type = type; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } else { assert(str != NULL); unsigned len = strlen(str); @@ -306,8 +306,7 @@ radv_write_user_event_marker(struct radv_cmd_buffer *cmd_buffer, memcpy(buffer, &marker, sizeof(marker)); memcpy(buffer + sizeof(marker), str, len); - radv_emit_thread_trace_userdata(cmd_buffer, buffer, - sizeof(marker) / 4 + marker.length / 4); + radv_emit_sqtt_userdata(cmd_buffer, buffer, sizeof(marker) / 4 + marker.length / 4); } } @@ -317,14 +316,14 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) uint64_t device_id = (uintptr_t)cmd_buffer->device; struct rgp_sqtt_marker_cb_start marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; /* Reserve a command buffer ID for SQTT. */ enum amd_ip_type ip_type = radv_queue_family_to_ring(cmd_buffer->device->physical_device, cmd_buffer->qf); union rgp_sqtt_marker_cb_id cb_id = - ac_sqtt_get_next_cmdbuf_id(&cmd_buffer->device->thread_trace, ip_type); + ac_sqtt_get_next_cmdbuf_id(&cmd_buffer->device->sqtt, ip_type); cmd_buffer->sqtt_cb_id = cb_id.all; marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_START; @@ -337,7 +336,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) if (cmd_buffer->qf == RADV_QUEUE_GENERAL) marker.queue_flags |= VK_QUEUE_GRAPHICS_BIT; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } void @@ -346,7 +345,7 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) uint64_t device_id = (uintptr_t)cmd_buffer->device; struct rgp_sqtt_marker_cb_end marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_END; @@ -354,13 +353,13 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) marker.device_id_low = device_id; marker.device_id_high = device_id >> 32; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } void radv_describe_draw(struct radv_cmd_buffer *cmd_buffer) { - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; radv_write_event_marker(cmd_buffer, cmd_buffer->state.current_event_type, UINT_MAX, UINT_MAX, @@ -370,7 +369,7 @@ radv_describe_draw(struct radv_cmd_buffer *cmd_buffer) void radv_describe_dispatch(struct radv_cmd_buffer *cmd_buffer, int x, int y, int z) { - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; radv_write_event_with_dims_marker(cmd_buffer, cmd_buffer->state.current_event_type, x, y, z); @@ -408,7 +407,7 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer) { struct rgp_sqtt_marker_barrier_end marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo) || !cmd_buffer->state.pending_sqtt_barrier_end) + if (likely(!cmd_buffer->device->sqtt.bo) || !cmd_buffer->state.pending_sqtt_barrier_end) return; cmd_buffer->state.pending_sqtt_barrier_end = false; @@ -451,7 +450,7 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer) if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1) marker.inval_gl1 = true; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); cmd_buffer->state.num_layout_transitions = 0; } @@ -461,7 +460,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer, enum rgp_barrier { struct rgp_sqtt_marker_barrier_start marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; radv_describe_barrier_end_delayed(cmd_buffer); @@ -471,7 +470,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer, enum rgp_barrier marker.cb_id = cmd_buffer->sqtt_cb_id; marker.dword02 = reason; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } void @@ -486,7 +485,7 @@ radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer, { struct rgp_sqtt_marker_layout_transition marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION; @@ -499,7 +498,7 @@ radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer, marker.fmask_color_expand = barrier->layout_transitions.fmask_color_expand; marker.init_mask_ram = barrier->layout_transitions.init_mask_ram; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); cmd_buffer->state.num_layout_transitions++; } @@ -510,7 +509,7 @@ radv_describe_pipeline_bind(struct radv_cmd_buffer *cmd_buffer, { struct rgp_sqtt_marker_pipeline_bind marker = {0}; - if (likely(!cmd_buffer->device->thread_trace.bo)) + if (likely(!cmd_buffer->device->sqtt.bo)) return; marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE; @@ -519,34 +518,34 @@ radv_describe_pipeline_bind(struct radv_cmd_buffer *cmd_buffer, marker.api_pso_hash[0] = pipeline->pipeline_hash; marker.api_pso_hash[1] = pipeline->pipeline_hash >> 32; - radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4); + radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } /* TODO: Improve the way to trigger capture (overlay, etc). */ static void -radv_handle_thread_trace(VkQueue _queue) +radv_handle_sqtt(VkQueue _queue) { RADV_FROM_HANDLE(radv_queue, queue, _queue); - static bool thread_trace_enabled = false; + static bool sqtt_enabled = false; static uint64_t num_frames = 0; bool resize_trigger = false; - if (thread_trace_enabled) { - struct ac_thread_trace thread_trace = {0}; + if (sqtt_enabled) { + struct ac_sqtt_trace sqtt_trace = {0}; - radv_end_thread_trace(queue); - thread_trace_enabled = false; + radv_end_sqtt(queue); + sqtt_enabled = false; /* TODO: Do something better than this whole sync. */ queue->device->vk.dispatch_table.QueueWaitIdle(_queue); - if (radv_get_thread_trace(queue, &thread_trace)) { + if (radv_get_sqtt_trace(queue, &sqtt_trace)) { struct ac_spm_trace spm_trace; if (queue->device->spm.bo) ac_spm_get_trace(&queue->device->spm, &spm_trace); - ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace, + ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &sqtt_trace, queue->device->spm.bo ? &spm_trace : NULL); } else { /* Trigger a new capture if the driver failed to get @@ -556,16 +555,15 @@ radv_handle_thread_trace(VkQueue _queue) } /* Clear resources used for this capture. */ - radv_reset_thread_trace(queue->device); + radv_reset_sqtt_trace(queue->device); } - if (!thread_trace_enabled) { - bool frame_trigger = num_frames == queue->device->thread_trace.start_frame; + if (!sqtt_enabled) { + bool frame_trigger = num_frames == queue->device->sqtt.start_frame; bool file_trigger = false; #ifndef _WIN32 - if (queue->device->thread_trace.trigger_file && - access(queue->device->thread_trace.trigger_file, W_OK) == 0) { - if (unlink(queue->device->thread_trace.trigger_file) == 0) { + if (queue->device->sqtt.trigger_file && access(queue->device->sqtt.trigger_file, W_OK) == 0) { + if (unlink(queue->device->sqtt.trigger_file) == 0) { file_trigger = true; } else { /* Do not enable tracing if we cannot remove the file, @@ -585,13 +583,13 @@ radv_handle_thread_trace(VkQueue _queue) } /* Sample CPU/GPU clocks before starting the trace. */ - if (!radv_thread_trace_sample_clocks(queue->device)) { + if (!radv_sqtt_sample_clocks(queue->device)) { fprintf(stderr, "radv: Failed to sample clocks\n"); } - radv_begin_thread_trace(queue); - assert(!thread_trace_enabled); - thread_trace_enabled = true; + radv_begin_sqtt(queue); + assert(!sqtt_enabled); + sqtt_enabled = true; } } num_frames++; @@ -607,7 +605,7 @@ sqtt_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo) if (result != VK_SUCCESS) return result; - radv_handle_thread_trace(_queue); + radv_handle_sqtt(_queue); return VK_SUCCESS; } @@ -1172,8 +1170,8 @@ radv_mesa_to_rgp_shader_stage(struct radv_pipeline *pipeline, gl_shader_stage st static VkResult radv_add_code_object(struct radv_device *device, struct radv_pipeline *pipeline) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; - struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_code_object *code_object = &sqtt->rgp_code_object; struct rgp_code_object_record *record; record = malloc(sizeof(struct rgp_code_object_record)); @@ -1225,7 +1223,7 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin bool result; uint64_t base_va = ~0; - result = ac_sqtt_add_pso_correlation(&device->thread_trace, pipeline->pipeline_hash); + result = ac_sqtt_add_pso_correlation(&device->sqtt, pipeline->pipeline_hash); if (!result) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1241,8 +1239,7 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin base_va = MIN2(base_va, va); } - result = - ac_sqtt_add_code_object_loader_event(&device->thread_trace, pipeline->pipeline_hash, base_va); + result = ac_sqtt_add_code_object_loader_event(&device->sqtt, pipeline->pipeline_hash, base_va); if (!result) return VK_ERROR_OUT_OF_HOST_MEMORY; @@ -1256,10 +1253,10 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin static void radv_unregister_pipeline(struct radv_device *device, struct radv_pipeline *pipeline) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; - struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation; - struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events; - struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation; + struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events; + struct rgp_code_object *code_object = &sqtt->rgp_code_object; /* Destroy the PSO correlation record. */ simple_mtx_lock(&pso_correlation->lock); diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index faad8c5..0b6e98c 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -737,7 +737,7 @@ static void radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags) { const struct radv_device *device = cmd_buffer->device; - if (unlikely(device->thread_trace.bo)) { + if (unlikely(device->sqtt.bo)) { radeon_check_space(device->ws, cmd_buffer->cs, 2); radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 3488155..5f3618e 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -106,8 +106,7 @@ radv_get_int_debug_option(const char *name, int default_value) static bool radv_spm_trace_enabled() { - return radv_thread_trace_enabled() && - debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false); + return radv_sqtt_enabled() && debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false); } VKAPI_ATTR VkResult VKAPI_CALL @@ -587,7 +586,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE); } - if (radv_thread_trace_enabled()) + if (radv_sqtt_enabled()) add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE); if (radv_rra_trace_enabled() && radv_enable_rt(physical_device, false)) @@ -933,7 +932,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr radv_dump_enabled_options(device, stderr); } - if (radv_thread_trace_enabled()) { + if (radv_sqtt_enabled()) { if (device->physical_device->rad_info.gfx_level < GFX8 || device->physical_device->rad_info.gfx_level > GFX11) { fprintf(stderr, "GPU hardware not supported: refer to " @@ -942,14 +941,15 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr abort(); } - if (!radv_thread_trace_init(device)) { + if (!radv_sqtt_init(device)) { result = VK_ERROR_INITIALIZATION_FAILED; goto fail; } - fprintf(stderr, "radv: Thread trace support is enabled (initial buffer size: %u MiB, " - "instruction timing: %s, cache counters: %s).\n", - device->thread_trace.buffer_size / (1024 * 1024), + fprintf(stderr, + "radv: Thread trace support is enabled (initial buffer size: %u MiB, " + "instruction timing: %s, cache counters: %s).\n", + device->sqtt.buffer_size / (1024 * 1024), radv_is_instruction_timing_enabled() ? "enabled" : "disabled", radv_spm_trace_enabled() ? "enabled" : "disabled"); @@ -1093,7 +1093,7 @@ fail_cache: fail_meta: radv_device_finish_meta(device); fail: - radv_thread_trace_finish(device); + radv_sqtt_finish(device); radv_spm_finish(device); @@ -1195,7 +1195,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) radv_destroy_shader_arenas(device); - radv_thread_trace_finish(device); + radv_sqtt_finish(device); radv_rra_trace_finish(_device, &device->rra_trace); diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c index 6e30558..4e14109 100644 --- a/src/amd/vulkan/radv_physical_device.c +++ b/src/amd/vulkan/radv_physical_device.c @@ -54,7 +54,7 @@ typedef void *drmDevicePtr; #endif bool -radv_thread_trace_enabled(void) +radv_sqtt_enabled(void) { return radv_get_int_debug_option("RADV_THREAD_TRACE", -1) >= 0 || getenv("RADV_THREAD_TRACE_TRIGGER"); @@ -65,7 +65,7 @@ radv_perf_query_supported(const struct radv_physical_device *pdev) { /* SQTT / SPM interfere with the register states for perf counters, and * the code has only been tested on GFX10.3 */ - return pdev->rad_info.gfx_level == GFX10_3 && !radv_thread_trace_enabled(); + return pdev->rad_info.gfx_level == GFX10_3 && !radv_sqtt_enabled(); } static bool @@ -489,7 +489,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device .EXT_conditional_rendering = true, .EXT_conservative_rasterization = device->rad_info.gfx_level >= GFX9, .EXT_custom_border_color = true, - .EXT_debug_marker = radv_thread_trace_enabled(), + .EXT_debug_marker = radv_sqtt_enabled(), .EXT_depth_clip_control = true, .EXT_depth_clip_enable = true, .EXT_depth_range_unrestricted = true, @@ -2184,7 +2184,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm device->ws = radv_null_winsys_create(); #else if (drm_device) { - bool reserve_vmid = radv_thread_trace_enabled(); + bool reserve_vmid = radv_sqtt_enabled(); device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags, instance->perftest_flags, reserve_vmid); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index df0f21e..69f022d 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -387,7 +387,7 @@ VkResult create_drm_physical_device(struct vk_instance *vk_instance, struct _drm void radv_physical_device_destroy(struct vk_physical_device *vk_device); -bool radv_thread_trace_enabled(void); +bool radv_sqtt_enabled(void); struct radv_instance { struct vk_instance vk; @@ -1021,7 +1021,7 @@ struct radv_device { struct radv_device_border_color_data border_color_data; /* Thread trace. */ - struct ac_thread_trace_data thread_trace; + struct ac_sqtt sqtt; /* Memory trace. */ struct radv_memory_trace_data memory_trace; @@ -3071,16 +3071,16 @@ void radv_nir_shader_info_link(struct radv_device *device, const struct radv_pipeline_key *pipeline_key, struct radv_pipeline_stage *stages); -bool radv_thread_trace_init(struct radv_device *device); -void radv_thread_trace_finish(struct radv_device *device); -bool radv_begin_thread_trace(struct radv_queue *queue); -bool radv_end_thread_trace(struct radv_queue *queue); -bool radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace); -void radv_reset_thread_trace(struct radv_device *device); -void radv_emit_thread_trace_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data, - uint32_t num_dwords); +bool radv_sqtt_init(struct radv_device *device); +void radv_sqtt_finish(struct radv_device *device); +bool radv_begin_sqtt(struct radv_queue *queue); +bool radv_end_sqtt(struct radv_queue *queue); +bool radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace); +void radv_reset_sqtt_trace(struct radv_device *device); +void radv_emit_sqtt_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data, + uint32_t num_dwords); bool radv_is_instruction_timing_enabled(void); -bool radv_thread_trace_sample_clocks(struct radv_device *device); +bool radv_sqtt_sample_clocks(struct radv_device *device); void radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit); diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c index 208e8a2..6fb0818 100644 --- a/src/amd/vulkan/radv_sqtt.c +++ b/src/amd/vulkan/radv_sqtt.c @@ -36,7 +36,7 @@ radv_is_instruction_timing_enabled(void) } static uint32_t -gfx11_get_thread_trace_ctrl(struct radv_device *device, bool enable) +gfx11_get_sqtt_ctrl(struct radv_device *device, bool enable) { return S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) | S_0367B0_UTIL_TIMER(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */ @@ -45,21 +45,21 @@ gfx11_get_thread_trace_ctrl(struct radv_device *device, bool enable) } static uint32_t -gfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable) +gfx10_get_sqtt_ctrl(struct radv_device *device, bool enable) { - uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | - S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */ - S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | - S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) | - S_008D1C_REG_DROP_ON_STALL(0); + uint32_t sqtt_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) | + S_008D1C_RT_FREQ(2) | /* 4096 clk */ + S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) | + S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) | + S_008D1C_REG_DROP_ON_STALL(0); if (device->physical_device->rad_info.gfx_level == GFX10_3) - thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4); + sqtt_ctrl |= S_008D1C_LOWATER_OFFSET(4); if (device->physical_device->rad_info.has_sqtt_auto_flush_mode_bug) - thread_trace_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1); + sqtt_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1); - return thread_trace_ctrl; + return sqtt_ctrl; } static void @@ -78,16 +78,16 @@ radv_emit_wait_for_idle(struct radv_device *device, struct radeon_cmdbuf *cs, in } static void -radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *cs, - enum radv_queue_family qf) +radv_emit_sqtt_start(struct radv_device *device, struct radeon_cmdbuf *cs, + enum radv_queue_family qf) { - uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; + uint32_t shifted_size = device->sqtt.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; struct radeon_info *rad_info = &device->physical_device->rad_info; unsigned max_se = rad_info->max_se; for (unsigned se = 0; se < max_se; se++) { - uint64_t va = radv_buffer_get_va(device->thread_trace.bo); - uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se); + uint64_t va = radv_buffer_get_va(device->sqtt.bo); + uint64_t data_va = ac_sqtt_get_data_va(rad_info, &device->sqtt, va, se); uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]); @@ -111,7 +111,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(first_active_cu / 2) | S_0367B4_SIMD_SEL(0)); - uint32_t thread_trace_token_mask = S_0367B8_REG_INCLUDE( + uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE( V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC | V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP | V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG); @@ -124,13 +124,13 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE | V_0367B8_TOKEN_EXCLUDE_INST; } - thread_trace_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude); + sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude); - radeon_set_uconfig_reg(cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, thread_trace_token_mask); + radeon_set_uconfig_reg(cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask); /* Should be emitted last (it enables thread traces). */ radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL, - gfx11_get_thread_trace_ctrl(device, true)); + gfx11_get_sqtt_ctrl(device, true)); } else if (device->physical_device->rad_info.gfx_level >= GFX10) { /* Order seems important for the following 2 registers. */ radeon_set_privileged_config_reg( @@ -144,7 +144,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */ S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(first_active_cu / 2) | S_008D14_SIMD_SEL(0)); - uint32_t thread_trace_token_mask = S_008D18_REG_INCLUDE( + uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE( V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG); @@ -159,14 +159,13 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c V_008D18_TOKEN_EXCLUDE_IMMEDIATE | V_008D18_TOKEN_EXCLUDE_INST; } - thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude); + sqtt_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude); - radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, - thread_trace_token_mask); + radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask); /* Should be emitted last (it enables thread traces). */ radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, - gfx10_get_thread_trace_ctrl(device, true)); + gfx10_get_sqtt_ctrl(device, true)); } else { /* Order seems important for the following 4 registers. */ radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2, @@ -178,16 +177,16 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1)); - uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) | - S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) | - S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) | - S_030CC8_SQ_STALL_EN(1); + uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) | + S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) | + S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) | + S_030CC8_SQ_STALL_EN(1); if (device->physical_device->rad_info.gfx_level < GFX9) { - thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff); + sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff); } - radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask); + radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask); /* Trace all tokens and registers. */ radeon_set_uconfig_reg( @@ -208,7 +207,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c } /* Enable the thread trace mode. */ - uint32_t thread_trace_mode = + uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) | S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) | S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */ @@ -216,10 +215,10 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c if (device->physical_device->rad_info.gfx_level == GFX9) { /* Count SQTT traffic in TCC perf counters. */ - thread_trace_mode |= S_030CD8_TC_PERF_EN(1); + sqtt_mode |= S_030CD8_TC_PERF_EN(1); } - radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode); + radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode); } } @@ -237,57 +236,56 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c } } -static const uint32_t gfx8_thread_trace_info_regs[] = { +static const uint32_t gfx8_sqtt_info_regs[] = { R_030CE4_SQ_THREAD_TRACE_WPTR, R_030CE8_SQ_THREAD_TRACE_STATUS, R_008E40_SQ_THREAD_TRACE_CNTR, }; -static const uint32_t gfx9_thread_trace_info_regs[] = { +static const uint32_t gfx9_sqtt_info_regs[] = { R_030CE4_SQ_THREAD_TRACE_WPTR, R_030CE8_SQ_THREAD_TRACE_STATUS, R_030CF0_SQ_THREAD_TRACE_CNTR, }; -static const uint32_t gfx10_thread_trace_info_regs[] = { +static const uint32_t gfx10_sqtt_info_regs[] = { R_008D10_SQ_THREAD_TRACE_WPTR, R_008D20_SQ_THREAD_TRACE_STATUS, R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR, }; -static const uint32_t gfx11_thread_trace_info_regs[] = { +static const uint32_t gfx11_sqtt_info_regs[] = { R_0367BC_SQ_THREAD_TRACE_WPTR, R_0367D0_SQ_THREAD_TRACE_STATUS, R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR, }; static void -radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs, - unsigned se_index) +radv_copy_sqtt_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs, unsigned se_index) { const struct radv_physical_device *pdevice = device->physical_device; - const uint32_t *thread_trace_info_regs = NULL; + const uint32_t *sqtt_info_regs = NULL; if (device->physical_device->rad_info.gfx_level >= GFX11) { - thread_trace_info_regs = gfx11_thread_trace_info_regs; + sqtt_info_regs = gfx11_sqtt_info_regs; } else if (device->physical_device->rad_info.gfx_level >= GFX10) { - thread_trace_info_regs = gfx10_thread_trace_info_regs; + sqtt_info_regs = gfx10_sqtt_info_regs; } else if (device->physical_device->rad_info.gfx_level == GFX9) { - thread_trace_info_regs = gfx9_thread_trace_info_regs; + sqtt_info_regs = gfx9_sqtt_info_regs; } else { assert(device->physical_device->rad_info.gfx_level == GFX8); - thread_trace_info_regs = gfx8_thread_trace_info_regs; + sqtt_info_regs = gfx8_sqtt_info_regs; } /* Get the VA where the info struct is stored for this SE. */ - uint64_t va = radv_buffer_get_va(device->thread_trace.bo); - uint64_t info_va = ac_thread_trace_get_info_va(va, se_index); + uint64_t va = radv_buffer_get_va(device->sqtt.bo); + uint64_t info_va = ac_sqtt_get_info_va(va, se_index); /* Copy back the info struct one DWORD at a time. */ for (unsigned i = 0; i < 3; i++) { radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM); - radeon_emit(cs, thread_trace_info_regs[i] >> 2); + radeon_emit(cs, sqtt_info_regs[i] >> 2); radeon_emit(cs, 0); /* unused */ radeon_emit(cs, (info_va + i * 4)); radeon_emit(cs, (info_va + i * 4) >> 32); @@ -302,8 +300,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits */ - uint64_t data_va = - ac_thread_trace_get_data_va(&pdevice->rad_info, &device->thread_trace, va, se_index); + uint64_t data_va = ac_sqtt_get_data_va(&pdevice->rad_info, &device->sqtt, va, se_index); uint64_t shifted_data_va = (data_va >> 5); uint32_t init_wptr_value = shifted_data_va & 0x1fffffff; @@ -320,8 +317,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu } static void -radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs, - enum radv_queue_family qf) +radv_emit_sqtt_stop(struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf) { unsigned max_se = device->physical_device->rad_info.max_se; @@ -364,7 +360,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs /* Disable the thread trace mode. */ radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL, - gfx11_get_thread_trace_ctrl(device, false)); + gfx11_get_sqtt_ctrl(device, false)); /* Wait for thread trace completion. */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); @@ -391,7 +387,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs /* Disable the thread trace mode. */ radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL, - gfx10_get_thread_trace_ctrl(device, false)); + gfx10_get_sqtt_ctrl(device, false)); /* Wait for thread trace completion. */ radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); @@ -417,7 +413,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs radeon_emit(cs, 4); /* poll interval */ } - radv_copy_thread_trace_info_regs(device, cs, se); + radv_copy_sqtt_info_regs(device, cs, se); } /* Restore global broadcasting. */ @@ -427,8 +423,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs } void -radv_emit_thread_trace_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data, - uint32_t num_dwords) +radv_emit_sqtt_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords) { struct radv_device *device = cmd_buffer->device; struct radeon_cmdbuf *cs = cmd_buffer->cs; @@ -492,7 +487,7 @@ radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf * } static bool -radv_thread_trace_init_bo(struct radv_device *device) +radv_sqtt_init_bo(struct radv_device *device) { unsigned max_se = device->physical_device->rad_info.max_se; struct radeon_winsys *ws = device->ws; @@ -502,49 +497,48 @@ radv_thread_trace_init_bo(struct radv_device *device) /* The buffer size and address need to be aligned in HW regs. Align the * size as early as possible so that we do all the allocation & addressing * correctly. */ - device->thread_trace.buffer_size = - align64(device->thread_trace.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT); + device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT); /* Compute total size of the thread trace BO for all SEs. */ - size = align64(sizeof(struct ac_thread_trace_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT); - size += device->thread_trace.buffer_size * (uint64_t)max_se; + size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT); + size += device->sqtt.buffer_size * (uint64_t)max_se; struct radeon_winsys_bo *bo = NULL; result = ws->buffer_create( ws, size, 4096, RADEON_DOMAIN_VRAM, RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM, RADV_BO_PRIORITY_SCRATCH, 0, &bo); - device->thread_trace.bo = bo; + device->sqtt.bo = bo; if (result != VK_SUCCESS) return false; - result = ws->buffer_make_resident(ws, device->thread_trace.bo, true); + result = ws->buffer_make_resident(ws, device->sqtt.bo, true); if (result != VK_SUCCESS) return false; - device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo); - if (!device->thread_trace.ptr) + device->sqtt.ptr = ws->buffer_map(device->sqtt.bo); + if (!device->sqtt.ptr) return false; return true; } static void -radv_thread_trace_finish_bo(struct radv_device *device) +radv_sqtt_finish_bo(struct radv_device *device) { struct radeon_winsys *ws = device->ws; - if (unlikely(device->thread_trace.bo)) { - ws->buffer_make_resident(ws, device->thread_trace.bo, false); - ws->buffer_destroy(ws, device->thread_trace.bo); + if (unlikely(device->sqtt.bo)) { + ws->buffer_make_resident(ws, device->sqtt.bo, false); + ws->buffer_destroy(ws, device->sqtt.bo); } } static VkResult radv_register_queue(struct radv_device *device, struct radv_queue *queue) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; - struct rgp_queue_info *queue_info = &thread_trace_data->rgp_queue_info; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info; struct rgp_queue_info_record *record; record = malloc(sizeof(struct rgp_queue_info_record)); @@ -572,8 +566,8 @@ radv_register_queue(struct radv_device *device, struct radv_queue *queue) static void radv_unregister_queue(struct radv_device *device, struct radv_queue *queue) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; - struct rgp_queue_info *queue_info = &thread_trace_data->rgp_queue_info; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info; /* Destroy queue info record. */ simple_mtx_lock(&queue_info->lock); @@ -592,7 +586,7 @@ radv_unregister_queue(struct radv_device *device, struct radv_queue *queue) } static void -radv_register_queues(struct radv_device *device, struct ac_thread_trace_data *thread_trace_data) +radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt) { radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]); for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++) @@ -600,7 +594,7 @@ radv_register_queues(struct radv_device *device, struct ac_thread_trace_data *th } static void -radv_unregister_queues(struct radv_device *device, struct ac_thread_trace_data *thread_trace_data) +radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt) { radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]); for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++) @@ -608,74 +602,74 @@ radv_unregister_queues(struct radv_device *device, struct ac_thread_trace_data * } bool -radv_thread_trace_init(struct radv_device *device) +radv_sqtt_init(struct radv_device *device) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; + struct ac_sqtt *sqtt = &device->sqtt; /* Default buffer size set to 32MB per SE. */ - device->thread_trace.buffer_size = + device->sqtt.buffer_size = radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024); - device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1); + device->sqtt.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1); const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER"); if (trigger_file) - device->thread_trace.trigger_file = strdup(trigger_file); + device->sqtt.trigger_file = strdup(trigger_file); - if (!radv_thread_trace_init_bo(device)) + if (!radv_sqtt_init_bo(device)) return false; if (!radv_device_acquire_performance_counters(device)) return false; - ac_thread_trace_init(thread_trace_data); + ac_sqtt_init(sqtt); - radv_register_queues(device, thread_trace_data); + radv_register_queues(device, sqtt); return true; } void -radv_thread_trace_finish(struct radv_device *device) +radv_sqtt_finish(struct radv_device *device) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; + struct ac_sqtt *sqtt = &device->sqtt; struct radeon_winsys *ws = device->ws; - free(device->thread_trace.trigger_file); + free(device->sqtt.trigger_file); - radv_thread_trace_finish_bo(device); + radv_sqtt_finish_bo(device); for (unsigned i = 0; i < 2; i++) { - if (device->thread_trace.start_cs[i]) - ws->cs_destroy(device->thread_trace.start_cs[i]); - if (device->thread_trace.stop_cs[i]) - ws->cs_destroy(device->thread_trace.stop_cs[i]); + if (device->sqtt.start_cs[i]) + ws->cs_destroy(device->sqtt.start_cs[i]); + if (device->sqtt.stop_cs[i]) + ws->cs_destroy(device->sqtt.stop_cs[i]); } - radv_unregister_queues(device, thread_trace_data); + radv_unregister_queues(device, sqtt); - ac_thread_trace_finish(thread_trace_data); + ac_sqtt_finish(sqtt); } static bool -radv_thread_trace_resize_bo(struct radv_device *device) +radv_sqtt_resize_bo(struct radv_device *device) { /* Destroy the previous thread trace BO. */ - radv_thread_trace_finish_bo(device); + radv_sqtt_finish_bo(device); /* Double the size of the thread trace buffer per SE. */ - device->thread_trace.buffer_size *= 2; + device->sqtt.buffer_size *= 2; fprintf(stderr, "Failed to get the thread trace because the buffer " "was too small, resizing to %d KB\n", - device->thread_trace.buffer_size / 1024); + device->sqtt.buffer_size / 1024); /* Re-create the thread trace BO. */ - return radv_thread_trace_init_bo(device); + return radv_sqtt_init_bo(device); } bool -radv_begin_thread_trace(struct radv_queue *queue) +radv_begin_sqtt(struct radv_queue *queue) { struct radv_device *device = queue->device; enum radv_queue_family family = queue->state.qf; @@ -684,9 +678,9 @@ radv_begin_thread_trace(struct radv_queue *queue) VkResult result; /* Destroy the previous start CS and create a new one. */ - if (device->thread_trace.start_cs[family]) { - ws->cs_destroy(device->thread_trace.start_cs[family]); - device->thread_trace.start_cs[family] = NULL; + if (device->sqtt.start_cs[family]) { + ws->cs_destroy(device->sqtt.start_cs[family]); + device->sqtt.start_cs[family] = NULL; } cs = ws->cs_create(ws, radv_queue_ring(queue), false); @@ -727,7 +721,7 @@ radv_begin_thread_trace(struct radv_queue *queue) } /* Start SQTT. */ - radv_emit_thread_trace_start(device, cs, family); + radv_emit_sqtt_start(device, cs, family); if (device->spm.bo) radv_perfcounter_emit_spm_start(device, cs, family); @@ -738,13 +732,13 @@ radv_begin_thread_trace(struct radv_queue *queue) return false; } - device->thread_trace.start_cs[family] = cs; + device->sqtt.start_cs[family] = cs; return radv_queue_internal_submit(queue, cs); } bool -radv_end_thread_trace(struct radv_queue *queue) +radv_end_sqtt(struct radv_queue *queue) { struct radv_device *device = queue->device; enum radv_queue_family family = queue->state.qf; @@ -753,9 +747,9 @@ radv_end_thread_trace(struct radv_queue *queue) VkResult result; /* Destroy the previous stop CS and create a new one. */ - if (queue->device->thread_trace.stop_cs[family]) { - ws->cs_destroy(device->thread_trace.stop_cs[family]); - device->thread_trace.stop_cs[family] = NULL; + if (queue->device->sqtt.stop_cs[family]) { + ws->cs_destroy(device->sqtt.stop_cs[family]); + device->sqtt.stop_cs[family] = NULL; } cs = ws->cs_create(ws, radv_queue_ring(queue), false); @@ -784,7 +778,7 @@ radv_end_thread_trace(struct radv_queue *queue) radv_perfcounter_emit_spm_stop(device, cs, family); /* Stop SQTT. */ - radv_emit_thread_trace_stop(device, cs, family); + radv_emit_sqtt_stop(device, cs, family); radv_perfcounter_emit_spm_reset(cs); @@ -800,19 +794,19 @@ radv_end_thread_trace(struct radv_queue *queue) return false; } - device->thread_trace.stop_cs[family] = cs; + device->sqtt.stop_cs[family] = cs; return radv_queue_internal_submit(queue, cs); } bool -radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace) +radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace) { struct radv_device *device = queue->device; struct radeon_info *rad_info = &device->physical_device->rad_info; - if (!ac_sqtt_get_trace(&device->thread_trace, rad_info, thread_trace)) { - if (!radv_thread_trace_resize_bo(device)) + if (!ac_sqtt_get_trace(&device->sqtt, rad_info, sqtt_trace)) { + if (!radv_sqtt_resize_bo(device)) fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n"); return false; } @@ -821,10 +815,10 @@ radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_t } void -radv_reset_thread_trace(struct radv_device *device) +radv_reset_sqtt_trace(struct radv_device *device) { - struct ac_thread_trace_data *thread_trace_data = &device->thread_trace; - struct rgp_clock_calibration *clock_calibration = &thread_trace_data->rgp_clock_calibration; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration; /* Clear clock calibration records. */ simple_mtx_lock(&clock_calibration->lock); @@ -869,7 +863,7 @@ radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timesta } bool -radv_thread_trace_sample_clocks(struct radv_device *device) +radv_sqtt_sample_clocks(struct radv_device *device) { uint64_t cpu_timestamp = 0, gpu_timestamp = 0; VkResult result; @@ -878,5 +872,5 @@ radv_thread_trace_sample_clocks(struct radv_device *device) if (result != VK_SUCCESS) return false; - return ac_sqtt_add_clock_calibration(&device->thread_trace, cpu_timestamp, gpu_timestamp); + return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp); } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 7ff5728..c91174d 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -1211,13 +1211,13 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) simple_mtx_unlock(&sscreen->async_compute_context_lock); } - if (unlikely(sctx->thread_trace_enabled)) + if (unlikely(sctx->sqtt_enabled)) sctx->sqtt_next_event = EventCmdResolveImage; if (si_msaa_resolve_blit_via_CB(ctx, info)) return; - if (unlikely(sctx->thread_trace_enabled)) + if (unlikely(sctx->sqtt_enabled)) sctx->sqtt_next_event = EventCmdCopyImage; /* Using compute for copying to a linear texture in GTT is much faster than @@ -1252,7 +1252,7 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info) info->src.box.z, info->src.box.z + info->src.box.depth - 1, false); - if (unlikely(sctx->thread_trace_enabled)) + if (unlikely(sctx->sqtt_enabled)) sctx->sqtt_next_event = EventCmdBlitImage; si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 2c333e9..548c0d4 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -1186,7 +1186,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB; } - if (unlikely(sctx->thread_trace_enabled)) { + if (unlikely(sctx->sqtt_enabled)) { if (buffers & PIPE_CLEAR_COLOR) sctx->sqtt_next_event = EventCmdClearColorImage; else if (buffers & PIPE_CLEAR_DEPTHSTENCIL) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 9ed4f90..1670a11 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -337,14 +337,13 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state) sctx->compute_shaderbuf_sgprs_dirty = true; sctx->compute_image_sgprs_dirty = true; - if (unlikely((sctx->screen->debug_flags & DBG(SQTT)) && sctx->thread_trace)) { + if (unlikely((sctx->screen->debug_flags & DBG(SQTT)) && sctx->sqtt)) { uint32_t pipeline_code_hash = _mesa_hash_data_with_seed( program->shader.binary.elf_buffer, program->shader.binary.elf_size, 0); - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { + if (!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline_code_hash)) { /* Short lived fake pipeline: we don't need to reupload the compute shaders, * as we do for the gfx ones so just create a temp pipeline to be able to * call si_sqtt_register_pipeline, and then drop it. @@ -769,7 +768,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ if (sctx->gfx_level >= GFX10 && waves_per_threadgroup == 1) threadgroups_per_cu = 2; - if (unlikely(sctx->thread_trace_enabled)) { + if (unlikely(sctx->sqtt_enabled)) { si_write_event_with_dims_marker(sctx, &sctx->gfx_cs, info->indirect ? EventCmdDispatchIndirect : EventCmdDispatch, info->grid[0], info->grid[1], info->grid[2]); @@ -839,7 +838,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_ radeon_emit(dispatch_initiator); } - if (unlikely(sctx->thread_trace_enabled && sctx->gfx_level >= GFX9)) { + if (unlikely(sctx->sqtt_enabled && sctx->gfx_level >= GFX9)) { radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index 3dea23c..61d4b01 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -489,9 +489,8 @@ static void si_flush_all_queues(struct pipe_context *ctx, tc_driver_internal_flush_notify(sctx->tc); - if (unlikely(sctx->thread_trace && - (flags & PIPE_FLUSH_END_OF_FRAME))) { - si_handle_thread_trace(sctx, &sctx->gfx_cs); + if (unlikely(sctx->sqtt && (flags & PIPE_FLUSH_END_OF_FRAME))) { + si_handle_sqtt(sctx, &sctx->gfx_cs); } } else { /* Instead of flushing, create a deferred fence. Constraints: diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 698d984..683790c 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -170,9 +170,8 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, AMD_IP_GFX); } - if (unlikely(ctx->thread_trace && - (flags & PIPE_FLUSH_END_OF_FRAME))) { - si_handle_thread_trace(ctx, &ctx->gfx_cs); + if (unlikely(ctx->sqtt && (flags & PIPE_FLUSH_END_OF_FRAME))) { + si_handle_sqtt(ctx, &ctx->gfx_cs); } if (ctx->current_saved_cs) @@ -795,7 +794,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) radeon_emit(0); /* DATA_HI */ radeon_emit(0); /* INT_CTXID */ - if (unlikely(ctx->thread_trace_enabled)) { + if (unlikely(ctx->sqtt_enabled)) { radeon_end(); si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs); radeon_begin_again(cs); @@ -815,7 +814,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) radeon_emit(S_585_PWS_ENA(1)); radeon_emit(gcr_cntl); /* GCR_CNTL */ - if (unlikely(ctx->thread_trace_enabled)) { + if (unlikely(ctx->sqtt_enabled)) { radeon_end(); si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags); radeon_begin_again(cs); @@ -859,13 +858,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number, SI_NOT_QUERY); - if (unlikely(ctx->thread_trace_enabled)) { + if (unlikely(ctx->sqtt_enabled)) { si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs); } si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); - if (unlikely(ctx->thread_trace_enabled)) { + if (unlikely(ctx->sqtt_enabled)) { si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags); } @@ -1071,13 +1070,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY); - if (unlikely(sctx->thread_trace_enabled)) { + if (unlikely(sctx->sqtt_enabled)) { si_sqtt_describe_barrier_start(sctx, &sctx->gfx_cs); } si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); - if (unlikely(sctx->thread_trace_enabled)) { + if (unlikely(sctx->sqtt_enabled)) { si_sqtt_describe_barrier_end(sctx, &sctx->gfx_cs, sctx->flags); } } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index afd13ff..e2b246e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -209,13 +209,13 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->gfx_level >= GFX10 && sctx->has_graphics) gfx10_destroy_query(sctx); - if (sctx->thread_trace) { + if (sctx->sqtt) { struct si_screen *sscreen = sctx->screen; if (sscreen->info.has_stable_pstate && sscreen->b.num_contexts == 1 && !(sctx->context_flags & SI_CONTEXT_FLAG_AUX)) sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_NONE); - si_destroy_thread_trace(sctx); + si_destroy_sqtt(sctx); } pipe_resource_reference(&sctx->esgs_ring, NULL); @@ -429,7 +429,7 @@ static void si_emit_string_marker(struct pipe_context *ctx, const char *string, dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); - if (sctx->thread_trace_enabled) + if (sctx->sqtt_enabled) si_write_user_event(sctx, &sctx->gfx_cs, UserEventTrigger, string, len); if (sctx->log) @@ -896,7 +896,7 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v "detected. Force the GPU into a profiling mode with e.g. " "\"echo profile_peak > " "/sys/class/drm/card0/device/power_dpm_force_performance_level\"\n"); - } else if (!si_init_thread_trace((struct si_context *)ctx)) { + } else if (!si_init_sqtt((struct si_context *)ctx)) { FREE(ctx); return NULL; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 29bc59f..44a4f3e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1319,11 +1319,11 @@ struct si_context { void (*emit_spi_map[33])(struct si_context *sctx); /* SQTT */ - struct ac_thread_trace_data *thread_trace; + struct ac_sqtt *sqtt; struct ac_spm spm; struct pipe_fence_handle *last_sqtt_fence; enum rgp_sqtt_marker_event_type sqtt_next_event; - bool thread_trace_enabled; + bool sqtt_enabled; unsigned context_flags; @@ -1666,7 +1666,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r uint32_t instance_offset_user_data, uint32_t draw_index_user_data); bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute); -bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data, +bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt, uint64_t pipeline_hash); void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point); void @@ -1681,9 +1681,9 @@ void si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs); void si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs, unsigned flags); -bool si_init_thread_trace(struct si_context *sctx); -void si_destroy_thread_trace(struct si_context *sctx); -void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs); +bool si_init_sqtt(struct si_context *sctx); +void si_destroy_sqtt(struct si_context *sctx); +void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs); /* * common helpers diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index d9c91c1..8fd6f9a 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -39,825 +39,784 @@ static void si_emit_spi_config_cntl(struct si_context* sctx, struct radeon_cmdbuf *cs, bool enable); -static bool -si_thread_trace_init_bo(struct si_context *sctx) -{ - unsigned max_se = sctx->screen->info.max_se; - struct radeon_winsys *ws = sctx->ws; - uint64_t size; - - /* The buffer size and address need to be aligned in HW regs. Align the - * size as early as possible so that we do all the allocation & addressing - * correctly. */ - sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size, - 1u << SQTT_BUFFER_ALIGN_SHIFT); - - /* Compute total size of the thread trace BO for all SEs. */ - size = align64(sizeof(struct ac_thread_trace_info) * max_se, - 1 << SQTT_BUFFER_ALIGN_SHIFT); - size += sctx->thread_trace->buffer_size * (uint64_t)max_se; - - sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL); - - sctx->thread_trace->bo = - ws->buffer_create(ws, size, 4096, - RADEON_DOMAIN_VRAM, +static bool si_sqtt_init_bo(struct si_context *sctx) { + unsigned max_se = sctx->screen->info.max_se; + struct radeon_winsys *ws = sctx->ws; + uint64_t size; + + /* The buffer size and address need to be aligned in HW regs. Align the + * size as early as possible so that we do all the allocation & addressing + * correctly. */ + sctx->sqtt->buffer_size = + align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT); + + /* Compute total size of the thread trace BO for all SEs. */ + size = align64(sizeof(struct ac_sqtt_data_info) * max_se, + 1 << SQTT_BUFFER_ALIGN_SHIFT); + size += sctx->sqtt->buffer_size * (uint64_t)max_se; + + sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL); + + sctx->sqtt->bo = + ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING | - RADEON_FLAG_GTT_WC | - RADEON_FLAG_NO_SUBALLOC); - if (!sctx->thread_trace->bo) - return false; + RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC); + if (!sctx->sqtt->bo) + return false; - return true; + return true; } -static void -si_emit_thread_trace_start(struct si_context* sctx, - struct radeon_cmdbuf *cs, - uint32_t queue_family_index) -{ - struct si_screen *sscreen = sctx->screen; - uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; - unsigned max_se = sscreen->info.max_se; - - radeon_begin(cs); - - for (unsigned se = 0; se < max_se; se++) { - uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo); - uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se); - uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; - - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) - continue; - - /* Target SEx and SH0. */ - radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, - S_030800_SE_INDEX(se) | - S_030800_SH_INDEX(0) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); - - /* Select the first active CUs */ - int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]); - - if (sctx->gfx_level >= GFX10) { - uint32_t token_mask = V_008D18_REG_INCLUDE_SQDEC | - V_008D18_REG_INCLUDE_SHDEC | - V_008D18_REG_INCLUDE_GFXUDEC | - V_008D18_REG_INCLUDE_CONTEXT | - V_008D18_REG_INCLUDE_COMP | - V_008D18_REG_INCLUDE_CONFIG; - int wgp = first_active_cu / 2; - unsigned shader_mask = 0x7f; /* all shader stages */ - - /* Order seems important for the following 2 registers. */ - if (sctx->gfx_level >= GFX11) { - /* Disable unsupported hw shader stages */ - shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */); - - radeon_set_uconfig_reg(R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE, - S_0367A4_SIZE(shifted_size) | - S_0367A4_BASE_HI(shifted_va >> 32)); - - radeon_set_uconfig_reg(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); - - radeon_set_uconfig_reg(R_0367B4_SQ_THREAD_TRACE_MASK, - S_0367B4_WTYPE_INCLUDE(shader_mask) | - S_0367B4_SA_SEL(0) | - S_0367B4_WGP_SEL(wgp) | - S_0367B4_SIMD_SEL(0)); - - radeon_set_uconfig_reg(R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, - S_0367B8_REG_INCLUDE(token_mask) | - S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF)); - } else { - radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE, - S_008D04_SIZE(shifted_size) | - S_008D04_BASE_HI(shifted_va >> 32)); - - radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); - - radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK, - S_008D14_WTYPE_INCLUDE(shader_mask) | - S_008D14_SA_SEL(0) | - S_008D14_WGP_SEL(wgp) | - S_008D14_SIMD_SEL(0)); - - radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, - S_008D18_REG_INCLUDE(token_mask) | - S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF)); - } - - /* Should be emitted last (it enables thread traces). */ - uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) | - S_008D1C_RT_FREQ(2) | /* 4096 clk */S_008D1C_DRAW_EVENT_EN(1); - - if (sctx->gfx_level == GFX10_3) - ctrl |= S_008D1C_LOWATER_OFFSET(4); - - ctrl |= S_008D1C_AUTO_FLUSH_MODE(sctx->screen->info.has_sqtt_auto_flush_mode_bug); - - switch (sctx->gfx_level) { - case GFX10: - case GFX10_3: - ctrl |= S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) | - S_008D1C_SQ_STALL_EN(1) |S_008D1C_REG_DROP_ON_STALL(0); - radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl); - break; - case GFX11: - ctrl |= S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) | - S_0367B0_REG_AT_HWM(2); - radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, ctrl); - break; - default: - assert(false); - } +static void si_emit_sqtt_start(struct si_context *sctx, + struct radeon_cmdbuf *cs, + uint32_t queue_family_index) { + struct si_screen *sscreen = sctx->screen; + uint32_t shifted_size = sctx->sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT; + unsigned max_se = sscreen->info.max_se; + + radeon_begin(cs); + + for (unsigned se = 0; se < max_se; se++) { + uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo); + uint64_t data_va = + ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se); + uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT; + + if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + continue; + + /* Target SEx and SH0. */ + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, + S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); + + /* Select the first active CUs */ + int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]); + + if (sctx->gfx_level >= GFX10) { + uint32_t token_mask = + V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | + V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_CONTEXT | + V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONFIG; + int wgp = first_active_cu / 2; + unsigned shader_mask = 0x7f; /* all shader stages */ + + /* Order seems important for the following 2 registers. */ + if (sctx->gfx_level >= GFX11) { + /* Disable unsupported hw shader stages */ + shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */); + + radeon_set_uconfig_reg(R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE, + S_0367A4_SIZE(shifted_size) | + S_0367A4_BASE_HI(shifted_va >> 32)); + + radeon_set_uconfig_reg(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va); + + radeon_set_uconfig_reg(R_0367B4_SQ_THREAD_TRACE_MASK, + S_0367B4_WTYPE_INCLUDE(shader_mask) | + S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(wgp) | + S_0367B4_SIMD_SEL(0)); + + radeon_set_uconfig_reg( + R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, + S_0367B8_REG_INCLUDE(token_mask) | + S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF)); } else { - /* Order seems important for the following 4 registers. */ - radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2, - S_030CDC_ADDR_HI(shifted_va >> 32)); - - radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va); - - radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE, - S_030CC4_SIZE(shifted_size)); - - radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL, - S_030CD4_RESET_BUFFER(1)); - - uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | - S_030CC8_SH_SEL(0) | - S_030CC8_SIMD_EN(0xf) | - S_030CC8_VM_ID_MASK(0) | - S_030CC8_REG_STALL_EN(1) | - S_030CC8_SPI_STALL_EN(1) | - S_030CC8_SQ_STALL_EN(1); - - radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, - thread_trace_mask); - - /* Trace all tokens and registers. */ - radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK, - S_030CCC_TOKEN_MASK(0xbfff) | - S_030CCC_REG_MASK(0xff) | - S_030CCC_REG_DROP_ON_STALL(0)); - - /* Enable SQTT perf counters for all CUs. */ - radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK, - S_030CD0_SH0_MASK(0xffff) | - S_030CD0_SH1_MASK(0xffff)); - - radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff); - - radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER, - S_030CEC_HIWATER(4)); - - if (sctx->gfx_level == GFX9) { - /* Reset thread trace status errors. */ - radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS, - S_030CE8_UTC_ERROR(0)); - } - - /* Enable the thread trace mode. */ - uint32_t thread_trace_mode = - S_030CD8_MASK_PS(1) | - S_030CD8_MASK_VS(1) | - S_030CD8_MASK_GS(1) | - S_030CD8_MASK_ES(1) | - S_030CD8_MASK_HS(1) | - S_030CD8_MASK_LS(1) | - S_030CD8_MASK_CS(1) | - S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */ - S_030CD8_MODE(1); - - if (sctx->gfx_level == GFX9) { - /* Count SQTT traffic in TCC perf counters. */ - thread_trace_mode |= S_030CD8_TC_PERF_EN(1); - } - - radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, - thread_trace_mode); + radeon_set_privileged_config_reg( + R_008D04_SQ_THREAD_TRACE_BUF0_SIZE, + S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32)); + + radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, + shifted_va); + + radeon_set_privileged_config_reg( + R_008D14_SQ_THREAD_TRACE_MASK, + S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) | + S_008D14_WGP_SEL(wgp) | S_008D14_SIMD_SEL(0)); + + radeon_set_privileged_config_reg( + R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, + S_008D18_REG_INCLUDE(token_mask) | + S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF)); } - } - - /* Restore global broadcasting. */ - radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, - S_030800_SE_BROADCAST_WRITES(1) | - S_030800_SH_BROADCAST_WRITES(1) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); - - /* Start the thread trace with a different event based on the queue. */ - if (queue_family_index == AMD_IP_COMPUTE) { - radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, - S_00B878_THREAD_TRACE_ENABLE(1)); - } else { - radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0)); - } - radeon_end(); -} -static const uint32_t gfx9_thread_trace_info_regs[] = -{ - R_030CE4_SQ_THREAD_TRACE_WPTR, - R_030CE8_SQ_THREAD_TRACE_STATUS, - R_030CF0_SQ_THREAD_TRACE_CNTR, -}; + /* Should be emitted last (it enables thread traces). */ + uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) | + S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | + /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1); + + if (sctx->gfx_level == GFX10_3) + ctrl |= S_008D1C_LOWATER_OFFSET(4); + + ctrl |= S_008D1C_AUTO_FLUSH_MODE( + sctx->screen->info.has_sqtt_auto_flush_mode_bug); + + switch (sctx->gfx_level) { + case GFX10: + case GFX10_3: + ctrl |= S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) | + S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0); + radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl); + break; + case GFX11: + ctrl |= S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) | + S_0367B0_REG_AT_HWM(2); + radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, ctrl); + break; + default: + assert(false); + } + } else { + /* Order seems important for the following 4 registers. */ + radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2, + S_030CDC_ADDR_HI(shifted_va >> 32)); -static const uint32_t gfx10_thread_trace_info_regs[] = -{ - R_008D10_SQ_THREAD_TRACE_WPTR, - R_008D20_SQ_THREAD_TRACE_STATUS, - R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR, -}; + radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va); -static const uint32_t gfx11_thread_trace_info_regs[] = -{ - R_0367BC_SQ_THREAD_TRACE_WPTR, - R_0367D0_SQ_THREAD_TRACE_STATUS, - R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR, -}; + radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE, + S_030CC4_SIZE(shifted_size)); + radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL, + S_030CD4_RESET_BUFFER(1)); -static void -si_copy_thread_trace_info_regs(struct si_context* sctx, - struct radeon_cmdbuf *cs, - unsigned se_index) -{ - const uint32_t *thread_trace_info_regs = NULL; - - switch (sctx->gfx_level) { - case GFX10_3: - case GFX10: - thread_trace_info_regs = gfx10_thread_trace_info_regs; - break; - case GFX11: - thread_trace_info_regs = gfx11_thread_trace_info_regs; - break; - case GFX9: - thread_trace_info_regs = gfx9_thread_trace_info_regs; - break; - default: - unreachable("Unsupported gfx_level"); - } + uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) | + S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) | + S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | + S_030CC8_SPI_STALL_EN(1) | S_030CC8_SQ_STALL_EN(1); - /* Get the VA where the info struct is stored for this SE. */ - uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo); - uint64_t info_va = ac_thread_trace_get_info_va(va, se_index); + radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask); - radeon_begin(cs); + /* Trace all tokens and registers. */ + radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK, + S_030CCC_TOKEN_MASK(0xbfff) | + S_030CCC_REG_MASK(0xff) | + S_030CCC_REG_DROP_ON_STALL(0)); - /* Copy back the info struct one DWORD at a time. */ - for (unsigned i = 0; i < 3; i++) { - radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); - radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | - COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | - COPY_DATA_WR_CONFIRM); - radeon_emit(thread_trace_info_regs[i] >> 2); - radeon_emit(0); /* unused */ - radeon_emit((info_va + i * 4)); - radeon_emit((info_va + i * 4) >> 32); - } + /* Enable SQTT perf counters for all CUs. */ + radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK, + S_030CD0_SH0_MASK(0xffff) | + S_030CD0_SH1_MASK(0xffff)); - if (sctx->gfx_level == GFX11) { - /* On GFX11, WPTR is incremented from the offset of the current buffer base address and it - * needs to be subtracted to get the correct offset: - * - * 1) get the current buffer base address for this SE - * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned - * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits - */ - uint64_t data_va = - ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se_index); - uint64_t shifted_data_va = (data_va >> 5); - uint64_t init_wptr_value = shifted_data_va & 0x1fffffff; - - radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0)); - radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32)); - radeon_emit(info_va); - radeon_emit(info_va >> 32); - radeon_emit(init_wptr_value); - radeon_emit(init_wptr_value >> 32); - radeon_emit(0); - radeon_emit(0); - radeon_emit(0); - } - - radeon_end(); -} + radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff); + radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER, + S_030CEC_HIWATER(4)); + if (sctx->gfx_level == GFX9) { + /* Reset thread trace status errors. */ + radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS, + S_030CE8_UTC_ERROR(0)); + } -static void -si_emit_thread_trace_stop(struct si_context *sctx, - struct radeon_cmdbuf *cs, - uint32_t queue_family_index) -{ - unsigned max_se = sctx->screen->info.max_se; + /* Enable the thread trace mode. */ + uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | + S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) | + S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | + S_030CD8_MASK_CS(1) | + S_030CD8_AUTOFLUSH_EN( + 1) | /* periodically flush SQTT data to memory */ + S_030CD8_MODE(1); + + if (sctx->gfx_level == GFX9) { + /* Count SQTT traffic in TCC perf counters. */ + sqtt_mode |= S_030CD8_TC_PERF_EN(1); + } - radeon_begin(cs); + radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode); + } + } - /* Stop the thread trace with a different event based on the queue. */ - if (queue_family_index == AMD_IP_COMPUTE) { - radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, - S_00B878_THREAD_TRACE_ENABLE(0)); - } else { - radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0)); - } + /* Restore global broadcasting. */ + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, + S_030800_SE_BROADCAST_WRITES(1) | + S_030800_SH_BROADCAST_WRITES(1) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); - radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0)); - radeon_end(); + /* Start the thread trace with a different event based on the queue. */ + if (queue_family_index == AMD_IP_COMPUTE) { + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, + S_00B878_THREAD_TRACE_ENABLE(1)); + } else { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0)); + } + radeon_end(); +} - if (sctx->screen->info.has_sqtt_rb_harvest_bug) { - /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */ - sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_CS_PARTIAL_FLUSH; - sctx->emit_cache_flush(sctx, cs); - } +static const uint32_t gfx9_sqtt_info_regs[] = { + R_030CE4_SQ_THREAD_TRACE_WPTR, + R_030CE8_SQ_THREAD_TRACE_STATUS, + R_030CF0_SQ_THREAD_TRACE_CNTR, +}; - for (unsigned se = 0; se < max_se; se++) { - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) - continue; +static const uint32_t gfx10_sqtt_info_regs[] = { + R_008D10_SQ_THREAD_TRACE_WPTR, + R_008D20_SQ_THREAD_TRACE_STATUS, + R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR, +}; - radeon_begin(cs); +static const uint32_t gfx11_sqtt_info_regs[] = { + R_0367BC_SQ_THREAD_TRACE_WPTR, + R_0367D0_SQ_THREAD_TRACE_STATUS, + R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR, +}; - /* Target SEi and SH0. */ - radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, - S_030800_SE_INDEX(se) | - S_030800_SH_INDEX(0) | - S_030800_INSTANCE_BROADCAST_WRITES(1)); +static void si_copy_sqtt_info_regs(struct si_context *sctx, + struct radeon_cmdbuf *cs, + unsigned se_index) { + const uint32_t *sqtt_info_regs = NULL; + + switch (sctx->gfx_level) { + case GFX10_3: + case GFX10: + sqtt_info_regs = gfx10_sqtt_info_regs; + break; + case GFX11: + sqtt_info_regs = gfx11_sqtt_info_regs; + break; + case GFX9: + sqtt_info_regs = gfx9_sqtt_info_regs; + break; + default: + unreachable("Unsupported gfx_level"); + } + + /* Get the VA where the info struct is stored for this SE. */ + uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo); + uint64_t info_va = ac_sqtt_get_info_va(va, se_index); + + radeon_begin(cs); + + /* Copy back the info struct one DWORD at a time. */ + for (unsigned i = 0; i < 3; i++) { + radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0)); + radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) | + COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM); + radeon_emit(sqtt_info_regs[i] >> 2); + radeon_emit(0); /* unused */ + radeon_emit((info_va + i * 4)); + radeon_emit((info_va + i * 4) >> 32); + } + + if (sctx->gfx_level == GFX11) { + /* On GFX11, WPTR is incremented from the offset of the current buffer base + * address and it needs to be subtracted to get the correct offset: + * + * 1) get the current buffer base address for this SE + * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned + * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits + */ + uint64_t data_va = + ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se_index); + uint64_t shifted_data_va = (data_va >> 5); + uint64_t init_wptr_value = shifted_data_va & 0x1fffffff; + + radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0)); + radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32)); + radeon_emit(info_va); + radeon_emit(info_va >> 32); + radeon_emit(init_wptr_value); + radeon_emit(init_wptr_value >> 32); + radeon_emit(0); + radeon_emit(0); + radeon_emit(0); + } + + radeon_end(); +} - if (sctx->gfx_level >= GFX10) { - uint32_t tt_status_reg = sctx->gfx_level >= GFX11 ? R_0367D0_SQ_THREAD_TRACE_STATUS : - R_008D20_SQ_THREAD_TRACE_STATUS; - if (!sctx->screen->info.has_sqtt_rb_harvest_bug) { - /* Make sure to wait for the trace buffer. */ - radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(tt_status_reg >> 2); /* register */ - radeon_emit(0); - radeon_emit(0); /* reference value */ - radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_FINISH_DONE : ~C_008D20_FINISH_DONE); /* mask */ - radeon_emit(4); /* poll interval */ - } - - /* Disable the thread trace mode. */ - if (sctx->gfx_level >= GFX11) - radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0)); - else - radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0)); - - /* Wait for thread trace completion. */ - radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(tt_status_reg >> 2); /* register */ - radeon_emit(0); - radeon_emit(0); /* reference value */ - radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY : ~C_008D20_BUSY); /* mask */ - radeon_emit(4); /* poll interval */ - } else { - /* Disable the thread trace mode. */ - radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, - S_030CD8_MODE(0)); - - /* Wait for thread trace completion. */ - radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); - radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */ - radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */ - radeon_emit(0); - radeon_emit(0); /* reference value */ - radeon_emit(~C_030CE8_BUSY); /* mask */ - radeon_emit(4); /* poll interval */ +static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs, + uint32_t queue_family_index) { + unsigned max_se = sctx->screen->info.max_se; + + radeon_begin(cs); + + /* Stop the thread trace with a different event based on the queue. */ + if (queue_family_index == AMD_IP_COMPUTE) { + radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE, + S_00B878_THREAD_TRACE_ENABLE(0)); + } else { + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0)); + } + + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0)); + radeon_end(); + + if (sctx->screen->info.has_sqtt_rb_harvest_bug) { + /* Some chips with disabled RBs should wait for idle because FINISH_DONE + * doesn't work. */ + sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_CS_PARTIAL_FLUSH; + sctx->emit_cache_flush(sctx, cs); + } + + for (unsigned se = 0; se < max_se; se++) { + if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + continue; + + radeon_begin(cs); + + /* Target SEi and SH0. */ + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, + S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | + S_030800_INSTANCE_BROADCAST_WRITES(1)); + + if (sctx->gfx_level >= GFX10) { + uint32_t tt_status_reg = sctx->gfx_level >= GFX11 + ? R_0367D0_SQ_THREAD_TRACE_STATUS + : R_008D20_SQ_THREAD_TRACE_STATUS; + if (!sctx->screen->info.has_sqtt_rb_harvest_bug) { + /* Make sure to wait for the trace buffer. */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal + to the reference value */ + radeon_emit(tt_status_reg >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(sctx->gfx_level >= GFX11 + ? ~C_0367D0_FINISH_DONE + : ~C_008D20_FINISH_DONE); /* mask */ + radeon_emit(4); /* poll interval */ } - radeon_end(); - si_copy_thread_trace_info_regs(sctx, cs, se); - } - - /* Restore global broadcasting. */ - radeon_begin_again(cs); - radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, - S_030800_SE_BROADCAST_WRITES(1) | + /* Disable the thread trace mode. */ + if (sctx->gfx_level >= GFX11) + radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0)); + else + radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, + S_008D1C_MODE(0)); + + /* Wait for thread trace completion. */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to + the reference value */ + radeon_emit(tt_status_reg >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY + : ~C_008D20_BUSY); /* mask */ + radeon_emit(4); /* poll interval */ + } else { + /* Disable the thread trace mode. */ + radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0)); + + /* Wait for thread trace completion. */ + radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to + the reference value */ + radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */ + radeon_emit(0); + radeon_emit(0); /* reference value */ + radeon_emit(~C_030CE8_BUSY); /* mask */ + radeon_emit(4); /* poll interval */ + } + radeon_end(); + + si_copy_sqtt_info_regs(sctx, cs, se); + } + + /* Restore global broadcasting. */ + radeon_begin_again(cs); + radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX, + S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1)); - radeon_end(); + radeon_end(); } -static void -si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs) -{ - struct radeon_winsys *ws = sctx->ws; - - radeon_begin(cs); - - switch (family) { - case AMD_IP_GFX: - radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); - radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); - break; - case AMD_IP_COMPUTE: - radeon_emit(PKT3(PKT3_NOP, 0, 0)); - radeon_emit(0); - break; - } - radeon_end(); - - ws->cs_add_buffer(cs, - sctx->thread_trace->bo, - RADEON_USAGE_READWRITE, - RADEON_DOMAIN_VRAM); - if (sctx->spm.bo) - ws->cs_add_buffer(cs, - sctx->spm.bo, - RADEON_USAGE_READWRITE, - RADEON_DOMAIN_VRAM); - - si_cp_dma_wait_for_idle(sctx, cs); - - /* Make sure to wait-for-idle before starting SQTT. */ - sctx->flags |= - SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME; - sctx->emit_cache_flush(sctx, cs); - - si_inhibit_clockgating(sctx, cs, true); - - /* Enable SQG events that collects thread trace data. */ - si_emit_spi_config_cntl(sctx, cs, true); - - if (sctx->spm.bo) { - si_pc_emit_spm_reset(cs); - si_pc_emit_shaders(cs, 0x7f); - si_emit_spm_setup(sctx, cs); - } - - si_emit_thread_trace_start(sctx, cs, family); - - if (sctx->spm.bo) - si_pc_emit_spm_start(cs); +static void si_sqtt_start(struct si_context *sctx, int family, + struct radeon_cmdbuf *cs) { + struct radeon_winsys *ws = sctx->ws; + + radeon_begin(cs); + + switch (family) { + case AMD_IP_GFX: + radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); + radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); + break; + case AMD_IP_COMPUTE: + radeon_emit(PKT3(PKT3_NOP, 0, 0)); + radeon_emit(0); + break; + } + radeon_end(); + + ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); + if (sctx->spm.bo) + ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); + + si_cp_dma_wait_for_idle(sctx, cs); + + /* Make sure to wait-for-idle before starting SQTT. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 | + SI_CONTEXT_PFP_SYNC_ME; + sctx->emit_cache_flush(sctx, cs); + + si_inhibit_clockgating(sctx, cs, true); + + /* Enable SQG events that collects thread trace data. */ + si_emit_spi_config_cntl(sctx, cs, true); + + if (sctx->spm.bo) { + si_pc_emit_spm_reset(cs); + si_pc_emit_shaders(cs, 0x7f); + si_emit_spm_setup(sctx, cs); + } + + si_emit_sqtt_start(sctx, cs, family); + + if (sctx->spm.bo) + si_pc_emit_spm_start(cs); } -static void -si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs) -{ - struct radeon_winsys *ws = sctx->ws; +static void si_sqtt_stop(struct si_context *sctx, int family, + struct radeon_cmdbuf *cs) { + struct radeon_winsys *ws = sctx->ws; - radeon_begin(cs); + radeon_begin(cs); - switch (family) { - case AMD_IP_GFX: - radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); - radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); - radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); - break; - case AMD_IP_COMPUTE: - radeon_emit(PKT3(PKT3_NOP, 0, 0)); - radeon_emit(0); - break; - } - radeon_end(); + switch (family) { + case AMD_IP_GFX: + radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); + radeon_emit(CC0_UPDATE_LOAD_ENABLES(1)); + radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1)); + break; + case AMD_IP_COMPUTE: + radeon_emit(PKT3(PKT3_NOP, 0, 0)); + radeon_emit(0); + break; + } + radeon_end(); - ws->cs_add_buffer(cs, - sctx->thread_trace->bo, - RADEON_USAGE_READWRITE, - RADEON_DOMAIN_VRAM); + ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); - if (sctx->spm.bo) - ws->cs_add_buffer(cs, - sctx->spm.bo, - RADEON_USAGE_READWRITE, - RADEON_DOMAIN_VRAM); + if (sctx->spm.bo) + ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE, + RADEON_DOMAIN_VRAM); - si_cp_dma_wait_for_idle(sctx, cs); + si_cp_dma_wait_for_idle(sctx, cs); - if (sctx->spm.bo) - si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters, - sctx->screen->info.never_send_perfcounter_stop); + if (sctx->spm.bo) + si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters, + sctx->screen->info.never_send_perfcounter_stop); - /* Make sure to wait-for-idle before stopping SQTT. */ - sctx->flags |= - SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME; - sctx->emit_cache_flush(sctx, cs); + /* Make sure to wait-for-idle before stopping SQTT. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 | + SI_CONTEXT_PFP_SYNC_ME; + sctx->emit_cache_flush(sctx, cs); - si_emit_thread_trace_stop(sctx, cs, family); + si_emit_sqtt_stop(sctx, cs, family); - if (sctx->spm.bo) - si_pc_emit_spm_reset(cs); + if (sctx->spm.bo) + si_pc_emit_spm_reset(cs); - /* Restore previous state by disabling SQG events. */ - si_emit_spi_config_cntl(sctx, cs, false); + /* Restore previous state by disabling SQG events. */ + si_emit_spi_config_cntl(sctx, cs, false); - si_inhibit_clockgating(sctx, cs, false); + si_inhibit_clockgating(sctx, cs, false); } - -static void -si_thread_trace_init_cs(struct si_context *sctx) -{ - struct radeon_winsys *ws = sctx->ws; - - /* Thread trace start CS (only handles AMD_IP_GFX). */ - sctx->thread_trace->start_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf); - if (!ws->cs_create(sctx->thread_trace->start_cs[AMD_IP_GFX], - sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) { - free(sctx->thread_trace->start_cs[AMD_IP_GFX]); - sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL; - return; - } - - si_thread_trace_start(sctx, AMD_IP_GFX, sctx->thread_trace->start_cs[AMD_IP_GFX]); - - /* Thread trace stop CS. */ - sctx->thread_trace->stop_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf); - if (!ws->cs_create(sctx->thread_trace->stop_cs[AMD_IP_GFX], - sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) { - free(sctx->thread_trace->start_cs[AMD_IP_GFX]); - sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL; - free(sctx->thread_trace->stop_cs[AMD_IP_GFX]); - sctx->thread_trace->stop_cs[AMD_IP_GFX] = NULL; - return; - } - - si_thread_trace_stop(sctx, AMD_IP_GFX, sctx->thread_trace->stop_cs[AMD_IP_GFX]); +static void si_sqtt_init_cs(struct si_context *sctx) { + struct radeon_winsys *ws = sctx->ws; + + /* Thread trace start CS (only handles AMD_IP_GFX). */ + sctx->sqtt->start_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf); + if (!ws->cs_create(sctx->sqtt->start_cs[AMD_IP_GFX], sctx->ctx, AMD_IP_GFX, + NULL, NULL, 0)) { + free(sctx->sqtt->start_cs[AMD_IP_GFX]); + sctx->sqtt->start_cs[AMD_IP_GFX] = NULL; + return; + } + + si_sqtt_start(sctx, AMD_IP_GFX, sctx->sqtt->start_cs[AMD_IP_GFX]); + + /* Thread trace stop CS. */ + sctx->sqtt->stop_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf); + if (!ws->cs_create(sctx->sqtt->stop_cs[AMD_IP_GFX], sctx->ctx, AMD_IP_GFX, + NULL, NULL, 0)) { + free(sctx->sqtt->start_cs[AMD_IP_GFX]); + sctx->sqtt->start_cs[AMD_IP_GFX] = NULL; + free(sctx->sqtt->stop_cs[AMD_IP_GFX]); + sctx->sqtt->stop_cs[AMD_IP_GFX] = NULL; + return; + } + + si_sqtt_stop(sctx, AMD_IP_GFX, sctx->sqtt->stop_cs[AMD_IP_GFX]); } -static void -si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs) -{ - struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[AMD_IP_GFX]; - sctx->ws->cs_flush(cs, 0, NULL); +static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) { + struct radeon_cmdbuf *cs = sctx->sqtt->start_cs[AMD_IP_GFX]; + sctx->ws->cs_flush(cs, 0, NULL); } -static void -si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs) -{ - struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[AMD_IP_GFX]; - sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence); +static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) { + struct radeon_cmdbuf *cs = sctx->sqtt->stop_cs[AMD_IP_GFX]; + sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence); } -static bool -si_get_thread_trace(struct si_context *sctx, - struct ac_thread_trace *thread_trace) -{ - unsigned max_se = sctx->screen->info.max_se; +static bool si_get_sqtt_trace(struct si_context *sctx, + struct ac_sqtt_trace *sqtt) { + unsigned max_se = sctx->screen->info.max_se; - memset(thread_trace, 0, sizeof(*thread_trace)); + memset(sqtt, 0, sizeof(*sqtt)); - sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo, - NULL, - PIPE_MAP_READ); + sctx->sqtt->ptr = + sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ); - if (!sctx->thread_trace->ptr) - return false; + if (!sctx->sqtt->ptr) + return false; - if (!ac_sqtt_get_trace(sctx->thread_trace, &sctx->screen->info, - thread_trace)) { - void *thread_trace_ptr = sctx->thread_trace->ptr; + if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) { + void *sqtt_ptr = sctx->sqtt->ptr; - for (unsigned se = 0; se < max_se; se++) { - uint64_t info_offset = ac_thread_trace_get_info_offset(se); - void *info_ptr = thread_trace_ptr + info_offset; - struct ac_thread_trace_info *info = - (struct ac_thread_trace_info *)info_ptr; + for (unsigned se = 0; se < max_se; se++) { + uint64_t info_offset = ac_sqtt_get_info_offset(se); + void *info_ptr = sqtt_ptr + info_offset; + struct ac_sqtt_data_info *info = (struct ac_sqtt_data_info *)info_ptr; - if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) - continue; - - if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) { - uint32_t expected_size = - ac_get_expected_buffer_size(&sctx->screen->info, info); - uint32_t available_size = (info->cur_offset * 32) / 1024; - - fprintf(stderr, "Failed to get the thread trace " - "because the buffer is too small. The " - "hardware needs %d KB but the " - "buffer size is %d KB.\n", - expected_size, available_size); - fprintf(stderr, "Please update the buffer size with " - "AMD_THREAD_TRACE_BUFFER_SIZE=\n"); - return false; - } + if (ac_sqtt_se_is_disabled(&sctx->screen->info, se)) + continue; + + if (!ac_is_sqtt_complete(&sctx->screen->info, sctx->sqtt, info)) { + uint32_t expected_size = + ac_get_expected_buffer_size(&sctx->screen->info, info); + uint32_t available_size = (info->cur_offset * 32) / 1024; + + fprintf(stderr, + "Failed to get the thread trace " + "because the buffer is too small. The " + "hardware needs %d KB but the " + "buffer size is %d KB.\n", + expected_size, available_size); + fprintf(stderr, "Please update the buffer size with " + "AMD_THREAD_TRACE_BUFFER_SIZE=\n"); + return false; } - } + } + } - return true; + return true; } - -bool -si_init_thread_trace(struct si_context *sctx) -{ - static bool warn_once = true; - if (warn_once) { - fprintf(stderr, "*************************************************\n"); - fprintf(stderr, "* WARNING: Thread trace support is experimental *\n"); - fprintf(stderr, "*************************************************\n"); - warn_once = false; - } - - sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data); - - if (sctx->gfx_level < GFX8) { - fprintf(stderr, "GPU hardware not supported: refer to " - "the RGP documentation for the list of " - "supported GPUs!\n"); - return false; - } - - if (sctx->gfx_level > GFX11) { - fprintf(stderr, "radeonsi: Thread trace is not supported " - "for that GPU!\n"); - return false; - } - - /* Default buffer size set to 32MB per SE. */ - sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024; - sctx->thread_trace->start_frame = 10; - - const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER"); - if (trigger) { - sctx->thread_trace->start_frame = atoi(trigger); - if (sctx->thread_trace->start_frame <= 0) { - /* This isn't a frame number, must be a file */ - sctx->thread_trace->trigger_file = strdup(trigger); - sctx->thread_trace->start_frame = -1; - } - } - - if (!si_thread_trace_init_bo(sctx)) - return false; - - ac_thread_trace_init(sctx->thread_trace); - - if (sctx->gfx_level >= GFX10 && - debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) { - /* Limit SPM counters to GFX10 and GFX10_3 for now */ - ASSERTED bool r = si_spm_init(sctx); - assert(r); - } - - si_thread_trace_init_cs(sctx); - - sctx->sqtt_next_event = EventInvalid; - - return true; +bool si_init_sqtt(struct si_context *sctx) { + static bool warn_once = true; + if (warn_once) { + fprintf(stderr, "*************************************************\n"); + fprintf(stderr, "* WARNING: Thread trace support is experimental *\n"); + fprintf(stderr, "*************************************************\n"); + warn_once = false; + } + + sctx->sqtt = CALLOC_STRUCT(ac_sqtt); + + if (sctx->gfx_level < GFX8) { + fprintf(stderr, "GPU hardware not supported: refer to " + "the RGP documentation for the list of " + "supported GPUs!\n"); + return false; + } + + if (sctx->gfx_level > GFX11) { + fprintf(stderr, "radeonsi: Thread trace is not supported " + "for that GPU!\n"); + return false; + } + + /* Default buffer size set to 32MB per SE. */ + sctx->sqtt->buffer_size = + debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024; + sctx->sqtt->start_frame = 10; + + const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER"); + if (trigger) { + sctx->sqtt->start_frame = atoi(trigger); + if (sctx->sqtt->start_frame <= 0) { + /* This isn't a frame number, must be a file */ + sctx->sqtt->trigger_file = strdup(trigger); + sctx->sqtt->start_frame = -1; + } + } + + if (!si_sqtt_init_bo(sctx)) + return false; + + ac_sqtt_init(sctx->sqtt); + + if (sctx->gfx_level >= GFX10 && + debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) { + /* Limit SPM counters to GFX10 and GFX10_3 for now */ + ASSERTED bool r = si_spm_init(sctx); + assert(r); + } + + si_sqtt_init_cs(sctx); + + sctx->sqtt_next_event = EventInvalid; + + return true; } -void -si_destroy_thread_trace(struct si_context *sctx) -{ - struct si_screen *sscreen = sctx->screen; - struct pb_buffer *bo = sctx->thread_trace->bo; - radeon_bo_reference(sctx->screen->ws, &bo, NULL); - - if (sctx->thread_trace->trigger_file) - free(sctx->thread_trace->trigger_file); - - sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[AMD_IP_GFX]); - sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[AMD_IP_GFX]); - - struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation; - struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events; - struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object; - list_for_each_entry_safe(struct rgp_pso_correlation_record, record, - &pso_correlation->record, list) { - list_del(&record->list); - free(record); - } - - list_for_each_entry_safe(struct rgp_loader_events_record, record, - &loader_events->record, list) { - list_del(&record->list); - free(record); - } - - list_for_each_entry_safe(struct rgp_code_object_record, record, - &code_object->record, list) { - uint32_t mask = record->shader_stages_mask; - int i; - - /* Free the disassembly. */ - while (mask) { - i = u_bit_scan(&mask); - free(record->shader_data[i].code); - } - list_del(&record->list); - free(record); - } - - ac_thread_trace_finish(sctx->thread_trace); - - hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) { - struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data; - si_resource_reference(&pipeline->bo, NULL); - FREE(pipeline); - } - - free(sctx->thread_trace); - sctx->thread_trace = NULL; - - if (sctx->spm.bo) - si_spm_finish(sctx); +void si_destroy_sqtt(struct si_context *sctx) { + struct si_screen *sscreen = sctx->screen; + struct pb_buffer *bo = sctx->sqtt->bo; + radeon_bo_reference(sctx->screen->ws, &bo, NULL); + + if (sctx->sqtt->trigger_file) + free(sctx->sqtt->trigger_file); + + sscreen->ws->cs_destroy(sctx->sqtt->start_cs[AMD_IP_GFX]); + sscreen->ws->cs_destroy(sctx->sqtt->stop_cs[AMD_IP_GFX]); + + struct rgp_pso_correlation *pso_correlation = + &sctx->sqtt->rgp_pso_correlation; + struct rgp_loader_events *loader_events = &sctx->sqtt->rgp_loader_events; + struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object; + list_for_each_entry_safe(struct rgp_pso_correlation_record, record, + &pso_correlation->record, list) { + list_del(&record->list); + free(record); + } + + list_for_each_entry_safe(struct rgp_loader_events_record, record, + &loader_events->record, list) { + list_del(&record->list); + free(record); + } + + list_for_each_entry_safe(struct rgp_code_object_record, record, + &code_object->record, list) { + uint32_t mask = record->shader_stages_mask; + int i; + + /* Free the disassembly. */ + while (mask) { + i = u_bit_scan(&mask); + free(record->shader_data[i].code); + } + list_del(&record->list); + free(record); + } + + ac_sqtt_finish(sctx->sqtt); + + hash_table_foreach(sctx->sqtt->pipeline_bos->table, entry) { + struct si_sqtt_fake_pipeline *pipeline = + (struct si_sqtt_fake_pipeline *)entry->data; + si_resource_reference(&pipeline->bo, NULL); + FREE(pipeline); + } + + free(sctx->sqtt); + sctx->sqtt = NULL; + + if (sctx->spm.bo) + si_spm_finish(sctx); } static uint64_t num_frames = 0; -void -si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs) -{ - /* Should we enable SQTT yet? */ - if (!sctx->thread_trace_enabled) { - bool frame_trigger = num_frames == sctx->thread_trace->start_frame; - bool file_trigger = false; - if (sctx->thread_trace->trigger_file && - access(sctx->thread_trace->trigger_file, W_OK) == 0) { - if (unlink(sctx->thread_trace->trigger_file) == 0) { - file_trigger = true; - } else { - /* Do not enable tracing if we cannot remove the file, - * because by then we'll trace every frame. - */ - fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n"); - } +void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) { + /* Should we enable SQTT yet? */ + if (!sctx->sqtt_enabled) { + bool frame_trigger = num_frames == sctx->sqtt->start_frame; + bool file_trigger = false; + if (sctx->sqtt->trigger_file && + access(sctx->sqtt->trigger_file, W_OK) == 0) { + if (unlink(sctx->sqtt->trigger_file) == 0) { + file_trigger = true; + } else { + /* Do not enable tracing if we cannot remove the file, + * because by then we'll trace every frame. + */ + fprintf( + stderr, + "radeonsi: could not remove thread trace trigger file, ignoring\n"); } + } - if (frame_trigger || file_trigger) { - /* Wait for last submission */ - sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE); + if (frame_trigger || file_trigger) { + /* Wait for last submission */ + sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, + PIPE_TIMEOUT_INFINITE); - /* Start SQTT */ - si_begin_thread_trace(sctx, rcs); + /* Start SQTT */ + si_begin_sqtt(sctx, rcs); - sctx->thread_trace_enabled = true; - sctx->thread_trace->start_frame = -1; + sctx->sqtt_enabled = true; + sctx->sqtt->start_frame = -1; - /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called - * for the current "pipeline". - */ - sctx->do_update_shaders = true; - } - } else { - struct ac_thread_trace thread_trace = {0}; - - /* Stop SQTT */ - si_end_thread_trace(sctx, rcs); - sctx->thread_trace_enabled = false; - sctx->thread_trace->start_frame = -1; - assert (sctx->last_sqtt_fence); - - /* Wait for SQTT to finish and read back the bo */ - if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) && - si_get_thread_trace(sctx, &thread_trace)) { - struct ac_spm_trace spm_trace; - - /* Map the SPM counter buffer */ - if (sctx->spm.bo) { - sctx->spm.ptr = sctx->ws->buffer_map(sctx->ws, sctx->spm.bo, - NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY); - ac_spm_get_trace(&sctx->spm, &spm_trace); - } - - ac_dump_rgp_capture(&sctx->screen->info, &thread_trace, sctx->spm.bo ? &spm_trace : NULL); - - if (sctx->spm.ptr) - sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo); - } else { - fprintf(stderr, "Failed to read the trace\n"); + /* Force shader update to make sure si_sqtt_describe_pipeline_bind is + * called for the current "pipeline". + */ + sctx->do_update_shaders = true; + } + } else { + struct ac_sqtt_trace sqtt_trace = {0}; + + /* Stop SQTT */ + si_end_sqtt(sctx, rcs); + sctx->sqtt_enabled = false; + sctx->sqtt->start_frame = -1; + assert(sctx->last_sqtt_fence); + + /* Wait for SQTT to finish and read back the bo */ + if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, + PIPE_TIMEOUT_INFINITE) && + si_get_sqtt_trace(sctx, &sqtt_trace)) { + struct ac_spm_trace spm_trace; + + /* Map the SPM counter buffer */ + if (sctx->spm.bo) { + sctx->spm.ptr = sctx->ws->buffer_map( + sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY); + ac_spm_get_trace(&sctx->spm, &spm_trace); } - } - num_frames++; -} + ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace, + sctx->spm.bo ? &spm_trace : NULL); + if (sctx->spm.ptr) + sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo); + } else { + fprintf(stderr, "Failed to read the trace\n"); + } + } -static void -si_emit_thread_trace_userdata(struct si_context* sctx, - struct radeon_cmdbuf *cs, - const void *data, uint32_t num_dwords) -{ - const uint32_t *dwords = (uint32_t *)data; + num_frames++; +} - radeon_begin(cs); +static void si_emit_sqtt_userdata(struct si_context *sctx, + struct radeon_cmdbuf *cs, const void *data, + uint32_t num_dwords) { + const uint32_t *dwords = (uint32_t *)data; - while (num_dwords > 0) { - uint32_t count = MIN2(num_dwords, 2); + radeon_begin(cs); - /* Without the perfctr bit the CP might not always pass the - * write on correctly. */ - radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->gfx_level >= GFX10); + while (num_dwords > 0) { + uint32_t count = MIN2(num_dwords, 2); - radeon_emit_array(dwords, count); + /* Without the perfctr bit the CP might not always pass the + * write on correctly. */ + radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, + sctx->gfx_level >= GFX10); - dwords += count; - num_dwords -= count; - } - radeon_end(); + radeon_emit_array(dwords, count); + + dwords += count; + num_dwords -= count; + } + radeon_end(); } static void @@ -913,7 +872,7 @@ si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs, marker.instance_offset_reg_idx = instance_offset_user_data; marker.draw_index_reg_idx = draw_index_user_data; - si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4); sctx->sqtt_next_event = EventInvalid; } @@ -935,7 +894,7 @@ si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *r marker.thread_y = y; marker.thread_z = z; - si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4); sctx->sqtt_next_event = EventInvalid; } @@ -948,7 +907,7 @@ si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rc marker.cb_id = 0; marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */ - si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4); } void @@ -988,7 +947,7 @@ si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs, marker.flush_db = true; } - si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4); } void @@ -1002,7 +961,7 @@ si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs, marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT; marker.data_type = type; - si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4); } else { assert (str != NULL); struct rgp_sqtt_marker_user_event_with_length marker = { 0 }; @@ -1016,31 +975,26 @@ si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs, memcpy(buffer + sizeof(marker), str, len); buffer[sizeof(marker) + len - 1] = '\0'; - si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4); + si_emit_sqtt_userdata(sctx, rcs, buffer, + sizeof(marker) / 4 + marker.length / 4); } } - -bool -si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data, - uint64_t pipeline_hash) -{ - simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock); +bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt, + uint64_t pipeline_hash) { + simple_mtx_lock(&sqtt->rgp_pso_correlation.lock); list_for_each_entry_safe(struct rgp_pso_correlation_record, record, - &thread_trace_data->rgp_pso_correlation.record, list) { + &sqtt->rgp_pso_correlation.record, list) { if (record->pipeline_hash[0] == pipeline_hash) { - simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock); + simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock); return true; } - } - simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock); + simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock); return false; } - - static enum rgp_hardware_stages si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type stage) { @@ -1079,8 +1033,7 @@ si_sqtt_add_code_object(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute) { - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object; + struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object; struct rgp_code_object_record *record; record = malloc(sizeof(struct rgp_code_object_record)); @@ -1147,15 +1100,14 @@ si_sqtt_add_code_object(struct si_context* sctx, bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute) { - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - - assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash)); + assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash)); - bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash); + bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash); if (!result) return false; - result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address); + result = ac_sqtt_add_code_object_loader_event( + sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address); if (!result) return false; @@ -1170,7 +1122,7 @@ si_sqtt_describe_pipeline_bind(struct si_context* sctx, struct rgp_sqtt_marker_pipeline_bind marker = {0}; struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (likely(!sctx->thread_trace_enabled)) { + if (likely(!sctx->sqtt_enabled)) { return; } @@ -1180,5 +1132,5 @@ si_sqtt_describe_pipeline_bind(struct si_context* sctx, marker.api_pso_hash[0] = pipeline_hash; marker.api_pso_hash[1] = pipeline_hash >> 32; - si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4); + si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4); } diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 8bd203a..8284542 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -303,7 +303,7 @@ static bool si_update_shaders(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs); } - if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace)) { + if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt)) { /* Pretend the bound shaders form a vk pipeline. Include the scratch size in * the hash calculation to force re-emitting the pipeline if the scratch bo * changes. @@ -326,8 +326,7 @@ static bool si_update_shaders(struct si_context *sctx) } struct si_sqtt_fake_pipeline *pipeline = NULL; - struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace; - if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) { + if (!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline_code_hash)) { /* This is a new pipeline. Allocate a new bo to hold all the shaders. Without * this, shader code export process creates huge rgp files because RGP assumes * the shaders live sequentially in memory (shader N address = shader 0 + offset N) @@ -387,7 +386,7 @@ static bool si_update_shaders(struct si_context *sctx) } sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf); - _mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos, + _mesa_hash_table_u64_insert(sctx->sqtt->pipeline_bos, pipeline_code_hash, pipeline); si_sqtt_register_pipeline(sctx, pipeline, false); @@ -396,8 +395,8 @@ static bool si_update_shaders(struct si_context *sctx) si_resource_reference(&bo, NULL); } } else { - pipeline = (struct si_sqtt_fake_pipeline *) - _mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash); + pipeline = (struct si_sqtt_fake_pipeline *)_mesa_hash_table_u64_search( + sctx->sqtt->pipeline_bos, pipeline_code_hash); } assert(pipeline); @@ -1389,15 +1388,15 @@ static void si_emit_draw_registers(struct si_context *sctx, radeon_end(); } -#define EMIT_SQTT_END_DRAW do { \ - if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \ - radeon_begin(&sctx->gfx_cs); \ - radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); \ - radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \ - EVENT_INDEX(0)); \ - radeon_end(); \ - } \ - } while (0) +#define EMIT_SQTT_END_DRAW \ + do { \ + if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \ + radeon_begin(&sctx->gfx_cs); \ + radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0)); \ + radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0)); \ + radeon_end(); \ + } \ + } while (0) template ALWAYS_INLINE @@ -1411,7 +1410,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - if (unlikely(sctx->thread_trace_enabled)) { + if (unlikely(sctx->sqtt_enabled)) { si_sqtt_write_event_marker(sctx, &sctx->gfx_cs, sctx->sqtt_next_event, UINT_MAX, UINT_MAX, UINT_MAX); } -- 2.7.4