ac,radv,radeonsi: rename thread_trace to sqtt everywhere
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 26 Apr 2023 15:02:38 +0000 (17:02 +0200)
committer Marge Bot <emma+marge@anholt.net>
Fri, 28 Apr 2023 16:55:13 +0000 (16:55 +0000)
SQTT stands for SQ Thread Trace; the abbreviation is used because it's shorter.
Note that environment variables aren't renamed because renaming them might
break external applications.

This renames:
- ac_thread_trace_data to ac_sqtt (this is the main struct)
- ac_thread_trace_info to ac_sqtt_data_info
- ac_thread_trace_se to ac_sqtt_data_se
- ac_thread_trace to ac_sqtt_trace (this contains trace only)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22732>

19 files changed:
src/amd/common/ac_rgp.c
src/amd/common/ac_rgp.h
src/amd/common/ac_sqtt.c
src/amd/common/ac_sqtt.h
src/amd/vulkan/layers/radv_sqtt_layer.c
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_physical_device.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_sqtt.c
src/gallium/drivers/radeonsi/si_blit.c
src/gallium/drivers/radeonsi/si_clear.c
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_fence.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_sqtt.c
src/gallium/drivers/radeonsi/si_state_draw.cpp

index 69f98b5..86706db 100644 (file)
@@ -991,22 +991,21 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace,
 }
 
 #if defined(USE_LIBELF)
-static void ac_sqtt_dump_data(struct radeon_info *rad_info,
-                              struct ac_thread_trace *thread_trace,
-                              const struct ac_spm_trace *spm_trace,
-                              FILE *output)
+static void
+ac_sqtt_dump_data(struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace,
+                  const struct ac_spm_trace *spm_trace, FILE *output)
 {
    struct sqtt_file_chunk_asic_info asic_info = {0};
    struct sqtt_file_chunk_cpu_info cpu_info = {0};
    struct sqtt_file_chunk_api_info api_info = {0};
    struct sqtt_file_header header = {0};
    size_t file_offset = 0;
-   const struct rgp_code_object *rgp_code_object = thread_trace->rgp_code_object;
-   const struct rgp_loader_events *rgp_loader_events = thread_trace->rgp_loader_events;
-   const struct rgp_pso_correlation *rgp_pso_correlation = thread_trace->rgp_pso_correlation;
-   const struct rgp_queue_info *rgp_queue_info = thread_trace->rgp_queue_info;
-   const struct rgp_queue_event *rgp_queue_event = thread_trace->rgp_queue_event;
-   const struct rgp_clock_calibration *rgp_clock_calibration = thread_trace->rgp_clock_calibration;
+   const struct rgp_code_object *rgp_code_object = sqtt_trace->rgp_code_object;
+   const struct rgp_loader_events *rgp_loader_events = sqtt_trace->rgp_loader_events;
+   const struct rgp_pso_correlation *rgp_pso_correlation = sqtt_trace->rgp_pso_correlation;
+   const struct rgp_queue_info *rgp_queue_info = sqtt_trace->rgp_queue_info;
+   const struct rgp_queue_event *rgp_queue_event = sqtt_trace->rgp_queue_event;
+   const struct rgp_clock_calibration *rgp_clock_calibration = sqtt_trace->rgp_clock_calibration;
 
    /* SQTT header file. */
    ac_sqtt_fill_header(&header);
@@ -1145,10 +1144,10 @@ static void ac_sqtt_dump_data(struct radeon_info *rad_info,
       }
    }
 
-   if (thread_trace) {
-      for (unsigned i = 0; i < thread_trace->num_traces; i++) {
-         const struct ac_thread_trace_se *se = &thread_trace->traces[i];
-         const struct ac_thread_trace_info *info = &se->info;
+   if (sqtt_trace) {
+      for (unsigned i = 0; i < sqtt_trace->num_traces; i++) {
+         const struct ac_sqtt_data_se *se = &sqtt_trace->traces[i];
+         const struct ac_sqtt_data_info *info = &se->info;
          struct sqtt_file_chunk_sqtt_desc desc = {0};
          struct sqtt_file_chunk_sqtt_data data = {0};
          uint64_t size = info->cur_offset * 32; /* unit of 32 bytes */
@@ -1175,9 +1174,9 @@ static void ac_sqtt_dump_data(struct radeon_info *rad_info,
 }
 #endif
 
-int ac_dump_rgp_capture(struct radeon_info *info,
-                        struct ac_thread_trace *thread_trace,
-                        const struct ac_spm_trace *spm_trace)
+int
+ac_dump_rgp_capture(struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace,
+                    const struct ac_spm_trace *spm_trace)
 {
 #if !defined(USE_LIBELF)
    return -1;
@@ -1198,7 +1197,7 @@ int ac_dump_rgp_capture(struct radeon_info *info,
    if (!f)
       return -1;
 
-   ac_sqtt_dump_data(info, thread_trace, spm_trace, f);
+   ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f);
 
    fprintf(stderr, "RGP capture saved to '%s'\n", filename);
 
index c33129d..b8eda7a 100644 (file)
@@ -32,8 +32,8 @@
 #include "util/simple_mtx.h"
 
 struct radeon_info;
-struct ac_thread_trace;
-struct ac_thread_trace_data;
+struct ac_sqtt_trace;
+struct ac_sqtt;
 struct ac_spm_trace;
 
 enum rgp_hardware_stages {
@@ -188,10 +188,8 @@ struct rgp_clock_calibration {
    simple_mtx_t lock;
 };
 
-int
-ac_dump_rgp_capture(struct radeon_info *info,
-                    struct ac_thread_trace *thread_trace,
-                    const struct ac_spm_trace *spm_trace);
+int ac_dump_rgp_capture(struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace,
+                        const struct ac_spm_trace *spm_trace);
 
 void
 ac_rgp_file_write_elf_object(FILE *output, size_t file_elf_start,
index dec9f06..3684838 100644 (file)
 #include "util/os_time.h"
 
 uint64_t
-ac_thread_trace_get_info_offset(unsigned se)
+ac_sqtt_get_info_offset(unsigned se)
 {
-   return sizeof(struct ac_thread_trace_info) * se;
+   return sizeof(struct ac_sqtt_data_info) * se;
 }
 
 uint64_t
-ac_thread_trace_get_data_offset(const struct radeon_info *rad_info,
-                                const struct ac_thread_trace_data *data, unsigned se)
+ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
 {
    unsigned max_se = rad_info->max_se;
    uint64_t data_offset;
 
-   data_offset = align64(sizeof(struct ac_thread_trace_info) * max_se,
-               1 << SQTT_BUFFER_ALIGN_SHIFT);
+   data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
    data_offset += data->buffer_size * se;
 
    return data_offset;
 }
 
 uint64_t
-ac_thread_trace_get_info_va(uint64_t va, unsigned se)
+ac_sqtt_get_info_va(uint64_t va, unsigned se)
 {
-   return va + ac_thread_trace_get_info_offset(se);
+   return va + ac_sqtt_get_info_offset(se);
 }
 
 uint64_t
-ac_thread_trace_get_data_va(const struct radeon_info *rad_info,
-                            const struct ac_thread_trace_data *data, uint64_t va, unsigned se)
+ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data, uint64_t va,
+                    unsigned se)
 {
-   return va + ac_thread_trace_get_data_offset(rad_info, data, se);
+   return va + ac_sqtt_get_data_offset(rad_info, data, se);
 }
 
 void
-ac_thread_trace_init(struct ac_thread_trace_data *data)
+ac_sqtt_init(struct ac_sqtt *data)
 {
    list_inithead(&data->rgp_pso_correlation.record);
    simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
@@ -85,7 +83,7 @@ ac_thread_trace_init(struct ac_thread_trace_data *data)
 }
 
 void
-ac_thread_trace_finish(struct ac_thread_trace_data *data)
+ac_sqtt_finish(struct ac_sqtt *data)
 {
    assert(data->rgp_pso_correlation.record_count == 0);
    simple_mtx_destroy(&data->rgp_pso_correlation.lock);
@@ -107,9 +105,8 @@ ac_thread_trace_finish(struct ac_thread_trace_data *data)
 }
 
 bool
-ac_is_thread_trace_complete(const struct radeon_info *rad_info,
-                            const struct ac_thread_trace_data *data,
-                            const struct ac_thread_trace_info *info)
+ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
+                    const struct ac_sqtt_data_info *info)
 {
    if (rad_info->gfx_level >= GFX10) {
       /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
@@ -131,8 +128,7 @@ ac_is_thread_trace_complete(const struct radeon_info *rad_info,
 }
 
 uint32_t
-ac_get_expected_buffer_size(struct radeon_info *rad_info,
-                            const struct ac_thread_trace_info *info)
+ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info)
 {
    if (rad_info->gfx_level >= GFX10) {
       uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se;
@@ -143,10 +139,9 @@ ac_get_expected_buffer_size(struct radeon_info *rad_info,
 }
 
 bool
-ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data,
-                            uint64_t pipeline_hash)
+ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash)
 {
-   struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation;
+   struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
    struct rgp_pso_correlation_record *record;
 
    record = malloc(sizeof(struct rgp_pso_correlation_record));
@@ -167,11 +162,10 @@ ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data,
 }
 
 bool
-ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_data,
-                                     uint64_t pipeline_hash,
+ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
                                      uint64_t base_address)
 {
-   struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events;
+   struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
    struct rgp_loader_events_record *record;
 
    record = malloc(sizeof(struct rgp_loader_events_record));
@@ -194,10 +188,9 @@ ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_d
 }
 
 bool
-ac_sqtt_add_clock_calibration(struct ac_thread_trace_data *thread_trace_data,
-                              uint64_t cpu_timestamp, uint64_t gpu_timestamp)
+ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
 {
-   struct rgp_clock_calibration *clock_calibration = &thread_trace_data->rgp_clock_calibration;
+   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
    struct rgp_clock_calibration_record *record;
 
    record = malloc(sizeof(struct rgp_clock_calibration_record));
@@ -241,8 +234,7 @@ ac_check_profile_state(const struct radeon_info *info)
 }
 
 union rgp_sqtt_marker_cb_id
-ac_sqtt_get_next_cmdbuf_id(struct ac_thread_trace_data *data,
-                           enum amd_ip_type ip_type)
+ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
 {
    union rgp_sqtt_marker_cb_id cb_id = {0};
 
@@ -264,48 +256,46 @@ ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
 }
 
 bool
-ac_sqtt_get_trace(struct ac_thread_trace_data *data,
-                  const struct radeon_info *info,
-                  struct ac_thread_trace *thread_trace)
+ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
+                  struct ac_sqtt_trace *sqtt_trace)
 {
    unsigned max_se = info->max_se;
    void *ptr = data->ptr;
 
-   memset(thread_trace, 0, sizeof(*thread_trace));
+   memset(sqtt_trace, 0, sizeof(*sqtt_trace));
 
    for (unsigned se = 0; se < max_se; se++) {
-      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
-      uint64_t data_offset = ac_thread_trace_get_data_offset(info, data, se);
+      uint64_t info_offset = ac_sqtt_get_info_offset(se);
+      uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
       void *info_ptr = (uint8_t *)ptr + info_offset;
       void *data_ptr = (uint8_t *)ptr + data_offset;
-      struct ac_thread_trace_info *trace_info = (struct ac_thread_trace_info *)info_ptr;
-      struct ac_thread_trace_se thread_trace_se = {0};
+      struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
+      struct ac_sqtt_data_se data_se = {0};
       int first_active_cu = ffs(info->cu_mask[se][0]);
 
       if (ac_sqtt_se_is_disabled(info, se))
          continue;
 
-      if (!ac_is_thread_trace_complete(info, data, trace_info))
+      if (!ac_is_sqtt_complete(info, data, trace_info))
          return false;
 
-      thread_trace_se.data_ptr = data_ptr;
-      thread_trace_se.info = *trace_info;
-      thread_trace_se.shader_engine = se;
+      data_se.data_ptr = data_ptr;
+      data_se.info = *trace_info;
+      data_se.shader_engine = se;
 
       /* RGP seems to expect units of WGP on GFX10+. */
-      thread_trace_se.compute_unit =
-         info->gfx_level >= GFX10 ? (first_active_cu / 2) : first_active_cu;
+      data_se.compute_unit = info->gfx_level >= GFX10 ? (first_active_cu / 2) : first_active_cu;
 
-      thread_trace->traces[thread_trace->num_traces] = thread_trace_se;
-      thread_trace->num_traces++;
+      sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
+      sqtt_trace->num_traces++;
    }
 
-   thread_trace->rgp_code_object = &data->rgp_code_object;
-   thread_trace->rgp_loader_events = &data->rgp_loader_events;
-   thread_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
-   thread_trace->rgp_queue_info = &data->rgp_queue_info;
-   thread_trace->rgp_queue_event = &data->rgp_queue_event;
-   thread_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
+   sqtt_trace->rgp_code_object = &data->rgp_code_object;
+   sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
+   sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
+   sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
+   sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
+   sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
 
    return true;
 }
index 272fa5f..a9613d8 100644 (file)
 struct radeon_cmdbuf;
 struct radeon_info;
 
-struct ac_thread_trace_data {
+/**
+ * SQ Thread tracing is a tracing mechanism that allows taking a detailed look
+ * at what the shader cores are doing.
+ *
+ * Among the things recorded are:
+ *  - draws/dispatches + state
+ *  - when each wave starts and stops.
+ *  - for one SIMD per SE all instructions executed on that SIMD.
+ *
+ * The hardware stores all these as events in a buffer, no manual barrier
+ * around each command needed. The primary user of this is RGP.
+ */
+struct ac_sqtt {
    struct radeon_cmdbuf *start_cs[2];
    struct radeon_cmdbuf *stop_cs[2];
    /* struct radeon_winsys_bo or struct pb_buffer */
@@ -62,7 +74,7 @@ struct ac_thread_trace_data {
 
 #define SQTT_BUFFER_ALIGN_SHIFT 12
 
-struct ac_thread_trace_info {
+struct ac_sqtt_data_info {
    uint32_t cur_offset;
    uint32_t trace_status;
    union {
@@ -71,8 +83,8 @@ struct ac_thread_trace_info {
    };
 };
 
-struct ac_thread_trace_se {
-   struct ac_thread_trace_info info;
+struct ac_sqtt_data_se {
+   struct ac_sqtt_data_info info;
    void *data_ptr;
    uint32_t shader_engine;
    uint32_t compute_unit;
@@ -80,7 +92,7 @@ struct ac_thread_trace_se {
 
 #define SQTT_MAX_TRACES 6
 
-struct ac_thread_trace {
+struct ac_sqtt_trace {
    const struct rgp_code_object *rgp_code_object;
    const struct rgp_loader_events *rgp_loader_events;
    const struct rgp_pso_correlation *rgp_pso_correlation;
@@ -89,36 +101,27 @@ struct ac_thread_trace {
    const struct rgp_clock_calibration *rgp_clock_calibration;
 
    uint32_t num_traces;
-   struct ac_thread_trace_se traces[SQTT_MAX_TRACES];
+   struct ac_sqtt_data_se traces[SQTT_MAX_TRACES];
 };
 
-uint64_t
-ac_thread_trace_get_info_offset(unsigned se);
+uint64_t ac_sqtt_get_info_offset(unsigned se);
 
-uint64_t
-ac_thread_trace_get_data_offset(const struct radeon_info *rad_info,
-                                const struct ac_thread_trace_data *data, unsigned se);
-uint64_t
-ac_thread_trace_get_info_va(uint64_t va, unsigned se);
+uint64_t ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt,
+                                 unsigned se);
+uint64_t ac_sqtt_get_info_va(uint64_t va, unsigned se);
 
-uint64_t
-ac_thread_trace_get_data_va(const struct radeon_info *rad_info,
-                            const struct ac_thread_trace_data *data, uint64_t va, unsigned se);
+uint64_t ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt,
+                             uint64_t va, unsigned se);
 
-void
-ac_thread_trace_init(struct ac_thread_trace_data *data);
+void ac_sqtt_init(struct ac_sqtt *data);
 
-void
-ac_thread_trace_finish(struct ac_thread_trace_data *data);
+void ac_sqtt_finish(struct ac_sqtt *data);
 
-bool
-ac_is_thread_trace_complete(const struct radeon_info *rad_info,
-                            const struct ac_thread_trace_data *data,
-                            const struct ac_thread_trace_info *info);
+bool ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *sqtt,
+                         const struct ac_sqtt_data_info *info);
 
-uint32_t
-ac_get_expected_buffer_size(struct radeon_info *rad_info,
-                            const struct ac_thread_trace_info *info);
+uint32_t ac_get_expected_buffer_size(struct radeon_info *rad_info,
+                                     const struct ac_sqtt_data_info *info);
 
 /**
  * Identifiers for RGP SQ thread-tracing markers (Table 1)
@@ -549,27 +552,22 @@ struct rgp_sqtt_marker_pipeline_bind {
 static_assert(sizeof(struct rgp_sqtt_marker_pipeline_bind) == 12,
               "rgp_sqtt_marker_pipeline_bind doesn't match RGP spec");
 
+bool ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash);
 
-bool ac_sqtt_add_pso_correlation(struct ac_thread_trace_data *thread_trace_data,
-                                 uint64_t pipeline_hash);
-
-bool ac_sqtt_add_code_object_loader_event(struct ac_thread_trace_data *thread_trace_data,
-                                          uint64_t pipeline_hash,
+bool ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
                                           uint64_t base_address);
 
-bool ac_sqtt_add_clock_calibration(struct ac_thread_trace_data *thread_trace_data,
-                                   uint64_t cpu_timestamp,
+bool ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp,
                                    uint64_t gpu_timestamp);
 
 bool ac_check_profile_state(const struct radeon_info *info);
 
-union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_thread_trace_data *data,
+union rgp_sqtt_marker_cb_id ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *sqtt,
                                                        enum amd_ip_type ip_type);
 
 bool ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se);
 
-bool ac_sqtt_get_trace(struct ac_thread_trace_data *data,
-                       const struct radeon_info *info,
-                       struct ac_thread_trace *thread_trace);
+bool ac_sqtt_get_trace(struct ac_sqtt *sqtt, const struct radeon_info *info,
+                       struct ac_sqtt_trace *sqtt_trace);
 
 #endif
index 01eae49..2895c8b 100644 (file)
@@ -219,7 +219,7 @@ radv_write_begin_general_api_marker(struct radv_cmd_buffer *cmd_buffer,
    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
    marker.api_type = api_type;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 static void
@@ -232,7 +232,7 @@ radv_write_end_general_api_marker(struct radv_cmd_buffer *cmd_buffer,
    marker.api_type = api_type;
    marker.is_end = 1;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 static void
@@ -259,7 +259,7 @@ radv_write_event_marker(struct radv_cmd_buffer *cmd_buffer,
    marker.instance_offset_reg_idx = instance_offset_user_data;
    marker.draw_index_reg_idx = draw_index_user_data;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 static void
@@ -279,7 +279,7 @@ radv_write_event_with_dims_marker(struct radv_cmd_buffer *cmd_buffer,
    marker.thread_y = y;
    marker.thread_z = z;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 static void
@@ -292,7 +292,7 @@ radv_write_user_event_marker(struct radv_cmd_buffer *cmd_buffer,
       marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
       marker.data_type = type;
 
-      radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+      radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
    } else {
       assert(str != NULL);
       unsigned len = strlen(str);
@@ -306,8 +306,7 @@ radv_write_user_event_marker(struct radv_cmd_buffer *cmd_buffer,
       memcpy(buffer, &marker, sizeof(marker));
       memcpy(buffer + sizeof(marker), str, len);
 
-      radv_emit_thread_trace_userdata(cmd_buffer, buffer,
-                                      sizeof(marker) / 4 + marker.length / 4);
+      radv_emit_sqtt_userdata(cmd_buffer, buffer, sizeof(marker) / 4 + marker.length / 4);
    }
 }
 
@@ -317,14 +316,14 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    uint64_t device_id = (uintptr_t)cmd_buffer->device;
    struct rgp_sqtt_marker_cb_start marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    /* Reserve a command buffer ID for SQTT. */
    enum amd_ip_type ip_type =
       radv_queue_family_to_ring(cmd_buffer->device->physical_device, cmd_buffer->qf);
    union rgp_sqtt_marker_cb_id cb_id =
-      ac_sqtt_get_next_cmdbuf_id(&cmd_buffer->device->thread_trace, ip_type);
+      ac_sqtt_get_next_cmdbuf_id(&cmd_buffer->device->sqtt, ip_type);
    cmd_buffer->sqtt_cb_id = cb_id.all;
 
    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_START;
@@ -337,7 +336,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
       marker.queue_flags |= VK_QUEUE_GRAPHICS_BIT;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 void
@@ -346,7 +345,7 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    uint64_t device_id = (uintptr_t)cmd_buffer->device;
    struct rgp_sqtt_marker_cb_end marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_CB_END;
@@ -354,13 +353,13 @@ radv_describe_end_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    marker.device_id_low = device_id;
    marker.device_id_high = device_id >> 32;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 void
 radv_describe_draw(struct radv_cmd_buffer *cmd_buffer)
 {
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    radv_write_event_marker(cmd_buffer, cmd_buffer->state.current_event_type, UINT_MAX, UINT_MAX,
@@ -370,7 +369,7 @@ radv_describe_draw(struct radv_cmd_buffer *cmd_buffer)
 void
 radv_describe_dispatch(struct radv_cmd_buffer *cmd_buffer, int x, int y, int z)
 {
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    radv_write_event_with_dims_marker(cmd_buffer, cmd_buffer->state.current_event_type, x, y, z);
@@ -408,7 +407,7 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
 {
    struct rgp_sqtt_marker_barrier_end marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo) || !cmd_buffer->state.pending_sqtt_barrier_end)
+   if (likely(!cmd_buffer->device->sqtt.bo) || !cmd_buffer->state.pending_sqtt_barrier_end)
       return;
 
    cmd_buffer->state.pending_sqtt_barrier_end = false;
@@ -451,7 +450,7 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
       marker.inval_gl1 = true;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 
    cmd_buffer->state.num_layout_transitions = 0;
 }
@@ -461,7 +460,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer, enum rgp_barrier
 {
    struct rgp_sqtt_marker_barrier_start marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    radv_describe_barrier_end_delayed(cmd_buffer);
@@ -471,7 +470,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer, enum rgp_barrier
    marker.cb_id = cmd_buffer->sqtt_cb_id;
    marker.dword02 = reason;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 void
@@ -486,7 +485,7 @@ radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
 {
    struct rgp_sqtt_marker_layout_transition marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_LAYOUT_TRANSITION;
@@ -499,7 +498,7 @@ radv_describe_layout_transition(struct radv_cmd_buffer *cmd_buffer,
    marker.fmask_color_expand = barrier->layout_transitions.fmask_color_expand;
    marker.init_mask_ram = barrier->layout_transitions.init_mask_ram;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 
    cmd_buffer->state.num_layout_transitions++;
 }
@@ -510,7 +509,7 @@ radv_describe_pipeline_bind(struct radv_cmd_buffer *cmd_buffer,
 {
    struct rgp_sqtt_marker_pipeline_bind marker = {0};
 
-   if (likely(!cmd_buffer->device->thread_trace.bo))
+   if (likely(!cmd_buffer->device->sqtt.bo))
       return;
 
    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
@@ -519,34 +518,34 @@ radv_describe_pipeline_bind(struct radv_cmd_buffer *cmd_buffer,
    marker.api_pso_hash[0] = pipeline->pipeline_hash;
    marker.api_pso_hash[1] = pipeline->pipeline_hash >> 32;
 
-   radv_emit_thread_trace_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
+   radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
 }
 
 /* TODO: Improve the way to trigger capture (overlay, etc). */
 static void
-radv_handle_thread_trace(VkQueue _queue)
+radv_handle_sqtt(VkQueue _queue)
 {
    RADV_FROM_HANDLE(radv_queue, queue, _queue);
-   static bool thread_trace_enabled = false;
+   static bool sqtt_enabled = false;
    static uint64_t num_frames = 0;
    bool resize_trigger = false;
 
-   if (thread_trace_enabled) {
-      struct ac_thread_trace thread_trace = {0};
+   if (sqtt_enabled) {
+      struct ac_sqtt_trace sqtt_trace = {0};
 
-      radv_end_thread_trace(queue);
-      thread_trace_enabled = false;
+      radv_end_sqtt(queue);
+      sqtt_enabled = false;
 
       /* TODO: Do something better than this whole sync. */
       queue->device->vk.dispatch_table.QueueWaitIdle(_queue);
 
-      if (radv_get_thread_trace(queue, &thread_trace)) {
+      if (radv_get_sqtt_trace(queue, &sqtt_trace)) {
          struct ac_spm_trace spm_trace;
 
          if (queue->device->spm.bo)
             ac_spm_get_trace(&queue->device->spm, &spm_trace);
 
-         ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &thread_trace,
+         ac_dump_rgp_capture(&queue->device->physical_device->rad_info, &sqtt_trace,
                              queue->device->spm.bo ? &spm_trace : NULL);
       } else {
          /* Trigger a new capture if the driver failed to get
@@ -556,16 +555,15 @@ radv_handle_thread_trace(VkQueue _queue)
       }
 
       /* Clear resources used for this capture. */
-      radv_reset_thread_trace(queue->device);
+      radv_reset_sqtt_trace(queue->device);
    }
 
-   if (!thread_trace_enabled) {
-      bool frame_trigger = num_frames == queue->device->thread_trace.start_frame;
+   if (!sqtt_enabled) {
+      bool frame_trigger = num_frames == queue->device->sqtt.start_frame;
       bool file_trigger = false;
 #ifndef _WIN32
-      if (queue->device->thread_trace.trigger_file &&
-          access(queue->device->thread_trace.trigger_file, W_OK) == 0) {
-         if (unlink(queue->device->thread_trace.trigger_file) == 0) {
+      if (queue->device->sqtt.trigger_file && access(queue->device->sqtt.trigger_file, W_OK) == 0) {
+         if (unlink(queue->device->sqtt.trigger_file) == 0) {
             file_trigger = true;
          } else {
             /* Do not enable tracing if we cannot remove the file,
@@ -585,13 +583,13 @@ radv_handle_thread_trace(VkQueue _queue)
          }
 
          /* Sample CPU/GPU clocks before starting the trace. */
-         if (!radv_thread_trace_sample_clocks(queue->device)) {
+         if (!radv_sqtt_sample_clocks(queue->device)) {
             fprintf(stderr, "radv: Failed to sample clocks\n");
          }
 
-         radv_begin_thread_trace(queue);
-         assert(!thread_trace_enabled);
-         thread_trace_enabled = true;
+         radv_begin_sqtt(queue);
+         assert(!sqtt_enabled);
+         sqtt_enabled = true;
       }
    }
    num_frames++;
@@ -607,7 +605,7 @@ sqtt_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
    if (result != VK_SUCCESS)
       return result;
 
-   radv_handle_thread_trace(_queue);
+   radv_handle_sqtt(_queue);
 
    return VK_SUCCESS;
 }
@@ -1172,8 +1170,8 @@ radv_mesa_to_rgp_shader_stage(struct radv_pipeline *pipeline, gl_shader_stage st
 static VkResult
 radv_add_code_object(struct radv_device *device, struct radv_pipeline *pipeline)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
-   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
+   struct ac_sqtt *sqtt = &device->sqtt;
+   struct rgp_code_object *code_object = &sqtt->rgp_code_object;
    struct rgp_code_object_record *record;
 
    record = malloc(sizeof(struct rgp_code_object_record));
@@ -1225,7 +1223,7 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin
    bool result;
    uint64_t base_va = ~0;
 
-   result = ac_sqtt_add_pso_correlation(&device->thread_trace, pipeline->pipeline_hash);
+   result = ac_sqtt_add_pso_correlation(&device->sqtt, pipeline->pipeline_hash);
    if (!result)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
@@ -1241,8 +1239,7 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin
       base_va = MIN2(base_va, va);
    }
 
-   result =
-      ac_sqtt_add_code_object_loader_event(&device->thread_trace, pipeline->pipeline_hash, base_va);
+   result = ac_sqtt_add_code_object_loader_event(&device->sqtt, pipeline->pipeline_hash, base_va);
    if (!result)
       return VK_ERROR_OUT_OF_HOST_MEMORY;
 
@@ -1256,10 +1253,10 @@ radv_register_pipeline(struct radv_device *device, struct radv_pipeline *pipelin
 static void
 radv_unregister_pipeline(struct radv_device *device, struct radv_pipeline *pipeline)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
-   struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation;
-   struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events;
-   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
+   struct ac_sqtt *sqtt = &device->sqtt;
+   struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
+   struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
+   struct rgp_code_object *code_object = &sqtt->rgp_code_object;
 
    /* Destroy the PSO correlation record. */
    simple_mtx_lock(&pso_correlation->lock);
index faad8c5..0b6e98c 100644 (file)
@@ -737,7 +737,7 @@ static void
 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
 {
    const struct radv_device *device = cmd_buffer->device;
-   if (unlikely(device->thread_trace.bo)) {
+   if (unlikely(device->sqtt.bo)) {
       radeon_check_space(device->ws, cmd_buffer->cs, 2);
 
       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
index 3488155..5f3618e 100644 (file)
@@ -106,8 +106,7 @@ radv_get_int_debug_option(const char *name, int default_value)
 static bool
 radv_spm_trace_enabled()
 {
-   return radv_thread_trace_enabled() &&
-          debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false);
+   return radv_sqtt_enabled() && debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", false);
 }
 
 VKAPI_ATTR VkResult VKAPI_CALL
@@ -587,7 +586,7 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *ph
       add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
    }
 
-   if (radv_thread_trace_enabled())
+   if (radv_sqtt_enabled())
       add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);
 
    if (radv_rra_trace_enabled() && radv_enable_rt(physical_device, false))
@@ -933,7 +932,7 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
       radv_dump_enabled_options(device, stderr);
    }
 
-   if (radv_thread_trace_enabled()) {
+   if (radv_sqtt_enabled()) {
       if (device->physical_device->rad_info.gfx_level < GFX8 ||
           device->physical_device->rad_info.gfx_level > GFX11) {
          fprintf(stderr, "GPU hardware not supported: refer to "
@@ -942,14 +941,15 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
          abort();
       }
 
-      if (!radv_thread_trace_init(device)) {
+      if (!radv_sqtt_init(device)) {
          result = VK_ERROR_INITIALIZATION_FAILED;
          goto fail;
       }
 
-      fprintf(stderr, "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
-                      "instruction timing: %s, cache counters: %s).\n",
-              device->thread_trace.buffer_size / (1024 * 1024),
+      fprintf(stderr,
+              "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
+              "instruction timing: %s, cache counters: %s).\n",
+              device->sqtt.buffer_size / (1024 * 1024),
               radv_is_instruction_timing_enabled() ? "enabled" : "disabled",
               radv_spm_trace_enabled() ? "enabled" : "disabled");
 
@@ -1093,7 +1093,7 @@ fail_cache:
 fail_meta:
    radv_device_finish_meta(device);
 fail:
-   radv_thread_trace_finish(device);
+   radv_sqtt_finish(device);
 
    radv_spm_finish(device);
 
@@ -1195,7 +1195,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
 
    radv_destroy_shader_arenas(device);
 
-   radv_thread_trace_finish(device);
+   radv_sqtt_finish(device);
 
    radv_rra_trace_finish(_device, &device->rra_trace);
 
index 6e30558..4e14109 100644 (file)
@@ -54,7 +54,7 @@ typedef void *drmDevicePtr;
 #endif
 
 bool
-radv_thread_trace_enabled(void)
+radv_sqtt_enabled(void)
 {
    return radv_get_int_debug_option("RADV_THREAD_TRACE", -1) >= 0 ||
           getenv("RADV_THREAD_TRACE_TRIGGER");
@@ -65,7 +65,7 @@ radv_perf_query_supported(const struct radv_physical_device *pdev)
 {
    /* SQTT / SPM interfere with the register states for perf counters, and
     * the code has only been tested on GFX10.3 */
-   return pdev->rad_info.gfx_level == GFX10_3 && !radv_thread_trace_enabled();
+   return pdev->rad_info.gfx_level == GFX10_3 && !radv_sqtt_enabled();
 }
 
 static bool
@@ -489,7 +489,7 @@ radv_physical_device_get_supported_extensions(const struct radv_physical_device
       .EXT_conditional_rendering = true,
       .EXT_conservative_rasterization = device->rad_info.gfx_level >= GFX9,
       .EXT_custom_border_color = true,
-      .EXT_debug_marker = radv_thread_trace_enabled(),
+      .EXT_debug_marker = radv_sqtt_enabled(),
       .EXT_depth_clip_control = true,
       .EXT_depth_clip_enable = true,
       .EXT_depth_range_unrestricted = true,
@@ -2184,7 +2184,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
    device->ws = radv_null_winsys_create();
 #else
    if (drm_device) {
-      bool reserve_vmid = radv_thread_trace_enabled();
+      bool reserve_vmid = radv_sqtt_enabled();
 
       device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags, instance->perftest_flags,
                                              reserve_vmid);
index df0f21e..69f022d 100644 (file)
@@ -387,7 +387,7 @@ VkResult create_drm_physical_device(struct vk_instance *vk_instance, struct _drm
 
 void radv_physical_device_destroy(struct vk_physical_device *vk_device);
 
-bool radv_thread_trace_enabled(void);
+bool radv_sqtt_enabled(void);
 
 struct radv_instance {
    struct vk_instance vk;
@@ -1021,7 +1021,7 @@ struct radv_device {
    struct radv_device_border_color_data border_color_data;
 
    /* Thread trace. */
-   struct ac_thread_trace_data thread_trace;
+   struct ac_sqtt sqtt;
 
    /* Memory trace. */
    struct radv_memory_trace_data memory_trace;
@@ -3071,16 +3071,16 @@ void radv_nir_shader_info_link(struct radv_device *device,
                                const struct radv_pipeline_key *pipeline_key,
                                struct radv_pipeline_stage *stages);
 
-bool radv_thread_trace_init(struct radv_device *device);
-void radv_thread_trace_finish(struct radv_device *device);
-bool radv_begin_thread_trace(struct radv_queue *queue);
-bool radv_end_thread_trace(struct radv_queue *queue);
-bool radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace);
-void radv_reset_thread_trace(struct radv_device *device);
-void radv_emit_thread_trace_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data,
-                                     uint32_t num_dwords);
+bool radv_sqtt_init(struct radv_device *device);
+void radv_sqtt_finish(struct radv_device *device);
+bool radv_begin_sqtt(struct radv_queue *queue);
+bool radv_end_sqtt(struct radv_queue *queue);
+bool radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace);
+void radv_reset_sqtt_trace(struct radv_device *device);
+void radv_emit_sqtt_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data,
+                             uint32_t num_dwords);
 bool radv_is_instruction_timing_enabled(void);
-bool radv_thread_trace_sample_clocks(struct radv_device *device);
+bool radv_sqtt_sample_clocks(struct radv_device *device);
 
 void radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs,
                                    bool inhibit);
index 208e8a2..6fb0818 100644 (file)
@@ -36,7 +36,7 @@ radv_is_instruction_timing_enabled(void)
 }
 
 static uint32_t
-gfx11_get_thread_trace_ctrl(struct radv_device *device, bool enable)
+gfx11_get_sqtt_ctrl(struct radv_device *device, bool enable)
 {
    return S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) | S_0367B0_UTIL_TIMER(1) |
           S_0367B0_RT_FREQ(2) | /* 4096 clk */
@@ -45,21 +45,21 @@ gfx11_get_thread_trace_ctrl(struct radv_device *device, bool enable)
 }
 
 static uint32_t
-gfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable)
+gfx10_get_sqtt_ctrl(struct radv_device *device, bool enable)
 {
-   uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) |
-                                S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */
-                                S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) |
-                                S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) |
-                                S_008D1C_REG_DROP_ON_STALL(0);
+   uint32_t sqtt_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
+                        S_008D1C_RT_FREQ(2) | /* 4096 clk */
+                        S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) |
+                        S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) |
+                        S_008D1C_REG_DROP_ON_STALL(0);
 
    if (device->physical_device->rad_info.gfx_level == GFX10_3)
-      thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4);
+      sqtt_ctrl |= S_008D1C_LOWATER_OFFSET(4);
 
    if (device->physical_device->rad_info.has_sqtt_auto_flush_mode_bug)
-      thread_trace_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
+      sqtt_ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
 
-   return thread_trace_ctrl;
+   return sqtt_ctrl;
 }
 
 static void
@@ -78,16 +78,16 @@ radv_emit_wait_for_idle(struct radv_device *device, struct radeon_cmdbuf *cs, in
 }
 
 static void
-radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *cs,
-                             enum radv_queue_family qf)
+radv_emit_sqtt_start(struct radv_device *device, struct radeon_cmdbuf *cs,
+                     enum radv_queue_family qf)
 {
-   uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
+   uint32_t shifted_size = device->sqtt.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
    struct radeon_info *rad_info = &device->physical_device->rad_info;
    unsigned max_se = rad_info->max_se;
 
    for (unsigned se = 0; se < max_se; se++) {
-      uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
-      uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se);
+      uint64_t va = radv_buffer_get_va(device->sqtt.bo);
+      uint64_t data_va = ac_sqtt_get_data_va(rad_info, &device->sqtt, va, se);
       uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
       int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);
 
@@ -111,7 +111,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
                                    S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(first_active_cu / 2) |
                                    S_0367B4_SIMD_SEL(0));
 
-         uint32_t thread_trace_token_mask = S_0367B8_REG_INCLUDE(
+         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(
             V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC | V_0367B8_REG_INCLUDE_GFXUDEC |
             V_0367B8_REG_INCLUDE_COMP | V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);
 
@@ -124,13 +124,13 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
                              V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                              V_0367B8_TOKEN_EXCLUDE_INST;
          }
-         thread_trace_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude);
+         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE(token_exclude);
 
-         radeon_set_uconfig_reg(cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, thread_trace_token_mask);
+         radeon_set_uconfig_reg(cs, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);
 
          /* Should be emitted last (it enables thread traces). */
          radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL,
-                                gfx11_get_thread_trace_ctrl(device, true));
+                                gfx11_get_sqtt_ctrl(device, true));
       } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
          /* Order seems important for the following 2 registers. */
          radeon_set_privileged_config_reg(
@@ -144,7 +144,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
             S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(first_active_cu / 2) | S_008D14_SIMD_SEL(0));
 
-         uint32_t thread_trace_token_mask = S_008D18_REG_INCLUDE(
+         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(
             V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC |
             V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);
 
@@ -159,14 +159,13 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
                              V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                              V_008D18_TOKEN_EXCLUDE_INST;
          }
-         thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude);
+         sqtt_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude);
 
-         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
-                                          thread_trace_token_mask);
+         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);
 
          /* Should be emitted last (it enables thread traces). */
          radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
-                                          gfx10_get_thread_trace_ctrl(device, true));
+                                          gfx10_get_sqtt_ctrl(device, true));
       } else {
          /* Order seems important for the following 4 registers. */
          radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
@@ -178,16 +177,16 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
 
          radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));
 
-         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) |
-                                      S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) |
-                                      S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
-                                      S_030CC8_SQ_STALL_EN(1);
+         uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) |
+                              S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) |
+                              S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
+                              S_030CC8_SQ_STALL_EN(1);
 
          if (device->physical_device->rad_info.gfx_level < GFX9) {
-            thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff);
+            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
          }
 
-         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask);
+         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);
 
          /* Trace all tokens and registers. */
          radeon_set_uconfig_reg(
@@ -208,7 +207,7 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
          }
 
          /* Enable the thread trace mode. */
-         uint32_t thread_trace_mode =
+         uint32_t sqtt_mode =
             S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
             S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
             S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
@@ -216,10 +215,10 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
 
          if (device->physical_device->rad_info.gfx_level == GFX9) {
             /* Count SQTT traffic in TCC perf counters. */
-            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
+            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
          }
 
-         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode);
+         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
       }
    }
 
@@ -237,57 +236,56 @@ radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *c
    }
 }
 
-static const uint32_t gfx8_thread_trace_info_regs[] = {
+static const uint32_t gfx8_sqtt_info_regs[] = {
    R_030CE4_SQ_THREAD_TRACE_WPTR,
    R_030CE8_SQ_THREAD_TRACE_STATUS,
    R_008E40_SQ_THREAD_TRACE_CNTR,
 };
 
-static const uint32_t gfx9_thread_trace_info_regs[] = {
+static const uint32_t gfx9_sqtt_info_regs[] = {
    R_030CE4_SQ_THREAD_TRACE_WPTR,
    R_030CE8_SQ_THREAD_TRACE_STATUS,
    R_030CF0_SQ_THREAD_TRACE_CNTR,
 };
 
-static const uint32_t gfx10_thread_trace_info_regs[] = {
+static const uint32_t gfx10_sqtt_info_regs[] = {
    R_008D10_SQ_THREAD_TRACE_WPTR,
    R_008D20_SQ_THREAD_TRACE_STATUS,
    R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
 };
 
-static const uint32_t gfx11_thread_trace_info_regs[] = {
+static const uint32_t gfx11_sqtt_info_regs[] = {
    R_0367BC_SQ_THREAD_TRACE_WPTR,
    R_0367D0_SQ_THREAD_TRACE_STATUS,
    R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
 };
 static void
-radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs,
-                                 unsigned se_index)
+radv_copy_sqtt_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs, unsigned se_index)
 {
    const struct radv_physical_device *pdevice = device->physical_device;
-   const uint32_t *thread_trace_info_regs = NULL;
+   const uint32_t *sqtt_info_regs = NULL;
 
    if (device->physical_device->rad_info.gfx_level >= GFX11) {
-      thread_trace_info_regs = gfx11_thread_trace_info_regs;
+      sqtt_info_regs = gfx11_sqtt_info_regs;
    } else if (device->physical_device->rad_info.gfx_level >= GFX10) {
-      thread_trace_info_regs = gfx10_thread_trace_info_regs;
+      sqtt_info_regs = gfx10_sqtt_info_regs;
    } else if (device->physical_device->rad_info.gfx_level == GFX9) {
-      thread_trace_info_regs = gfx9_thread_trace_info_regs;
+      sqtt_info_regs = gfx9_sqtt_info_regs;
    } else {
       assert(device->physical_device->rad_info.gfx_level == GFX8);
-      thread_trace_info_regs = gfx8_thread_trace_info_regs;
+      sqtt_info_regs = gfx8_sqtt_info_regs;
    }
 
    /* Get the VA where the info struct is stored for this SE. */
-   uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
-   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);
+   uint64_t va = radv_buffer_get_va(device->sqtt.bo);
+   uint64_t info_va = ac_sqtt_get_info_va(va, se_index);
 
    /* Copy back the info struct one DWORD at a time. */
    for (unsigned i = 0; i < 3; i++) {
       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                          COPY_DATA_WR_CONFIRM);
-      radeon_emit(cs, thread_trace_info_regs[i] >> 2);
+      radeon_emit(cs, sqtt_info_regs[i] >> 2);
       radeon_emit(cs, 0); /* unused */
       radeon_emit(cs, (info_va + i * 4));
       radeon_emit(cs, (info_va + i * 4) >> 32);
@@ -302,8 +300,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu
        * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
        * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
        */
-      uint64_t data_va =
-         ac_thread_trace_get_data_va(&pdevice->rad_info, &device->thread_trace, va, se_index);
+      uint64_t data_va = ac_sqtt_get_data_va(&pdevice->rad_info, &device->sqtt, va, se_index);
       uint64_t shifted_data_va = (data_va >> 5);
       uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;
 
@@ -320,8 +317,7 @@ radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbu
 }
 
 static void
-radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs,
-                            enum radv_queue_family qf)
+radv_emit_sqtt_stop(struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
 {
    unsigned max_se = device->physical_device->rad_info.max_se;
 
@@ -364,7 +360,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs
 
          /* Disable the thread trace mode. */
          radeon_set_uconfig_reg(cs, R_0367B0_SQ_THREAD_TRACE_CTRL,
-                                gfx11_get_thread_trace_ctrl(device, false));
+                                gfx11_get_sqtt_ctrl(device, false));
 
          /* Wait for thread trace completion. */
          radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
@@ -391,7 +387,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs
 
          /* Disable the thread trace mode. */
          radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
-                                          gfx10_get_thread_trace_ctrl(device, false));
+                                          gfx10_get_sqtt_ctrl(device, false));
 
          /* Wait for thread trace completion. */
          radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
@@ -417,7 +413,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs
          radeon_emit(cs, 4);                /* poll interval */
       }
 
-      radv_copy_thread_trace_info_regs(device, cs, se);
+      radv_copy_sqtt_info_regs(device, cs, se);
    }
 
    /* Restore global broadcasting. */
@@ -427,8 +423,7 @@ radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs
 }
 
 void
-radv_emit_thread_trace_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data,
-                                uint32_t num_dwords)
+radv_emit_sqtt_userdata(struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords)
 {
    struct radv_device *device = cmd_buffer->device;
    struct radeon_cmdbuf *cs = cmd_buffer->cs;
@@ -492,7 +487,7 @@ radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *
 }
 
 static bool
-radv_thread_trace_init_bo(struct radv_device *device)
+radv_sqtt_init_bo(struct radv_device *device)
 {
    unsigned max_se = device->physical_device->rad_info.max_se;
    struct radeon_winsys *ws = device->ws;
@@ -502,49 +497,48 @@ radv_thread_trace_init_bo(struct radv_device *device)
    /* The buffer size and address need to be aligned in HW regs. Align the
     * size as early as possible so that we do all the allocation & addressing
     * correctly. */
-   device->thread_trace.buffer_size =
-      align64(device->thread_trace.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);
+   device->sqtt.buffer_size = align64(device->sqtt.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);
 
    /* Compute total size of the thread trace BO for all SEs. */
-   size = align64(sizeof(struct ac_thread_trace_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
-   size += device->thread_trace.buffer_size * (uint64_t)max_se;
+   size = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
+   size += device->sqtt.buffer_size * (uint64_t)max_se;
 
    struct radeon_winsys_bo *bo = NULL;
    result = ws->buffer_create(
       ws, size, 4096, RADEON_DOMAIN_VRAM,
       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
       RADV_BO_PRIORITY_SCRATCH, 0, &bo);
-   device->thread_trace.bo = bo;
+   device->sqtt.bo = bo;
    if (result != VK_SUCCESS)
       return false;
 
-   result = ws->buffer_make_resident(ws, device->thread_trace.bo, true);
+   result = ws->buffer_make_resident(ws, device->sqtt.bo, true);
    if (result != VK_SUCCESS)
       return false;
 
-   device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo);
-   if (!device->thread_trace.ptr)
+   device->sqtt.ptr = ws->buffer_map(device->sqtt.bo);
+   if (!device->sqtt.ptr)
       return false;
 
    return true;
 }
 
 static void
-radv_thread_trace_finish_bo(struct radv_device *device)
+radv_sqtt_finish_bo(struct radv_device *device)
 {
    struct radeon_winsys *ws = device->ws;
 
-   if (unlikely(device->thread_trace.bo)) {
-      ws->buffer_make_resident(ws, device->thread_trace.bo, false);
-      ws->buffer_destroy(ws, device->thread_trace.bo);
+   if (unlikely(device->sqtt.bo)) {
+      ws->buffer_make_resident(ws, device->sqtt.bo, false);
+      ws->buffer_destroy(ws, device->sqtt.bo);
    }
 }
 
 static VkResult
 radv_register_queue(struct radv_device *device, struct radv_queue *queue)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
-   struct rgp_queue_info *queue_info = &thread_trace_data->rgp_queue_info;
+   struct ac_sqtt *sqtt = &device->sqtt;
+   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
    struct rgp_queue_info_record *record;
 
    record = malloc(sizeof(struct rgp_queue_info_record));
@@ -572,8 +566,8 @@ radv_register_queue(struct radv_device *device, struct radv_queue *queue)
 static void
 radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
-   struct rgp_queue_info *queue_info = &thread_trace_data->rgp_queue_info;
+   struct ac_sqtt *sqtt = &device->sqtt;
+   struct rgp_queue_info *queue_info = &sqtt->rgp_queue_info;
 
    /* Destroy queue info record. */
    simple_mtx_lock(&queue_info->lock);
@@ -592,7 +586,7 @@ radv_unregister_queue(struct radv_device *device, struct radv_queue *queue)
 }
 
 static void
-radv_register_queues(struct radv_device *device, struct ac_thread_trace_data *thread_trace_data)
+radv_register_queues(struct radv_device *device, struct ac_sqtt *sqtt)
 {
    radv_register_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);
    for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
@@ -600,7 +594,7 @@ radv_register_queues(struct radv_device *device, struct ac_thread_trace_data *th
 }
 
 static void
-radv_unregister_queues(struct radv_device *device, struct ac_thread_trace_data *thread_trace_data)
+radv_unregister_queues(struct radv_device *device, struct ac_sqtt *sqtt)
 {
    radv_unregister_queue(device, &device->queues[RADV_QUEUE_GENERAL][0]);
    for (uint32_t i = 0; i < device->queue_count[RADV_QUEUE_COMPUTE]; i++)
@@ -608,74 +602,74 @@ radv_unregister_queues(struct radv_device *device, struct ac_thread_trace_data *
 }
 
 bool
-radv_thread_trace_init(struct radv_device *device)
+radv_sqtt_init(struct radv_device *device)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
+   struct ac_sqtt *sqtt = &device->sqtt;
 
    /* Default buffer size set to 32MB per SE. */
-   device->thread_trace.buffer_size =
+   device->sqtt.buffer_size =
       radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
-   device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);
+   device->sqtt.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);
 
    const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER");
    if (trigger_file)
-      device->thread_trace.trigger_file = strdup(trigger_file);
+      device->sqtt.trigger_file = strdup(trigger_file);
 
-   if (!radv_thread_trace_init_bo(device))
+   if (!radv_sqtt_init_bo(device))
       return false;
 
    if (!radv_device_acquire_performance_counters(device))
       return false;
 
-   ac_thread_trace_init(thread_trace_data);
+   ac_sqtt_init(sqtt);
 
-   radv_register_queues(device, thread_trace_data);
+   radv_register_queues(device, sqtt);
 
    return true;
 }
 
 void
-radv_thread_trace_finish(struct radv_device *device)
+radv_sqtt_finish(struct radv_device *device)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
+   struct ac_sqtt *sqtt = &device->sqtt;
    struct radeon_winsys *ws = device->ws;
 
-   free(device->thread_trace.trigger_file);
+   free(device->sqtt.trigger_file);
 
-   radv_thread_trace_finish_bo(device);
+   radv_sqtt_finish_bo(device);
 
    for (unsigned i = 0; i < 2; i++) {
-      if (device->thread_trace.start_cs[i])
-         ws->cs_destroy(device->thread_trace.start_cs[i]);
-      if (device->thread_trace.stop_cs[i])
-         ws->cs_destroy(device->thread_trace.stop_cs[i]);
+      if (device->sqtt.start_cs[i])
+         ws->cs_destroy(device->sqtt.start_cs[i]);
+      if (device->sqtt.stop_cs[i])
+         ws->cs_destroy(device->sqtt.stop_cs[i]);
    }
 
-   radv_unregister_queues(device, thread_trace_data);
+   radv_unregister_queues(device, sqtt);
 
-   ac_thread_trace_finish(thread_trace_data);
+   ac_sqtt_finish(sqtt);
 }
 
 static bool
-radv_thread_trace_resize_bo(struct radv_device *device)
+radv_sqtt_resize_bo(struct radv_device *device)
 {
    /* Destroy the previous thread trace BO. */
-   radv_thread_trace_finish_bo(device);
+   radv_sqtt_finish_bo(device);
 
    /* Double the size of the thread trace buffer per SE. */
-   device->thread_trace.buffer_size *= 2;
+   device->sqtt.buffer_size *= 2;
 
    fprintf(stderr,
            "Failed to get the thread trace because the buffer "
            "was too small, resizing to %d KB\n",
-           device->thread_trace.buffer_size / 1024);
+           device->sqtt.buffer_size / 1024);
 
    /* Re-create the thread trace BO. */
-   return radv_thread_trace_init_bo(device);
+   return radv_sqtt_init_bo(device);
 }
 
 bool
-radv_begin_thread_trace(struct radv_queue *queue)
+radv_begin_sqtt(struct radv_queue *queue)
 {
    struct radv_device *device = queue->device;
    enum radv_queue_family family = queue->state.qf;
@@ -684,9 +678,9 @@ radv_begin_thread_trace(struct radv_queue *queue)
    VkResult result;
 
    /* Destroy the previous start CS and create a new one. */
-   if (device->thread_trace.start_cs[family]) {
-      ws->cs_destroy(device->thread_trace.start_cs[family]);
-      device->thread_trace.start_cs[family] = NULL;
+   if (device->sqtt.start_cs[family]) {
+      ws->cs_destroy(device->sqtt.start_cs[family]);
+      device->sqtt.start_cs[family] = NULL;
    }
 
    cs = ws->cs_create(ws, radv_queue_ring(queue), false);
@@ -727,7 +721,7 @@ radv_begin_thread_trace(struct radv_queue *queue)
    }
 
    /* Start SQTT. */
-   radv_emit_thread_trace_start(device, cs, family);
+   radv_emit_sqtt_start(device, cs, family);
 
    if (device->spm.bo)
       radv_perfcounter_emit_spm_start(device, cs, family);
@@ -738,13 +732,13 @@ radv_begin_thread_trace(struct radv_queue *queue)
       return false;
    }
 
-   device->thread_trace.start_cs[family] = cs;
+   device->sqtt.start_cs[family] = cs;
 
    return radv_queue_internal_submit(queue, cs);
 }
 
 bool
-radv_end_thread_trace(struct radv_queue *queue)
+radv_end_sqtt(struct radv_queue *queue)
 {
    struct radv_device *device = queue->device;
    enum radv_queue_family family = queue->state.qf;
@@ -753,9 +747,9 @@ radv_end_thread_trace(struct radv_queue *queue)
    VkResult result;
 
    /* Destroy the previous stop CS and create a new one. */
-   if (queue->device->thread_trace.stop_cs[family]) {
-      ws->cs_destroy(device->thread_trace.stop_cs[family]);
-      device->thread_trace.stop_cs[family] = NULL;
+   if (queue->device->sqtt.stop_cs[family]) {
+      ws->cs_destroy(device->sqtt.stop_cs[family]);
+      device->sqtt.stop_cs[family] = NULL;
    }
 
    cs = ws->cs_create(ws, radv_queue_ring(queue), false);
@@ -784,7 +778,7 @@ radv_end_thread_trace(struct radv_queue *queue)
       radv_perfcounter_emit_spm_stop(device, cs, family);
 
    /* Stop SQTT. */
-   radv_emit_thread_trace_stop(device, cs, family);
+   radv_emit_sqtt_stop(device, cs, family);
 
    radv_perfcounter_emit_spm_reset(cs);
 
@@ -800,19 +794,19 @@ radv_end_thread_trace(struct radv_queue *queue)
       return false;
    }
 
-   device->thread_trace.stop_cs[family] = cs;
+   device->sqtt.stop_cs[family] = cs;
 
    return radv_queue_internal_submit(queue, cs);
 }
 
 bool
-radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace)
+radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_trace)
 {
    struct radv_device *device = queue->device;
    struct radeon_info *rad_info = &device->physical_device->rad_info;
 
-   if (!ac_sqtt_get_trace(&device->thread_trace, rad_info, thread_trace)) {
-      if (!radv_thread_trace_resize_bo(device))
+   if (!ac_sqtt_get_trace(&device->sqtt, rad_info, sqtt_trace)) {
+      if (!radv_sqtt_resize_bo(device))
          fprintf(stderr, "radv: Failed to resize the SQTT buffer.\n");
       return false;
    }
@@ -821,10 +815,10 @@ radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_t
 }
 
 void
-radv_reset_thread_trace(struct radv_device *device)
+radv_reset_sqtt_trace(struct radv_device *device)
 {
-   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
-   struct rgp_clock_calibration *clock_calibration = &thread_trace_data->rgp_clock_calibration;
+   struct ac_sqtt *sqtt = &device->sqtt;
+   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
 
    /* Clear clock calibration records. */
    simple_mtx_lock(&clock_calibration->lock);
@@ -869,7 +863,7 @@ radv_get_calibrated_timestamps(struct radv_device *device, uint64_t *cpu_timesta
 }
 
 bool
-radv_thread_trace_sample_clocks(struct radv_device *device)
+radv_sqtt_sample_clocks(struct radv_device *device)
 {
    uint64_t cpu_timestamp = 0, gpu_timestamp = 0;
    VkResult result;
@@ -878,5 +872,5 @@ radv_thread_trace_sample_clocks(struct radv_device *device)
    if (result != VK_SUCCESS)
       return false;
 
-   return ac_sqtt_add_clock_calibration(&device->thread_trace, cpu_timestamp, gpu_timestamp);
+   return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp);
 }
index 7ff5728..c91174d 100644 (file)
@@ -1211,13 +1211,13 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
       simple_mtx_unlock(&sscreen->async_compute_context_lock);
    }
 
-   if (unlikely(sctx->thread_trace_enabled))
+   if (unlikely(sctx->sqtt_enabled))
       sctx->sqtt_next_event = EventCmdResolveImage;
 
    if (si_msaa_resolve_blit_via_CB(ctx, info))
       return;
 
-   if (unlikely(sctx->thread_trace_enabled))
+   if (unlikely(sctx->sqtt_enabled))
       sctx->sqtt_next_event = EventCmdCopyImage;
 
    /* Using compute for copying to a linear texture in GTT is much faster than
@@ -1252,7 +1252,7 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
                              info->src.box.z, info->src.box.z + info->src.box.depth - 1,
                              false);
 
-   if (unlikely(sctx->thread_trace_enabled))
+   if (unlikely(sctx->sqtt_enabled))
       sctx->sqtt_next_event = EventCmdBlitImage;
 
    si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
index 2c333e9..548c0d4 100644 (file)
@@ -1186,7 +1186,7 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
          sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
    }
 
-   if (unlikely(sctx->thread_trace_enabled)) {
+   if (unlikely(sctx->sqtt_enabled)) {
       if (buffers & PIPE_CLEAR_COLOR)
          sctx->sqtt_next_event = EventCmdClearColorImage;
       else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
index 9ed4f90..1670a11 100644 (file)
@@ -337,14 +337,13 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
    sctx->compute_shaderbuf_sgprs_dirty = true;
    sctx->compute_image_sgprs_dirty = true;
 
-   if (unlikely((sctx->screen->debug_flags & DBG(SQTT)) && sctx->thread_trace)) {
+   if (unlikely((sctx->screen->debug_flags & DBG(SQTT)) && sctx->sqtt)) {
       uint32_t pipeline_code_hash = _mesa_hash_data_with_seed(
          program->shader.binary.elf_buffer,
          program->shader.binary.elf_size,
          0);
 
-      struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-      if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
+      if (!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline_code_hash)) {
          /* Short lived fake pipeline: we don't need to reupload the compute shaders,
           * as we do for the gfx ones so just create a temp pipeline to be able to
           * call si_sqtt_register_pipeline, and then drop it.
@@ -769,7 +768,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
    if (sctx->gfx_level >= GFX10 && waves_per_threadgroup == 1)
       threadgroups_per_cu = 2;
 
-   if (unlikely(sctx->thread_trace_enabled)) {
+   if (unlikely(sctx->sqtt_enabled)) {
       si_write_event_with_dims_marker(sctx, &sctx->gfx_cs,
                                       info->indirect ? EventCmdDispatchIndirect : EventCmdDispatch,
                                       info->grid[0], info->grid[1], info->grid[2]);
@@ -839,7 +838,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
       radeon_emit(dispatch_initiator);
    }
 
-   if (unlikely(sctx->thread_trace_enabled && sctx->gfx_level >= GFX9)) {
+   if (unlikely(sctx->sqtt_enabled && sctx->gfx_level >= GFX9)) {
       radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
       radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
    }
index 3dea23c..61d4b01 100644 (file)
@@ -489,9 +489,8 @@ static void si_flush_all_queues(struct pipe_context *ctx,
 
       tc_driver_internal_flush_notify(sctx->tc);
 
-      if (unlikely(sctx->thread_trace &&
-                   (flags & PIPE_FLUSH_END_OF_FRAME))) {
-         si_handle_thread_trace(sctx, &sctx->gfx_cs);
+      if (unlikely(sctx->sqtt && (flags & PIPE_FLUSH_END_OF_FRAME))) {
+         si_handle_sqtt(sctx, &sctx->gfx_cs);
       }
    } else {
       /* Instead of flushing, create a deferred fence. Constraints:
index 698d984..683790c 100644 (file)
@@ -170,9 +170,8 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
       si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, AMD_IP_GFX);
    }
 
-   if (unlikely(ctx->thread_trace &&
-                (flags & PIPE_FLUSH_END_OF_FRAME))) {
-      si_handle_thread_trace(ctx, &ctx->gfx_cs);
+   if (unlikely(ctx->sqtt && (flags & PIPE_FLUSH_END_OF_FRAME))) {
+      si_handle_sqtt(ctx, &ctx->gfx_cs);
    }
 
    if (ctx->current_saved_cs)
@@ -795,7 +794,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
          radeon_emit(0); /* DATA_HI */
          radeon_emit(0); /* INT_CTXID */
 
-         if (unlikely(ctx->thread_trace_enabled)) {
+         if (unlikely(ctx->sqtt_enabled)) {
             radeon_end();
             si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
             radeon_begin_again(cs);
@@ -815,7 +814,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
          radeon_emit(S_585_PWS_ENA(1));
          radeon_emit(gcr_cntl); /* GCR_CNTL */
 
-         if (unlikely(ctx->thread_trace_enabled)) {
+         if (unlikely(ctx->sqtt_enabled)) {
             radeon_end();
             si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
             radeon_begin_again(cs);
@@ -859,13 +858,13 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
                            EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
                            SI_NOT_QUERY);
 
-         if (unlikely(ctx->thread_trace_enabled)) {
+         if (unlikely(ctx->sqtt_enabled)) {
             si_sqtt_describe_barrier_start(ctx, &ctx->gfx_cs);
          }
 
          si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
 
-         if (unlikely(ctx->thread_trace_enabled)) {
+         if (unlikely(ctx->sqtt_enabled)) {
             si_sqtt_describe_barrier_end(ctx, &ctx->gfx_cs, flags);
          }
 
@@ -1071,13 +1070,13 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
                         EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
                         wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
 
-      if (unlikely(sctx->thread_trace_enabled)) {
+      if (unlikely(sctx->sqtt_enabled)) {
          si_sqtt_describe_barrier_start(sctx, &sctx->gfx_cs);
       }
 
       si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
 
-      if (unlikely(sctx->thread_trace_enabled)) {
+      if (unlikely(sctx->sqtt_enabled)) {
          si_sqtt_describe_barrier_end(sctx, &sctx->gfx_cs, sctx->flags);
       }
    }
index afd13ff..e2b246e 100644 (file)
@@ -209,13 +209,13 @@ static void si_destroy_context(struct pipe_context *context)
    if (sctx->gfx_level >= GFX10 && sctx->has_graphics)
       gfx10_destroy_query(sctx);
 
-   if (sctx->thread_trace) {
+   if (sctx->sqtt) {
       struct si_screen *sscreen = sctx->screen;
       if (sscreen->info.has_stable_pstate && sscreen->b.num_contexts == 1 &&
           !(sctx->context_flags & SI_CONTEXT_FLAG_AUX))
           sscreen->ws->cs_set_pstate(&sctx->gfx_cs, RADEON_CTX_PSTATE_NONE);
 
-      si_destroy_thread_trace(sctx);
+      si_destroy_sqtt(sctx);
    }
 
    pipe_resource_reference(&sctx->esgs_ring, NULL);
@@ -429,7 +429,7 @@ static void si_emit_string_marker(struct pipe_context *ctx, const char *string,
 
    dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number);
 
-   if (sctx->thread_trace_enabled)
+   if (sctx->sqtt_enabled)
       si_write_user_event(sctx, &sctx->gfx_cs, UserEventTrigger, string, len);
 
    if (sctx->log)
@@ -896,7 +896,7 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v
                          "detected. Force the GPU into a profiling mode with e.g. "
                          "\"echo profile_peak  > "
                          "/sys/class/drm/card0/device/power_dpm_force_performance_level\"\n");
-      } else if (!si_init_thread_trace((struct si_context *)ctx)) {
+      } else if (!si_init_sqtt((struct si_context *)ctx)) {
          FREE(ctx);
          return NULL;
       }
index 29bc59f..44a4f3e 100644 (file)
@@ -1319,11 +1319,11 @@ struct si_context {
    void (*emit_spi_map[33])(struct si_context *sctx);
 
    /* SQTT */
-   struct ac_thread_trace_data *thread_trace;
+   struct ac_sqtt *sqtt;
    struct ac_spm spm;
    struct pipe_fence_handle *last_sqtt_fence;
    enum rgp_sqtt_marker_event_type sqtt_next_event;
-   bool thread_trace_enabled;
+   bool sqtt_enabled;
 
    unsigned context_flags;
 
@@ -1666,7 +1666,7 @@ void si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *r
                                 uint32_t instance_offset_user_data,
                                 uint32_t draw_index_user_data);
 bool si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute);
-bool si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
+bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
                                     uint64_t pipeline_hash);
 void si_sqtt_describe_pipeline_bind(struct si_context* sctx, uint64_t pipeline_hash, int bind_point);
 void
@@ -1681,9 +1681,9 @@ void
 si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs);
 void
 si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs, unsigned flags);
-bool si_init_thread_trace(struct si_context *sctx);
-void si_destroy_thread_trace(struct si_context *sctx);
-void si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs);
+bool si_init_sqtt(struct si_context *sctx);
+void si_destroy_sqtt(struct si_context *sctx);
+void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs);
 
 /*
  * common helpers
index d9c91c1..8fd6f9a 100644 (file)
@@ -39,825 +39,784 @@ static void
 si_emit_spi_config_cntl(struct si_context* sctx,
                         struct radeon_cmdbuf *cs, bool enable);
 
-static bool
-si_thread_trace_init_bo(struct si_context *sctx)
-{
-   unsigned max_se = sctx->screen->info.max_se;
-   struct radeon_winsys *ws = sctx->ws;
-   uint64_t size;
-
-   /* The buffer size and address need to be aligned in HW regs. Align the
-    * size as early as possible so that we do all the allocation & addressing
-    * correctly. */
-   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
-                                             1u << SQTT_BUFFER_ALIGN_SHIFT);
-
-   /* Compute total size of the thread trace BO for all SEs. */
-   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
-                  1 << SQTT_BUFFER_ALIGN_SHIFT);
-   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;
-
-   sctx->thread_trace->pipeline_bos = _mesa_hash_table_u64_create(NULL);
-
-   sctx->thread_trace->bo =
-      ws->buffer_create(ws, size, 4096,
-                        RADEON_DOMAIN_VRAM,
+static bool si_sqtt_init_bo(struct si_context *sctx) {
+  unsigned max_se = sctx->screen->info.max_se;
+  struct radeon_winsys *ws = sctx->ws;
+  uint64_t size;
+
+  /* The buffer size and address need to be aligned in HW regs. Align the
+   * size as early as possible so that we do all the allocation & addressing
+   * correctly. */
+  sctx->sqtt->buffer_size =
+      align64(sctx->sqtt->buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);
+
+  /* Compute total size of the thread trace BO for all SEs. */
+  size = align64(sizeof(struct ac_sqtt_data_info) * max_se,
+                 1 << SQTT_BUFFER_ALIGN_SHIFT);
+  size += sctx->sqtt->buffer_size * (uint64_t)max_se;
+
+  sctx->sqtt->pipeline_bos = _mesa_hash_table_u64_create(NULL);
+
+  sctx->sqtt->bo =
+      ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
                         RADEON_FLAG_NO_INTERPROCESS_SHARING |
-                        RADEON_FLAG_GTT_WC |
-                        RADEON_FLAG_NO_SUBALLOC);
-   if (!sctx->thread_trace->bo)
-      return false;
+                            RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_SUBALLOC);
+  if (!sctx->sqtt->bo)
+    return false;
 
-   return true;
+  return true;
 }
 
-static void
-si_emit_thread_trace_start(struct si_context* sctx,
-                           struct radeon_cmdbuf *cs,
-                           uint32_t queue_family_index)
-{
-   struct si_screen *sscreen = sctx->screen;
-   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
-   unsigned max_se = sscreen->info.max_se;
-
-   radeon_begin(cs);
-
-   for (unsigned se = 0; se < max_se; se++) {
-      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
-      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
-      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
-
-      if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
-         continue;
-
-      /* Target SEx and SH0. */
-      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
-                             S_030800_SE_INDEX(se) |
-                             S_030800_SH_INDEX(0) |
-                             S_030800_INSTANCE_BROADCAST_WRITES(1));
-
-      /* Select the first active CUs */
-      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);
-
-      if (sctx->gfx_level >= GFX10) {
-         uint32_t token_mask = V_008D18_REG_INCLUDE_SQDEC |
-                               V_008D18_REG_INCLUDE_SHDEC |
-                               V_008D18_REG_INCLUDE_GFXUDEC |
-                               V_008D18_REG_INCLUDE_CONTEXT |
-                               V_008D18_REG_INCLUDE_COMP |
-                               V_008D18_REG_INCLUDE_CONFIG;
-         int wgp = first_active_cu / 2;
-         unsigned shader_mask = 0x7f; /* all shader stages */
-
-         /* Order seems important for the following 2 registers. */
-         if (sctx->gfx_level >= GFX11) {
-            /* Disable unsupported hw shader stages */
-            shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
-
-            radeon_set_uconfig_reg(R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
-                                             S_0367A4_SIZE(shifted_size) |
-                                             S_0367A4_BASE_HI(shifted_va >> 32));
-
-            radeon_set_uconfig_reg(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);
-
-            radeon_set_uconfig_reg(R_0367B4_SQ_THREAD_TRACE_MASK,
-                                             S_0367B4_WTYPE_INCLUDE(shader_mask) |
-                                             S_0367B4_SA_SEL(0) |
-                                             S_0367B4_WGP_SEL(wgp) |
-                                             S_0367B4_SIMD_SEL(0));
-
-            radeon_set_uconfig_reg(R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK,
-                         S_0367B8_REG_INCLUDE(token_mask) |
-                         S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
-         } else {
-            radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
-                                             S_008D04_SIZE(shifted_size) |
-                                             S_008D04_BASE_HI(shifted_va >> 32));
-
-            radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);
-
-            radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
-                                             S_008D14_WTYPE_INCLUDE(shader_mask) |
-                                             S_008D14_SA_SEL(0) |
-                                             S_008D14_WGP_SEL(wgp) |
-                                             S_008D14_SIMD_SEL(0));
-
-            radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
-                         S_008D18_REG_INCLUDE(token_mask) |
-                         S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
-         }
-
-         /* Should be emitted last (it enables thread traces). */
-         uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
-                         S_008D1C_RT_FREQ(2) | /* 4096 clk */S_008D1C_DRAW_EVENT_EN(1);
-
-         if (sctx->gfx_level == GFX10_3)
-            ctrl |= S_008D1C_LOWATER_OFFSET(4);
-
-         ctrl |= S_008D1C_AUTO_FLUSH_MODE(sctx->screen->info.has_sqtt_auto_flush_mode_bug);
-
-         switch (sctx->gfx_level) {
-            case GFX10:
-            case GFX10_3:
-               ctrl |= S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
-                       S_008D1C_SQ_STALL_EN(1) |S_008D1C_REG_DROP_ON_STALL(0);
-               radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl);
-               break;
-            case GFX11:
-               ctrl |= S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) |
-                       S_0367B0_REG_AT_HWM(2);
-               radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, ctrl);
-               break;
-            default:
-               assert(false);
-         }
+static void si_emit_sqtt_start(struct si_context *sctx,
+                               struct radeon_cmdbuf *cs,
+                               uint32_t queue_family_index) {
+  struct si_screen *sscreen = sctx->screen;
+  uint32_t shifted_size = sctx->sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
+  unsigned max_se = sscreen->info.max_se;
+
+  radeon_begin(cs);
+
+  for (unsigned se = 0; se < max_se; se++) {
+    uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
+    uint64_t data_va =
+        ac_sqtt_get_data_va(&sctx->screen->info, sctx->sqtt, va, se);
+    uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
+
+    if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
+      continue;
+
+    /* Target SEx and SH0. */
+    radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
+                           S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
+                               S_030800_INSTANCE_BROADCAST_WRITES(1));
+
+    /* Select the first active CUs */
+    int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);
+
+    if (sctx->gfx_level >= GFX10) {
+      uint32_t token_mask =
+          V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
+          V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_CONTEXT |
+          V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONFIG;
+      int wgp = first_active_cu / 2;
+      unsigned shader_mask = 0x7f; /* all shader stages */
+
+      /* Order seems important for the following 2 registers. */
+      if (sctx->gfx_level >= GFX11) {
+        /* Disable unsupported hw shader stages */
+        shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
+
+        radeon_set_uconfig_reg(R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
+                               S_0367A4_SIZE(shifted_size) |
+                                   S_0367A4_BASE_HI(shifted_va >> 32));
+
+        radeon_set_uconfig_reg(R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);
+
+        radeon_set_uconfig_reg(R_0367B4_SQ_THREAD_TRACE_MASK,
+                               S_0367B4_WTYPE_INCLUDE(shader_mask) |
+                                   S_0367B4_SA_SEL(0) | S_0367B4_WGP_SEL(wgp) |
+                                   S_0367B4_SIMD_SEL(0));
+
+        radeon_set_uconfig_reg(
+            R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK,
+            S_0367B8_REG_INCLUDE(token_mask) |
+                S_0367B8_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
       } else {
-         /* Order seems important for the following 4 registers. */
-         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
-                                S_030CDC_ADDR_HI(shifted_va >> 32));
-
-         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);
-
-         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
-                                S_030CC4_SIZE(shifted_size));
-
-         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
-                                S_030CD4_RESET_BUFFER(1));
-
-         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
-                                      S_030CC8_SH_SEL(0) |
-                                      S_030CC8_SIMD_EN(0xf) |
-                                      S_030CC8_VM_ID_MASK(0) |
-                                      S_030CC8_REG_STALL_EN(1) |
-                                      S_030CC8_SPI_STALL_EN(1) |
-                                      S_030CC8_SQ_STALL_EN(1);
-
-         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
-                                thread_trace_mask);
-
-         /* Trace all tokens and registers. */
-         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
-                                S_030CCC_TOKEN_MASK(0xbfff) |
-                                S_030CCC_REG_MASK(0xff) |
-                                S_030CCC_REG_DROP_ON_STALL(0));
-
-         /* Enable SQTT perf counters for all CUs. */
-         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
-                                S_030CD0_SH0_MASK(0xffff) |
-                                S_030CD0_SH1_MASK(0xffff));
-
-         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);
-
-         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
-                                S_030CEC_HIWATER(4));
-
-         if (sctx->gfx_level == GFX9) {
-            /* Reset thread trace status errors. */
-            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
-                                   S_030CE8_UTC_ERROR(0));
-         }
-
-         /* Enable the thread trace mode. */
-         uint32_t thread_trace_mode =
-            S_030CD8_MASK_PS(1) |
-            S_030CD8_MASK_VS(1) |
-            S_030CD8_MASK_GS(1) |
-            S_030CD8_MASK_ES(1) |
-            S_030CD8_MASK_HS(1) |
-            S_030CD8_MASK_LS(1) |
-            S_030CD8_MASK_CS(1) |
-            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
-            S_030CD8_MODE(1);
-
-         if (sctx->gfx_level == GFX9) {
-            /* Count SQTT traffic in TCC perf counters. */
-            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
-         }
-
-         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
-                                thread_trace_mode);
+        radeon_set_privileged_config_reg(
+            R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
+            S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));
+
+        radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE,
+                                         shifted_va);
+
+        radeon_set_privileged_config_reg(
+            R_008D14_SQ_THREAD_TRACE_MASK,
+            S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
+                S_008D14_WGP_SEL(wgp) | S_008D14_SIMD_SEL(0));
+
+        radeon_set_privileged_config_reg(
+            R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
+            S_008D18_REG_INCLUDE(token_mask) |
+                S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
       }
-   }
-
-   /* Restore global broadcasting. */
-   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
-                          S_030800_SE_BROADCAST_WRITES(1) |
-                             S_030800_SH_BROADCAST_WRITES(1) |
-                             S_030800_INSTANCE_BROADCAST_WRITES(1));
-
-   /* Start the thread trace with a different event based on the queue. */
-   if (queue_family_index == AMD_IP_COMPUTE) {
-      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
-                        S_00B878_THREAD_TRACE_ENABLE(1));
-   } else {
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
-   }
-   radeon_end();
-}
 
-static const uint32_t gfx9_thread_trace_info_regs[] =
-{
-   R_030CE4_SQ_THREAD_TRACE_WPTR,
-   R_030CE8_SQ_THREAD_TRACE_STATUS,
-   R_030CF0_SQ_THREAD_TRACE_CNTR,
-};
+      /* Should be emitted last (it enables thread traces). */
+      uint32_t ctrl = S_008D1C_MODE(1) | S_008D1C_HIWATER(5) |
+                      S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) |
+                      /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1);
+
+      if (sctx->gfx_level == GFX10_3)
+        ctrl |= S_008D1C_LOWATER_OFFSET(4);
+
+      ctrl |= S_008D1C_AUTO_FLUSH_MODE(
+          sctx->screen->info.has_sqtt_auto_flush_mode_bug);
+
+      switch (sctx->gfx_level) {
+      case GFX10:
+      case GFX10_3:
+        ctrl |= S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
+                S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);
+        radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, ctrl);
+        break;
+      case GFX11:
+        ctrl |= S_0367B0_SPI_STALL_EN(1) | S_0367B0_SQ_STALL_EN(1) |
+                S_0367B0_REG_AT_HWM(2);
+        radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, ctrl);
+        break;
+      default:
+        assert(false);
+      }
+    } else {
+      /* Order seems important for the following 4 registers. */
+      radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
+                             S_030CDC_ADDR_HI(shifted_va >> 32));
 
-static const uint32_t gfx10_thread_trace_info_regs[] =
-{
-   R_008D10_SQ_THREAD_TRACE_WPTR,
-   R_008D20_SQ_THREAD_TRACE_STATUS,
-   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
-};
+      radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);
 
-static const uint32_t gfx11_thread_trace_info_regs[] =
-{
-   R_0367BC_SQ_THREAD_TRACE_WPTR,
-   R_0367D0_SQ_THREAD_TRACE_STATUS,
-   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
-};
+      radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
+                             S_030CC4_SIZE(shifted_size));
 
+      radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
+                             S_030CD4_RESET_BUFFER(1));
 
-static void
-si_copy_thread_trace_info_regs(struct si_context* sctx,
-             struct radeon_cmdbuf *cs,
-             unsigned se_index)
-{
-   const uint32_t *thread_trace_info_regs = NULL;
-
-   switch (sctx->gfx_level) {
-   case GFX10_3:
-   case GFX10:
-      thread_trace_info_regs = gfx10_thread_trace_info_regs;
-      break;
-   case GFX11:
-      thread_trace_info_regs = gfx11_thread_trace_info_regs;
-      break;
-   case GFX9:
-      thread_trace_info_regs = gfx9_thread_trace_info_regs;
-      break;
-   default:
-      unreachable("Unsupported gfx_level");
-   }
+      uint32_t sqtt_mask = S_030CC8_CU_SEL(first_active_cu) |
+                           S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
+                           S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) |
+                           S_030CC8_SPI_STALL_EN(1) | S_030CC8_SQ_STALL_EN(1);
 
-   /* Get the VA where the info struct is stored for this SE. */
-   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
-   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);
+      radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);
 
-   radeon_begin(cs);
+      /* Trace all tokens and registers. */
+      radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
+                             S_030CCC_TOKEN_MASK(0xbfff) |
+                                 S_030CCC_REG_MASK(0xff) |
+                                 S_030CCC_REG_DROP_ON_STALL(0));
 
-   /* Copy back the info struct one DWORD at a time. */
-   for (unsigned i = 0; i < 3; i++) {
-      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
-      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
-                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
-                  COPY_DATA_WR_CONFIRM);
-      radeon_emit(thread_trace_info_regs[i] >> 2);
-      radeon_emit(0); /* unused */
-      radeon_emit((info_va + i * 4));
-      radeon_emit((info_va + i * 4) >> 32);
-   }
+      /* Enable SQTT perf counters for all CUs. */
+      radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
+                             S_030CD0_SH0_MASK(0xffff) |
+                                 S_030CD0_SH1_MASK(0xffff));
 
-   if (sctx->gfx_level == GFX11) {
-      /* On GFX11, WPTR is incremented from the offset of the current buffer base address and it
-       * needs to be subtracted to get the correct offset:
-       *
-       * 1) get the current buffer base address for this SE
-       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
-       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
-       */
-      uint64_t data_va =
-         ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se_index);
-      uint64_t shifted_data_va = (data_va >> 5);
-      uint64_t init_wptr_value = shifted_data_va & 0x1fffffff;
-
-      radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
-      radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
-      radeon_emit(info_va);
-      radeon_emit(info_va >> 32);
-      radeon_emit(init_wptr_value);
-      radeon_emit(init_wptr_value >> 32);
-      radeon_emit(0);
-      radeon_emit(0);
-      radeon_emit(0);
-   }
-
-   radeon_end();
-}
+      radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);
 
+      radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
+                             S_030CEC_HIWATER(4));
 
+      if (sctx->gfx_level == GFX9) {
+        /* Reset thread trace status errors. */
+        radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
+                               S_030CE8_UTC_ERROR(0));
+      }
 
-static void
-si_emit_thread_trace_stop(struct si_context *sctx,
-                          struct radeon_cmdbuf *cs,
-                          uint32_t queue_family_index)
-{
-   unsigned max_se = sctx->screen->info.max_se;
+      /* Enable the thread trace mode. */
+      uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) |
+                           S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
+                           S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) |
+                           S_030CD8_MASK_CS(1) |
+                           S_030CD8_AUTOFLUSH_EN(
+                               1) | /* periodically flush SQTT data to memory */
+                           S_030CD8_MODE(1);
+
+      if (sctx->gfx_level == GFX9) {
+        /* Count SQTT traffic in TCC perf counters. */
+        sqtt_mode |= S_030CD8_TC_PERF_EN(1);
+      }
 
-   radeon_begin(cs);
+      radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
+    }
+  }
 
-   /* Stop the thread trace with a different event based on the queue. */
-   if (queue_family_index == AMD_IP_COMPUTE) {
-      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
-                        S_00B878_THREAD_TRACE_ENABLE(0));
-   } else {
-      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
-      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
-   }
+  /* Restore global broadcasting. */
+  radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
+                         S_030800_SE_BROADCAST_WRITES(1) |
+                             S_030800_SH_BROADCAST_WRITES(1) |
+                             S_030800_INSTANCE_BROADCAST_WRITES(1));
 
-   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
-   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
-   radeon_end();
+  /* Start the thread trace with a different event based on the queue. */
+  if (queue_family_index == AMD_IP_COMPUTE) {
+    radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
+                      S_00B878_THREAD_TRACE_ENABLE(1));
+  } else {
+    radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+    radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
+  }
+  radeon_end();
+}
 
-   if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
-      /* Some chips with disabled RBs should wait for idle because FINISH_DONE doesn't work. */
-      sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-                     SI_CONTEXT_FLUSH_AND_INV_DB |
-                     SI_CONTEXT_CS_PARTIAL_FLUSH;
-      sctx->emit_cache_flush(sctx, cs);
-   }
+/* GFX9 SQTT registers read back by si_copy_sqtt_info_regs(): entry i is
+ * copied to DWORD i of the per-SE info struct (WPTR, STATUS, CNTR). */
+static const uint32_t gfx9_sqtt_info_regs[] = {
+    R_030CE4_SQ_THREAD_TRACE_WPTR,
+    R_030CE8_SQ_THREAD_TRACE_STATUS,
+    R_030CF0_SQ_THREAD_TRACE_CNTR,
+};
 
-   for (unsigned se = 0; se < max_se; se++) {
-      if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
-         continue;
+/* GFX10 / GFX10.3 SQTT registers read back by si_copy_sqtt_info_regs():
+ * entry i is copied to DWORD i of the per-SE info struct
+ * (WPTR, STATUS, DROPPED_CNTR). */
+static const uint32_t gfx10_sqtt_info_regs[] = {
+    R_008D10_SQ_THREAD_TRACE_WPTR,
+    R_008D20_SQ_THREAD_TRACE_STATUS,
+    R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
+};
 
-      radeon_begin(cs);
+/* GFX11 SQTT registers read back by si_copy_sqtt_info_regs(): entry i is
+ * copied to DWORD i of the per-SE info struct
+ * (WPTR, STATUS, DROPPED_CNTR). */
+static const uint32_t gfx11_sqtt_info_regs[] = {
+    R_0367BC_SQ_THREAD_TRACE_WPTR,
+    R_0367D0_SQ_THREAD_TRACE_STATUS,
+    R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
+};
 
-      /* Target SEi and SH0. */
-      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
-                             S_030800_SE_INDEX(se) |
-                             S_030800_SH_INDEX(0) |
-                             S_030800_INSTANCE_BROADCAST_WRITES(1));
+/* Emit the packets that copy the three SQTT info registers of one shader
+ * engine into that engine's info struct inside the SQTT buffer.
+ *
+ * sctx:     context owning the SQTT buffer
+ * cs:       command stream receiving the packets
+ * se_index: shader engine whose info struct is written
+ */
+static void si_copy_sqtt_info_regs(struct si_context *sctx,
+                                   struct radeon_cmdbuf *cs,
+                                   unsigned se_index) {
+  const uint32_t *regs = NULL;
+
+  /* Select the register table for this hardware generation. */
+  if (sctx->gfx_level == GFX9) {
+    regs = gfx9_sqtt_info_regs;
+  } else if (sctx->gfx_level == GFX10 || sctx->gfx_level == GFX10_3) {
+    regs = gfx10_sqtt_info_regs;
+  } else if (sctx->gfx_level == GFX11) {
+    regs = gfx11_sqtt_info_regs;
+  } else {
+    unreachable("Unsupported gfx_level");
+  }
+
+  /* Locate the info struct of this SE inside the SQTT buffer. */
+  const uint64_t buffer_va =
+      sctx->ws->buffer_get_virtual_address(sctx->sqtt->bo);
+  const uint64_t info_va = ac_sqtt_get_info_va(buffer_va, se_index);
+
+  radeon_begin(cs);
+
+  /* One COPY_DATA packet per register; each lands in the next DWORD. */
+  for (unsigned dw = 0; dw < 3; dw++) {
+    const uint64_t dst_va = info_va + dw * 4;
+
+    radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
+    radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
+                COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
+    radeon_emit(regs[dw] >> 2);
+    radeon_emit(0); /* unused */
+    radeon_emit(dst_va);
+    radeon_emit(dst_va >> 32);
+  }
+
+  if (sctx->gfx_level == GFX11) {
+    /* GFX11 reports WPTR relative to the start of the address space rather
+     * than to this SE's buffer, so the buffer base is subtracted in place:
+     *
+     * - take the data buffer base address of this SE,
+     * - shift it right by 5 because SQ_THREAD_TRACE_WPTR counts 32-byte units,
+     * - keep the low 29 bits because WPTR.OFFSET is 29 bits wide.
+     */
+    const uint64_t data_va = ac_sqtt_get_data_va(&sctx->screen->info,
+                                                 sctx->sqtt, buffer_va,
+                                                 se_index);
+    const uint64_t wptr_base = (data_va >> 5) & 0x1fffffff;
+
+    radeon_emit(PKT3(PKT3_ATOMIC_MEM, 7, 0));
+    radeon_emit(ATOMIC_OP(TC_OP_ATOMIC_SUB_32));
+    radeon_emit(info_va);
+    radeon_emit(info_va >> 32);
+    radeon_emit(wptr_base);
+    radeon_emit(wptr_base >> 32);
+    radeon_emit(0);
+    radeon_emit(0);
+    radeon_emit(0);
+  }
+
+  radeon_end();
+}
 
-      if (sctx->gfx_level >= GFX10) {
-         uint32_t tt_status_reg = sctx->gfx_level >= GFX11 ? R_0367D0_SQ_THREAD_TRACE_STATUS :
-                                                             R_008D20_SQ_THREAD_TRACE_STATUS;
-         if (!sctx->screen->info.has_sqtt_rb_harvest_bug) {
-            /* Make sure to wait for the trace buffer. */
-            radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-            radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
-            radeon_emit(tt_status_reg >> 2);  /* register */
-            radeon_emit(0);
-            radeon_emit(0); /* reference value */
-            radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_FINISH_DONE : ~C_008D20_FINISH_DONE); /* mask */
-            radeon_emit(4); /* poll interval */
-         }
-
-         /* Disable the thread trace mode. */
-         if (sctx->gfx_level >= GFX11)
-            radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0));
-         else
-            radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0));
-
-         /* Wait for thread trace completion. */
-         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
-         radeon_emit(tt_status_reg >> 2);  /* register */
-         radeon_emit(0);
-         radeon_emit(0); /* reference value */
-         radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY : ~C_008D20_BUSY); /* mask */
-         radeon_emit(4); /* poll interval */
-      } else {
-         /* Disable the thread trace mode. */
-         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
-                                S_030CD8_MODE(0));
-
-         /* Wait for thread trace completion. */
-         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
-         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
-         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
-         radeon_emit(0);
-         radeon_emit(0); /* reference value */
-         radeon_emit(~C_030CE8_BUSY); /* mask */
-         radeon_emit(4); /* poll interval */
+/* Emit the packets that stop SQTT, wait for every enabled shader engine to
+ * finish tracing, and copy each engine's info registers back to memory.
+ *
+ * sctx:               context providing screen info and the flush callback
+ * cs:                 command stream receiving the packets
+ * queue_family_index: AMD_IP_COMPUTE clears the compute enable register;
+ *                     any other value emits the THREAD_TRACE_STOP event
+ */
+static void si_emit_sqtt_stop(struct si_context *sctx, struct radeon_cmdbuf *cs,
+                              uint32_t queue_family_index) {
+  unsigned max_se = sctx->screen->info.max_se;
+
+  radeon_begin(cs);
+
+  /* Stop the thread trace with a different event based on the queue. */
+  if (queue_family_index == AMD_IP_COMPUTE) {
+    radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
+                      S_00B878_THREAD_TRACE_ENABLE(0));
+  } else {
+    radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+    radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
+  }
+
+  radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
+  radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
+  radeon_end();
+
+  if (sctx->screen->info.has_sqtt_rb_harvest_bug) {
+    /* Some chips with disabled RBs should wait for idle because FINISH_DONE
+     * doesn't work. */
+    sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB |
+                   SI_CONTEXT_CS_PARTIAL_FLUSH;
+    sctx->emit_cache_flush(sctx, cs);
+  }
+
+  /* Per-SE teardown: disabled shader engines are skipped entirely. */
+  for (unsigned se = 0; se < max_se; se++) {
+    if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
+      continue;
+
+    radeon_begin(cs);
+
+    /* Target SEi and SH0. */
+    radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
+                           S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) |
+                               S_030800_INSTANCE_BROADCAST_WRITES(1));
+
+    if (sctx->gfx_level >= GFX10) {
+      uint32_t tt_status_reg = sctx->gfx_level >= GFX11
+                                   ? R_0367D0_SQ_THREAD_TRACE_STATUS
+                                   : R_008D20_SQ_THREAD_TRACE_STATUS;
+      if (!sctx->screen->info.has_sqtt_rb_harvest_bug) {
+        /* Make sure to wait for the trace buffer. */
+        radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+        radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs
+                                                from the reference value */
+        radeon_emit(tt_status_reg >> 2);     /* register */
+        radeon_emit(0);
+        radeon_emit(0); /* reference value */
+        radeon_emit(sctx->gfx_level >= GFX11
+                        ? ~C_0367D0_FINISH_DONE
+                        : ~C_008D20_FINISH_DONE); /* mask */
+        radeon_emit(4);                           /* poll interval */
       }
-      radeon_end();
 
-      si_copy_thread_trace_info_regs(sctx, cs, se);
-   }
-
-   /* Restore global broadcasting. */
-   radeon_begin_again(cs);
-   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
-                          S_030800_SE_BROADCAST_WRITES(1) |
+      /* Disable the thread trace mode. */
+      /* NOTE(review): the GFX11 branch pairs the R_0367B0 register with the
+       * S_008D1C_MODE field macro; presumably harmless because the value is
+       * MODE(0) - confirm against the register headers. */
+      if (sctx->gfx_level >= GFX11)
+        radeon_set_uconfig_reg(R_0367B0_SQ_THREAD_TRACE_CTRL, S_008D1C_MODE(0));
+      else
+        radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
+                                         S_008D1C_MODE(0));
+
+      /* Wait for thread trace completion. */
+      radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+      radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to
+                                          the reference value */
+      radeon_emit(tt_status_reg >> 2); /* register */
+      radeon_emit(0);
+      radeon_emit(0); /* reference value */
+      radeon_emit(sctx->gfx_level >= GFX11 ? ~C_0367D0_BUSY
+                                           : ~C_008D20_BUSY); /* mask */
+      radeon_emit(4); /* poll interval */
+    } else {
+      /* Disable the thread trace mode. */
+      radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));
+
+      /* Wait for thread trace completion. */
+      radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+      radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to
+                                          the reference value */
+      radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
+      radeon_emit(0);
+      radeon_emit(0);              /* reference value */
+      radeon_emit(~C_030CE8_BUSY); /* mask */
+      radeon_emit(4);              /* poll interval */
+    }
+    radeon_end();
+
+    /* Copy this SE's info registers while GRBM_GFX_INDEX still targets it. */
+    si_copy_sqtt_info_regs(sctx, cs, se);
+  }
+
+  /* Restore global broadcasting. */
+  radeon_begin_again(cs);
+  radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
+                         S_030800_SE_BROADCAST_WRITES(1) |
                              S_030800_SH_BROADCAST_WRITES(1) |
                              S_030800_INSTANCE_BROADCAST_WRITES(1));
-   radeon_end();
+  radeon_end();
 }
 
-static void
-si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
-{
-   struct radeon_winsys *ws = sctx->ws;
-
-   radeon_begin(cs);
-
-   switch (family) {
-      case AMD_IP_GFX:
-         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
-         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
-         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
-         break;
-      case AMD_IP_COMPUTE:
-         radeon_emit(PKT3(PKT3_NOP, 0, 0));
-         radeon_emit(0);
-         break;
-   }
-   radeon_end();
-
-   ws->cs_add_buffer(cs,
-                     sctx->thread_trace->bo,
-                     RADEON_USAGE_READWRITE,
-                     RADEON_DOMAIN_VRAM);
-   if (sctx->spm.bo)
-      ws->cs_add_buffer(cs,
-                        sctx->spm.bo,
-                        RADEON_USAGE_READWRITE,
-                        RADEON_DOMAIN_VRAM);
-
-   si_cp_dma_wait_for_idle(sctx, cs);
-
-   /* Make sure to wait-for-idle before starting SQTT. */
-   sctx->flags |=
-      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
-   sctx->emit_cache_flush(sctx, cs);
-
-   si_inhibit_clockgating(sctx, cs, true);
-
-   /* Enable SQG events that collects thread trace data. */
-   si_emit_spi_config_cntl(sctx, cs, true);
-
-   if (sctx->spm.bo) {
-      si_pc_emit_spm_reset(cs);
-      si_pc_emit_shaders(cs, 0x7f);
-      si_emit_spm_setup(sctx, cs);
-   }
-
-   si_emit_thread_trace_start(sctx, cs, family);
-
-   if (sctx->spm.bo)
-      si_pc_emit_spm_start(cs);
+/* Record the complete "start SQTT" sequence into cs: preamble, buffer
+ * references, idle wait, clock-gating inhibit, SQG event enable, optional
+ * SPM setup, and finally the SQTT (and SPM) start packets.
+ *
+ * sctx:   context owning the SQTT and SPM state
+ * family: AMD_IP_GFX or AMD_IP_COMPUTE; any other value emits no preamble
+ * cs:     command stream being recorded
+ */
+static void si_sqtt_start(struct si_context *sctx, int family,
+                          struct radeon_cmdbuf *cs) {
+  struct radeon_winsys *winsys = sctx->ws;
+
+  /* Queue-specific preamble. */
+  radeon_begin(cs);
+  if (family == AMD_IP_GFX) {
+    radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+    radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
+    radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
+  } else if (family == AMD_IP_COMPUTE) {
+    radeon_emit(PKT3(PKT3_NOP, 0, 0));
+    radeon_emit(0);
+  }
+  radeon_end();
+
+  /* Reference the buffers the recorded packets will touch. */
+  winsys->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
+                        RADEON_DOMAIN_VRAM);
+  if (sctx->spm.bo) {
+    winsys->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
+                          RADEON_DOMAIN_VRAM);
+  }
+
+  si_cp_dma_wait_for_idle(sctx, cs);
+
+  /* Make sure to wait-for-idle before starting SQTT. */
+  sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                 SI_CONTEXT_CS_PARTIAL_FLUSH |
+                 SI_CONTEXT_INV_ICACHE |
+                 SI_CONTEXT_INV_SCACHE |
+                 SI_CONTEXT_INV_VCACHE |
+                 SI_CONTEXT_INV_L2 |
+                 SI_CONTEXT_PFP_SYNC_ME;
+  sctx->emit_cache_flush(sctx, cs);
+
+  si_inhibit_clockgating(sctx, cs, true);
+
+  /* Enable SQG events that collects thread trace data. */
+  si_emit_spi_config_cntl(sctx, cs, true);
+
+  /* SPM is programmed before SQTT starts and kicked off right after it. */
+  if (sctx->spm.bo) {
+    si_pc_emit_spm_reset(cs);
+    si_pc_emit_shaders(cs, 0x7f);
+    si_emit_spm_setup(sctx, cs);
+  }
+
+  si_emit_sqtt_start(sctx, cs, family);
+
+  if (sctx->spm.bo) {
+    si_pc_emit_spm_start(cs);
+  }
 }
 
-static void
-si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
-{
-   struct radeon_winsys *ws = sctx->ws;
+/* Record the complete "stop SQTT" sequence into cs: preamble, buffer
+ * references, SPM stop (when an SPM buffer exists), idle wait, the SQTT stop
+ * packets, SPM reset, and finally the restore of SQG events and clock gating.
+ *
+ * sctx:   context owning the SQTT and SPM state
+ * family: AMD_IP_GFX or AMD_IP_COMPUTE; any other value emits no preamble
+ * cs:     command stream being recorded
+ */
+static void si_sqtt_stop(struct si_context *sctx, int family,
+                         struct radeon_cmdbuf *cs) {
+  struct radeon_winsys *ws = sctx->ws;
 
-   radeon_begin(cs);
+  radeon_begin(cs);
 
-   switch (family) {
-      case AMD_IP_GFX:
-         radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
-         radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
-         radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
-         break;
-      case AMD_IP_COMPUTE:
-         radeon_emit(PKT3(PKT3_NOP, 0, 0));
-         radeon_emit(0);
-         break;
-   }
-   radeon_end();
+  switch (family) {
+  case AMD_IP_GFX:
+    radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
+    radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
+    radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
+    break;
+  case AMD_IP_COMPUTE:
+    radeon_emit(PKT3(PKT3_NOP, 0, 0));
+    radeon_emit(0);
+    break;
+  }
+  radeon_end();
 
-   ws->cs_add_buffer(cs,
-                     sctx->thread_trace->bo,
-                     RADEON_USAGE_READWRITE,
-                     RADEON_DOMAIN_VRAM);
+  ws->cs_add_buffer(cs, sctx->sqtt->bo, RADEON_USAGE_READWRITE,
+                    RADEON_DOMAIN_VRAM);
 
-   if (sctx->spm.bo)
-      ws->cs_add_buffer(cs,
-                        sctx->spm.bo,
-                        RADEON_USAGE_READWRITE,
-                        RADEON_DOMAIN_VRAM);
+  if (sctx->spm.bo)
+    ws->cs_add_buffer(cs, sctx->spm.bo, RADEON_USAGE_READWRITE,
+                      RADEON_DOMAIN_VRAM);
 
-   si_cp_dma_wait_for_idle(sctx, cs);
+  si_cp_dma_wait_for_idle(sctx, cs);
 
-   if (sctx->spm.bo)
-      si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
-                          sctx->screen->info.never_send_perfcounter_stop);
+  /* SPM is stopped before the idle wait and before SQTT itself is stopped. */
+  if (sctx->spm.bo)
+    si_pc_emit_spm_stop(cs, sctx->screen->info.never_stop_sq_perf_counters,
+                        sctx->screen->info.never_send_perfcounter_stop);
 
-   /* Make sure to wait-for-idle before stopping SQTT. */
-   sctx->flags |=
-      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
-   sctx->emit_cache_flush(sctx, cs);
+  /* Make sure to wait-for-idle before stopping SQTT. */
+  sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
+                 SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE |
+                 SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_L2 |
+                 SI_CONTEXT_PFP_SYNC_ME;
+  sctx->emit_cache_flush(sctx, cs);
 
-   si_emit_thread_trace_stop(sctx, cs, family);
+  si_emit_sqtt_stop(sctx, cs, family);
 
-   if (sctx->spm.bo)
-      si_pc_emit_spm_reset(cs);
+  if (sctx->spm.bo)
+    si_pc_emit_spm_reset(cs);
 
-   /* Restore previous state by disabling SQG events. */
-   si_emit_spi_config_cntl(sctx, cs, false);
+  /* Restore previous state by disabling SQG events. */
+  si_emit_spi_config_cntl(sctx, cs, false);
 
-   si_inhibit_clockgating(sctx, cs, false);
+  si_inhibit_clockgating(sctx, cs, false);
 }
 
-
-static void
-si_thread_trace_init_cs(struct si_context *sctx)
-{
-   struct radeon_winsys *ws = sctx->ws;
-
-   /* Thread trace start CS (only handles AMD_IP_GFX). */
-   sctx->thread_trace->start_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
-   if (!ws->cs_create(sctx->thread_trace->start_cs[AMD_IP_GFX],
-                      sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) {
-      free(sctx->thread_trace->start_cs[AMD_IP_GFX]);
-      sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL;
-      return;
-   }
-
-   si_thread_trace_start(sctx, AMD_IP_GFX, sctx->thread_trace->start_cs[AMD_IP_GFX]);
-
-   /* Thread trace stop CS. */
-   sctx->thread_trace->stop_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
-   if (!ws->cs_create(sctx->thread_trace->stop_cs[AMD_IP_GFX],
-                      sctx->ctx, AMD_IP_GFX, NULL, NULL, 0)) {
-      free(sctx->thread_trace->start_cs[AMD_IP_GFX]);
-      sctx->thread_trace->start_cs[AMD_IP_GFX] = NULL;
-      free(sctx->thread_trace->stop_cs[AMD_IP_GFX]);
-      sctx->thread_trace->stop_cs[AMD_IP_GFX] = NULL;
-      return;
-   }
-
-   si_thread_trace_stop(sctx, AMD_IP_GFX, sctx->thread_trace->stop_cs[AMD_IP_GFX]);
+/* Create and pre-record the SQTT start and stop command streams for the GFX
+ * queue.
+ *
+ * On any failure both sqtt->start_cs[AMD_IP_GFX] and
+ * sqtt->stop_cs[AMD_IP_GFX] are left NULL; the function itself reports
+ * nothing, so callers must inspect those pointers.
+ */
+static void si_sqtt_init_cs(struct si_context *sctx) {
+  struct radeon_winsys *ws = sctx->ws;
+
+  /* Thread trace start CS (only handles AMD_IP_GFX). */
+  sctx->sqtt->start_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
+  if (!sctx->sqtt->start_cs[AMD_IP_GFX])
+    return;
+
+  if (!ws->cs_create(sctx->sqtt->start_cs[AMD_IP_GFX], sctx->ctx, AMD_IP_GFX,
+                     NULL, NULL, 0)) {
+    free(sctx->sqtt->start_cs[AMD_IP_GFX]);
+    sctx->sqtt->start_cs[AMD_IP_GFX] = NULL;
+    return;
+  }
+
+  si_sqtt_start(sctx, AMD_IP_GFX, sctx->sqtt->start_cs[AMD_IP_GFX]);
+
+  /* Thread trace stop CS. */
+  sctx->sqtt->stop_cs[AMD_IP_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
+  if (!sctx->sqtt->stop_cs[AMD_IP_GFX] ||
+      !ws->cs_create(sctx->sqtt->stop_cs[AMD_IP_GFX], sctx->ctx, AMD_IP_GFX,
+                     NULL, NULL, 0)) {
+    /* The start CS was successfully created above, so it has to be torn
+     * down through the winsys before its storage is released; a bare free()
+     * would leak whatever cs_create() attached to it. */
+    ws->cs_destroy(sctx->sqtt->start_cs[AMD_IP_GFX]);
+    free(sctx->sqtt->start_cs[AMD_IP_GFX]);
+    sctx->sqtt->start_cs[AMD_IP_GFX] = NULL;
+    free(sctx->sqtt->stop_cs[AMD_IP_GFX]);
+    sctx->sqtt->stop_cs[AMD_IP_GFX] = NULL;
+    return;
+  }
+
+  si_sqtt_stop(sctx, AMD_IP_GFX, sctx->sqtt->stop_cs[AMD_IP_GFX]);
 }
 
-static void
-si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
-{
-   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[AMD_IP_GFX];
-   sctx->ws->cs_flush(cs, 0, NULL);
+/* Submit the pre-recorded GFX "start SQTT" command stream. The rcs argument
+ * is not used. */
+static void si_begin_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) {
+  sctx->ws->cs_flush(sctx->sqtt->start_cs[AMD_IP_GFX], 0, NULL);
 }
 
-static void
-si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
-{
-   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[AMD_IP_GFX];
-   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
+/* Submit the pre-recorded GFX "stop SQTT" command stream and store the
+ * resulting fence in sctx->last_sqtt_fence. The rcs argument is not used. */
+static void si_end_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) {
+  sctx->ws->cs_flush(sctx->sqtt->stop_cs[AMD_IP_GFX], 0,
+                     &sctx->last_sqtt_fence);
 }
 
-static bool
-si_get_thread_trace(struct si_context *sctx,
-                    struct ac_thread_trace *thread_trace)
-{
-   unsigned max_se = sctx->screen->info.max_se;
+/* Map the SQTT buffer for reading and extract the captured trace into *sqtt.
+ *
+ * sctx: context owning the SQTT buffer
+ * sqtt: output trace description; always zeroed first
+ *
+ * Returns false when the buffer cannot be mapped, or when extraction fails
+ * and at least one enabled shader engine reports an incomplete trace (a
+ * "buffer too small" diagnostic is printed in that case). Returns true
+ * otherwise.
+ */
+static bool si_get_sqtt_trace(struct si_context *sctx,
+                              struct ac_sqtt_trace *sqtt) {
+  unsigned max_se = sctx->screen->info.max_se;
 
-   memset(thread_trace, 0, sizeof(*thread_trace));
+  memset(sqtt, 0, sizeof(*sqtt));
 
-   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
-                                                          NULL,
-                                                          PIPE_MAP_READ);
+  sctx->sqtt->ptr =
+      sctx->ws->buffer_map(sctx->ws, sctx->sqtt->bo, NULL, PIPE_MAP_READ);
 
-   if (!sctx->thread_trace->ptr)
-      return false;
+  if (!sctx->sqtt->ptr)
+    return false;
 
-   if (!ac_sqtt_get_trace(sctx->thread_trace, &sctx->screen->info,
-                          thread_trace)) {
-      void *thread_trace_ptr = sctx->thread_trace->ptr;
+  if (!ac_sqtt_get_trace(sctx->sqtt, &sctx->screen->info, sqtt)) {
+    void *sqtt_ptr = sctx->sqtt->ptr;
 
-      for (unsigned se = 0; se < max_se; se++) {
-         uint64_t info_offset = ac_thread_trace_get_info_offset(se);
-         void *info_ptr = thread_trace_ptr + info_offset;
-         struct ac_thread_trace_info *info =
-            (struct ac_thread_trace_info *)info_ptr;
+    for (unsigned se = 0; se < max_se; se++) {
+      uint64_t info_offset = ac_sqtt_get_info_offset(se);
+      /* Byte-offset through uint8_t: arithmetic on void * is a GNU
+       * extension, not ISO C. */
+      void *info_ptr = (uint8_t *)sqtt_ptr + info_offset;
+      struct ac_sqtt_data_info *info = (struct ac_sqtt_data_info *)info_ptr;
 
-         if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
-            continue;
-
-         if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
-            uint32_t expected_size =
-               ac_get_expected_buffer_size(&sctx->screen->info, info);
-            uint32_t available_size = (info->cur_offset * 32) / 1024;
-
-            fprintf(stderr, "Failed to get the thread trace "
-                    "because the buffer is too small. The "
-                    "hardware needs %d KB but the "
-                    "buffer size is %d KB.\n",
-                    expected_size, available_size);
-            fprintf(stderr, "Please update the buffer size with "
-                    "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
-            return false;
-         }
+      if (ac_sqtt_se_is_disabled(&sctx->screen->info, se))
+        continue;
+
+      if (!ac_is_sqtt_complete(&sctx->screen->info, sctx->sqtt, info)) {
+        uint32_t expected_size =
+            ac_get_expected_buffer_size(&sctx->screen->info, info);
+        /* cur_offset is scaled by 32 bytes, then converted to KB. */
+        uint32_t available_size = (info->cur_offset * 32) / 1024;
+
+        /* Both sizes are unsigned, so print them with %u. */
+        fprintf(stderr,
+                "Failed to get the thread trace "
+                "because the buffer is too small. The "
+                "hardware needs %u KB but the "
+                "buffer size is %u KB.\n",
+                expected_size, available_size);
+        fprintf(stderr, "Please update the buffer size with "
+                        "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
+        return false;
       }
-   }
+    }
+  }
 
-   return true;
+  return true;
 }
 
-
-bool
-si_init_thread_trace(struct si_context *sctx)
-{
-   static bool warn_once = true;
-   if (warn_once) {
-      fprintf(stderr, "*************************************************\n");
-      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
-      fprintf(stderr, "*************************************************\n");
-      warn_once = false;
-   }
-
-   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);
-
-   if (sctx->gfx_level < GFX8) {
-      fprintf(stderr, "GPU hardware not supported: refer to "
-              "the RGP documentation for the list of "
-              "supported GPUs!\n");
-      return false;
-   }
-
-   if (sctx->gfx_level > GFX11) {
-      fprintf(stderr, "radeonsi: Thread trace is not supported "
-              "for that GPU!\n");
-      return false;
-   }
-
-   /* Default buffer size set to 32MB per SE. */
-   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
-   sctx->thread_trace->start_frame = 10;
-
-   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
-   if (trigger) {
-      sctx->thread_trace->start_frame = atoi(trigger);
-      if (sctx->thread_trace->start_frame <= 0) {
-         /* This isn't a frame number, must be a file */
-         sctx->thread_trace->trigger_file = strdup(trigger);
-         sctx->thread_trace->start_frame = -1;
-      }
-   }
-
-   if (!si_thread_trace_init_bo(sctx))
-      return false;
-
-   ac_thread_trace_init(sctx->thread_trace);
-
-   if (sctx->gfx_level >= GFX10 &&
-       debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
-      /* Limit SPM counters to GFX10 and GFX10_3 for now */
-      ASSERTED bool r = si_spm_init(sctx);
-      assert(r);
-   }
-
-   si_thread_trace_init_cs(sctx);
-
-   sctx->sqtt_next_event = EventInvalid;
-
-   return true;
+/* Allocate and initialize the SQTT (thread trace) state of a context.
+ *
+ * Reads AMD_THREAD_TRACE_BUFFER_SIZE (KB per SE, default 32 MB),
+ * AMD_THREAD_TRACE_TRIGGER (a positive frame number, otherwise treated as a
+ * trigger file path) and AMD_THREAD_TRACE_SPM from the environment.
+ *
+ * Returns true on success. Returns false when the GPU generation is not
+ * supported (sctx->sqtt is not touched), or when the state struct or its
+ * buffer cannot be allocated (sctx->sqtt is reset to NULL).
+ */
+bool si_init_sqtt(struct si_context *sctx) {
+  static bool warn_once = true;
+  if (warn_once) {
+    fprintf(stderr, "*************************************************\n");
+    fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
+    fprintf(stderr, "*************************************************\n");
+    warn_once = false;
+  }
+
+  /* Validate the hardware generation before allocating anything, so the
+   * early exits below cannot leak the state struct. */
+  if (sctx->gfx_level < GFX8) {
+    fprintf(stderr, "GPU hardware not supported: refer to "
+                    "the RGP documentation for the list of "
+                    "supported GPUs!\n");
+    return false;
+  }
+
+  if (sctx->gfx_level > GFX11) {
+    fprintf(stderr, "radeonsi: Thread trace is not supported "
+                    "for that GPU!\n");
+    return false;
+  }
+
+  sctx->sqtt = CALLOC_STRUCT(ac_sqtt);
+  if (!sctx->sqtt)
+    return false;
+
+  /* Default buffer size set to 32MB per SE. */
+  sctx->sqtt->buffer_size =
+      debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 32 * 1024) * 1024;
+  sctx->sqtt->start_frame = 10;
+
+  const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
+  if (trigger) {
+    sctx->sqtt->start_frame = atoi(trigger);
+    if (sctx->sqtt->start_frame <= 0) {
+      /* This isn't a frame number, must be a file */
+      sctx->sqtt->trigger_file = strdup(trigger);
+      sctx->sqtt->start_frame = -1;
+    }
+  }
+
+  if (!si_sqtt_init_bo(sctx)) {
+    /* Release what this function allocated so far instead of leaving a
+     * half-initialized state struct behind.
+     * NOTE(review): assumes si_sqtt_init_bo() holds no buffer reference
+     * when it fails - confirm against its definition. */
+    free(sctx->sqtt->trigger_file);
+    free(sctx->sqtt);
+    sctx->sqtt = NULL;
+    return false;
+  }
+
+  ac_sqtt_init(sctx->sqtt);
+
+  if (sctx->gfx_level >= GFX10 &&
+      debug_get_bool_option("AMD_THREAD_TRACE_SPM", sctx->gfx_level < GFX11)) {
+    /* Limit SPM counters to GFX10 and GFX10_3 for now */
+    ASSERTED bool r = si_spm_init(sctx);
+    assert(r);
+  }
+
+  /* NOTE(review): si_sqtt_init_cs() returns nothing; if it fails the start
+   * and stop command streams stay NULL and this function still succeeds. */
+  si_sqtt_init_cs(sctx);
+
+  sctx->sqtt_next_event = EventInvalid;
+
+  return true;
 }
 
-void
-si_destroy_thread_trace(struct si_context *sctx)
-{
-   struct si_screen *sscreen = sctx->screen;
-   struct pb_buffer *bo = sctx->thread_trace->bo;
-   radeon_bo_reference(sctx->screen->ws, &bo, NULL);
-
-   if (sctx->thread_trace->trigger_file)
-      free(sctx->thread_trace->trigger_file);
-
-   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[AMD_IP_GFX]);
-   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[AMD_IP_GFX]);
-
-   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
-   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
-   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;
-   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
-                            &pso_correlation->record, list) {
-      list_del(&record->list);
-      free(record);
-   }
-
-   list_for_each_entry_safe(struct rgp_loader_events_record, record,
-                            &loader_events->record, list) {
-      list_del(&record->list);
-      free(record);
-   }
-
-   list_for_each_entry_safe(struct rgp_code_object_record, record,
-             &code_object->record, list) {
-      uint32_t mask = record->shader_stages_mask;
-      int i;
-
-      /* Free the disassembly. */
-      while (mask) {
-         i = u_bit_scan(&mask);
-         free(record->shader_data[i].code);
-      }
-      list_del(&record->list);
-      free(record);
-   }
-
-   ac_thread_trace_finish(sctx->thread_trace);
-
-   hash_table_foreach(sctx->thread_trace->pipeline_bos->table, entry) {
-      struct si_sqtt_fake_pipeline *pipeline = (struct si_sqtt_fake_pipeline *)entry->data;
-      si_resource_reference(&pipeline->bo, NULL);
-      FREE(pipeline);
-   }
-
-   free(sctx->thread_trace);
-   sctx->thread_trace = NULL;
-
-   if (sctx->spm.bo)
-      si_spm_finish(sctx);
+/* Tear down the SQTT state of a context: drop the trace buffer reference,
+ * destroy the pre-recorded command streams, free every RGP record list and
+ * fake pipeline, then free the state struct and shut SPM down if it has a
+ * buffer. sctx->sqtt is NULL afterwards.
+ */
+void si_destroy_sqtt(struct si_context *sctx) {
+  struct radeon_winsys *ws = sctx->screen->ws;
+  struct pb_buffer *trace_bo = sctx->sqtt->bo;
+
+  radeon_bo_reference(ws, &trace_bo, NULL);
+
+  /* free(NULL) is a no-op, so no guard is needed. */
+  free(sctx->sqtt->trigger_file);
+
+  ws->cs_destroy(sctx->sqtt->start_cs[AMD_IP_GFX]);
+  ws->cs_destroy(sctx->sqtt->stop_cs[AMD_IP_GFX]);
+
+  /* PSO correlation records. */
+  list_for_each_entry_safe(struct rgp_pso_correlation_record, pso_rec,
+                           &sctx->sqtt->rgp_pso_correlation.record, list) {
+    list_del(&pso_rec->list);
+    free(pso_rec);
+  }
+
+  /* Loader event records. */
+  list_for_each_entry_safe(struct rgp_loader_events_record, loader_rec,
+                           &sctx->sqtt->rgp_loader_events.record, list) {
+    list_del(&loader_rec->list);
+    free(loader_rec);
+  }
+
+  /* Code object records, including the per-stage disassembly. */
+  list_for_each_entry_safe(struct rgp_code_object_record, code_rec,
+                           &sctx->sqtt->rgp_code_object.record, list) {
+    uint32_t stages = code_rec->shader_stages_mask;
+
+    while (stages) {
+      int stage = u_bit_scan(&stages);
+      free(code_rec->shader_data[stage].code);
+    }
+    list_del(&code_rec->list);
+    free(code_rec);
+  }
+
+  ac_sqtt_finish(sctx->sqtt);
+
+  /* Release the fake pipelines stored in the pipeline BO table. */
+  hash_table_foreach(sctx->sqtt->pipeline_bos->table, entry) {
+    struct si_sqtt_fake_pipeline *fake_pipeline =
+        (struct si_sqtt_fake_pipeline *)entry->data;
+    si_resource_reference(&fake_pipeline->bo, NULL);
+    FREE(fake_pipeline);
+  }
+
+  free(sctx->sqtt);
+  sctx->sqtt = NULL;
+
+  if (sctx->spm.bo) {
+    si_spm_finish(sctx);
+  }
 }
 
 static uint64_t num_frames = 0;
 
-void
-si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
-{
-   /* Should we enable SQTT yet? */
-   if (!sctx->thread_trace_enabled) {
-      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
-      bool file_trigger = false;
-      if (sctx->thread_trace->trigger_file &&
-          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
-         if (unlink(sctx->thread_trace->trigger_file) == 0) {
-            file_trigger = true;
-         } else {
-            /* Do not enable tracing if we cannot remove the file,
-             * because by then we'll trace every frame.
-             */
-            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
-         }
+void si_handle_sqtt(struct si_context *sctx, struct radeon_cmdbuf *rcs) {
+  /* Should we enable SQTT yet? */
+  if (!sctx->sqtt_enabled) {
+    bool frame_trigger = num_frames == sctx->sqtt->start_frame;
+    bool file_trigger = false;
+    if (sctx->sqtt->trigger_file &&
+        access(sctx->sqtt->trigger_file, W_OK) == 0) {
+      if (unlink(sctx->sqtt->trigger_file) == 0) {
+        file_trigger = true;
+      } else {
+        /* Do not enable tracing if we cannot remove the file,
+         * because by then we'll trace every frame.
+         */
+        fprintf(
+            stderr,
+            "radeonsi: could not remove thread trace trigger file, ignoring\n");
       }
+    }
 
-      if (frame_trigger || file_trigger) {
-         /* Wait for last submission */
-         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);
+    if (frame_trigger || file_trigger) {
+      /* Wait for last submission */
+      sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence,
+                           PIPE_TIMEOUT_INFINITE);
 
-         /* Start SQTT */
-         si_begin_thread_trace(sctx, rcs);
+      /* Start SQTT */
+      si_begin_sqtt(sctx, rcs);
 
-         sctx->thread_trace_enabled = true;
-         sctx->thread_trace->start_frame = -1;
+      sctx->sqtt_enabled = true;
+      sctx->sqtt->start_frame = -1;
 
-         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called
-          * for the current "pipeline".
-          */
-         sctx->do_update_shaders = true;
-      }
-   } else {
-      struct ac_thread_trace thread_trace = {0};
-
-      /* Stop SQTT */
-      si_end_thread_trace(sctx, rcs);
-      sctx->thread_trace_enabled = false;
-      sctx->thread_trace->start_frame = -1;
-      assert (sctx->last_sqtt_fence);
-
-      /* Wait for SQTT to finish and read back the bo */
-      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
-          si_get_thread_trace(sctx, &thread_trace)) {
-         struct ac_spm_trace spm_trace;
-
-         /* Map the SPM counter buffer */
-         if (sctx->spm.bo) {
-            sctx->spm.ptr = sctx->ws->buffer_map(sctx->ws, sctx->spm.bo,
-                                                       NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
-            ac_spm_get_trace(&sctx->spm, &spm_trace);
-         }
-
-         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace, sctx->spm.bo ? &spm_trace : NULL);
-
-         if (sctx->spm.ptr)
-            sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
-      } else {
-         fprintf(stderr, "Failed to read the trace\n");
+      /* Force shader update to make sure si_sqtt_describe_pipeline_bind is
+       * called for the current "pipeline".
+       */
+      sctx->do_update_shaders = true;
+    }
+  } else {
+    struct ac_sqtt_trace sqtt_trace = {0};
+
+    /* Stop SQTT */
+    si_end_sqtt(sctx, rcs);
+    sctx->sqtt_enabled = false;
+    sctx->sqtt->start_frame = -1;
+    assert(sctx->last_sqtt_fence);
+
+    /* Wait for SQTT to finish and read back the bo */
+    if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence,
+                             PIPE_TIMEOUT_INFINITE) &&
+        si_get_sqtt_trace(sctx, &sqtt_trace)) {
+      struct ac_spm_trace spm_trace;
+
+      /* Map the SPM counter buffer */
+      if (sctx->spm.bo) {
+        sctx->spm.ptr = sctx->ws->buffer_map(
+            sctx->ws, sctx->spm.bo, NULL, PIPE_MAP_READ | RADEON_MAP_TEMPORARY);
+        ac_spm_get_trace(&sctx->spm, &spm_trace);
       }
-   }
 
-   num_frames++;
-}
+      ac_dump_rgp_capture(&sctx->screen->info, &sqtt_trace,
+                          sctx->spm.bo ? &spm_trace : NULL);
 
+      if (sctx->spm.ptr)
+        sctx->ws->buffer_unmap(sctx->ws, sctx->spm.bo);
+    } else {
+      fprintf(stderr, "Failed to read the trace\n");
+    }
+  }
 
-static void
-si_emit_thread_trace_userdata(struct si_context* sctx,
-                              struct radeon_cmdbuf *cs,
-                              const void *data, uint32_t num_dwords)
-{
-   const uint32_t *dwords = (uint32_t *)data;
+  num_frames++;
+}
 
-   radeon_begin(cs);
+static void si_emit_sqtt_userdata(struct si_context *sctx,
+                                  struct radeon_cmdbuf *cs, const void *data,
+                                  uint32_t num_dwords) {
+  const uint32_t *dwords = (uint32_t *)data;
 
-   while (num_dwords > 0) {
-      uint32_t count = MIN2(num_dwords, 2);
+  radeon_begin(cs);
 
-      /* Without the perfctr bit the CP might not always pass the
-       * write on correctly. */
-      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->gfx_level >= GFX10);
+  while (num_dwords > 0) {
+    uint32_t count = MIN2(num_dwords, 2);
 
-      radeon_emit_array(dwords, count);
+    /* Without the perfctr bit the CP might not always pass the
+     * write on correctly. */
+    radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count,
+                               sctx->gfx_level >= GFX10);
 
-      dwords += count;
-      num_dwords -= count;
-   }
-   radeon_end();
+    radeon_emit_array(dwords, count);
+
+    dwords += count;
+    num_dwords -= count;
+  }
+  radeon_end();
 }
 
 static void
@@ -913,7 +872,7 @@ si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
    marker.instance_offset_reg_idx = instance_offset_user_data;
    marker.draw_index_reg_idx = draw_index_user_data;
 
-   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
+   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
 
    sctx->sqtt_next_event = EventInvalid;
 }
@@ -935,7 +894,7 @@ si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *r
    marker.thread_y = y;
    marker.thread_z = z;
 
-   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
+   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
    sctx->sqtt_next_event = EventInvalid;
 }
 
@@ -948,7 +907,7 @@ si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rc
    marker.cb_id = 0;
    marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */
 
-   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
+   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
 }
 
 void
@@ -988,7 +947,7 @@ si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs,
       marker.flush_db = true;
    }
 
-   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
+   si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
 }
 
 void
@@ -1002,7 +961,7 @@ si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,
       marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
       marker.data_type = type;
 
-      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
+      si_emit_sqtt_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
    } else {
       assert (str != NULL);
       struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
@@ -1016,31 +975,26 @@ si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,
       memcpy(buffer + sizeof(marker), str, len);
       buffer[sizeof(marker) + len - 1] = '\0';
 
-      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
+      si_emit_sqtt_userdata(sctx, rcs, buffer,
+                            sizeof(marker) / 4 + marker.length / 4);
    }
 }
 
-
-bool
-si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
-                               uint64_t pipeline_hash)
-{
-   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
+bool si_sqtt_pipeline_is_registered(struct ac_sqtt *sqtt,
+                                    uint64_t pipeline_hash) {
+   simple_mtx_lock(&sqtt->rgp_pso_correlation.lock);
    list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
-             &thread_trace_data->rgp_pso_correlation.record, list) {
+                            &sqtt->rgp_pso_correlation.record, list) {
       if (record->pipeline_hash[0] == pipeline_hash) {
-         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
+         simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
          return true;
       }
-
    }
-   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
+   simple_mtx_unlock(&sqtt->rgp_pso_correlation.lock);
 
    return false;
 }
 
-
-
 static enum rgp_hardware_stages
 si_sqtt_pipe_to_rgp_shader_stage(union si_shader_key* key, enum pipe_shader_type stage)
 {
@@ -1079,8 +1033,7 @@ si_sqtt_add_code_object(struct si_context* sctx,
                         struct si_sqtt_fake_pipeline *pipeline,
                         bool is_compute)
 {
-   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
+   struct rgp_code_object *code_object = &sctx->sqtt->rgp_code_object;
    struct rgp_code_object_record *record;
 
    record = malloc(sizeof(struct rgp_code_object_record));
@@ -1147,15 +1100,14 @@ si_sqtt_add_code_object(struct si_context* sctx,
 bool
 si_sqtt_register_pipeline(struct si_context* sctx, struct si_sqtt_fake_pipeline *pipeline, bool is_compute)
 {
-   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-
-   assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline->code_hash));
+   assert(!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline->code_hash));
 
-   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline->code_hash);
+   bool result = ac_sqtt_add_pso_correlation(sctx->sqtt, pipeline->code_hash);
    if (!result)
       return false;
 
-   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline->code_hash, pipeline->bo->gpu_address);
+   result = ac_sqtt_add_code_object_loader_event(
+       sctx->sqtt, pipeline->code_hash, pipeline->bo->gpu_address);
    if (!result)
       return false;
 
@@ -1170,7 +1122,7 @@ si_sqtt_describe_pipeline_bind(struct si_context* sctx,
    struct rgp_sqtt_marker_pipeline_bind marker = {0};
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
 
-   if (likely(!sctx->thread_trace_enabled)) {
+   if (likely(!sctx->sqtt_enabled)) {
       return;
    }
 
@@ -1180,5 +1132,5 @@ si_sqtt_describe_pipeline_bind(struct si_context* sctx,
    marker.api_pso_hash[0] = pipeline_hash;
    marker.api_pso_hash[1] = pipeline_hash >> 32;
 
-   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
+   si_emit_sqtt_userdata(sctx, cs, &marker, sizeof(marker) / 4);
 }
index 8bd203a..8284542 100644 (file)
@@ -303,7 +303,7 @@ static bool si_update_shaders(struct si_context *sctx)
          si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
    }
 
-   if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace)) {
+   if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt)) {
       /* Pretend the bound shaders form a vk pipeline. Include the scratch size in
        * the hash calculation to force re-emitting the pipeline if the scratch bo
        * changes.
@@ -326,8 +326,7 @@ static bool si_update_shaders(struct si_context *sctx)
       }
 
       struct si_sqtt_fake_pipeline *pipeline = NULL;
-      struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
-      if (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_code_hash)) {
+      if (!si_sqtt_pipeline_is_registered(sctx->sqtt, pipeline_code_hash)) {
          /* This is a new pipeline. Allocate a new bo to hold all the shaders. Without
           * this, shader code export process creates huge rgp files because RGP assumes
           * the shaders live sequentially in memory (shader N address = shader 0 + offset N)
@@ -387,7 +386,7 @@ static bool si_update_shaders(struct si_context *sctx)
             }
             sctx->screen->ws->buffer_unmap(sctx->screen->ws, bo->buf);
 
-            _mesa_hash_table_u64_insert(sctx->thread_trace->pipeline_bos,
+            _mesa_hash_table_u64_insert(sctx->sqtt->pipeline_bos,
                                         pipeline_code_hash, pipeline);
 
             si_sqtt_register_pipeline(sctx, pipeline, false);
@@ -396,8 +395,8 @@ static bool si_update_shaders(struct si_context *sctx)
                si_resource_reference(&bo, NULL);
          }
       } else {
-         pipeline = (struct si_sqtt_fake_pipeline *)
-            _mesa_hash_table_u64_search(sctx->thread_trace->pipeline_bos, pipeline_code_hash);
+         pipeline = (struct si_sqtt_fake_pipeline *)_mesa_hash_table_u64_search(
+             sctx->sqtt->pipeline_bos, pipeline_code_hash);
       }
       assert(pipeline);
 
@@ -1389,15 +1388,15 @@ static void si_emit_draw_registers(struct si_context *sctx,
    radeon_end();
 }
 
-#define EMIT_SQTT_END_DRAW do {                                          \
-      if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \
-         radeon_begin(&sctx->gfx_cs);                                    \
-         radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));       \
-         radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) |          \
-                     EVENT_INDEX(0));                                    \
-         radeon_end();                                      \
-      }                                                                  \
-   } while (0)
+#define EMIT_SQTT_END_DRAW                                                     \
+  do {                                                                         \
+    if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) {                 \
+      radeon_begin(&sctx->gfx_cs);                                             \
+      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));                               \
+      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));  \
+      radeon_end();                                                            \
+    }                                                                          \
+  } while (0)
 
 template <amd_gfx_level GFX_VERSION, si_has_ngg NGG, si_is_draw_vertex_state IS_DRAW_VERTEX_STATE>
 ALWAYS_INLINE
@@ -1411,7 +1410,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
 
-   if (unlikely(sctx->thread_trace_enabled)) {
+   if (unlikely(sctx->sqtt_enabled)) {
       si_sqtt_write_event_marker(sctx, &sctx->gfx_cs, sctx->sqtt_next_event,
                                  UINT_MAX, UINT_MAX, UINT_MAX);
    }