radeonsi: Add tracepoints in radeonsi driver
author Saroj Kumar <saroj.kumar@amd.com>
Mon, 17 Jul 2023 15:33:02 +0000 (21:03 +0530)
committer Marge Bot <emma+marge@anholt.net>
Thu, 19 Oct 2023 16:16:16 +0000 (16:16 +0000)
Add initialization code for u_trace and tracepoints in the
driver code.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23664>

src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_fence.c
src/gallium/drivers/radeonsi/si_gfx_cs.c
src/gallium/drivers/radeonsi/si_perfetto.cpp
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_state_draw.cpp
src/gallium/drivers/radeonsi/si_utrace.c
src/tool/pps/cfg/amd.cfg [new file with mode: 0644]
src/tool/pps/cfg/system.cfg

index 642acb7..c84d6e5 100644 (file)
@@ -12,6 +12,7 @@
 #include "util/u_async_debug.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
+#include "si_tracepoints.h"
 
 #define COMPUTE_DBG(sscreen, fmt, args...)                                                         \
    do {                                                                                            \
@@ -996,7 +997,10 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
                          NULL);
       }
    }
-
+   
+   if (u_trace_perfetto_active(&sctx->ds.trace_context))
+      trace_si_begin_compute(&sctx->trace);
+   
    if (sctx->bo_list_add_all_compute_resources)
       si_compute_resources_add_all_to_bo_list(sctx);
 
@@ -1064,6 +1068,9 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
    sctx->compute_is_busy = true;
    sctx->num_compute_calls++;
 
+   if (u_trace_perfetto_active(&sctx->ds.trace_context))
+      trace_si_end_compute(&sctx->trace, info->grid[0], info->grid[1], info->grid[2]);
+   
    if (cs_regalloc_hang) {
       sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
index 7dadb77..5f55f94 100644 (file)
@@ -473,6 +473,10 @@ static void si_flush_all_queues(struct pipe_context *ctx,
       if (unlikely(sctx->sqtt && (flags & PIPE_FLUSH_END_OF_FRAME))) {
          si_handle_sqtt(sctx, &sctx->gfx_cs);
       }
+      
+      if (u_trace_perfetto_active(&sctx->ds.trace_context)) {
+         u_trace_context_process(&sctx->ds.trace_context, flags & PIPE_FLUSH_END_OF_FRAME);
+      }
    } else {
       /* Instead of flushing, create a deferred fence. Constraints:
        * - the gallium frontend must allow a deferred flush.
index 811d871..567b8d1 100644 (file)
@@ -12,6 +12,7 @@
 #include "util/u_log.h"
 #include "util/u_upload_mgr.h"
 #include "ac_debug.h"
+#include "si_utrace.h"
 
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
 {
@@ -129,9 +130,19 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    if (ctx->is_noop)
       flags |= RADEON_FLUSH_NOOP;
 
+   uint64_t start_ts = 0, submission_id = 0;
+   if (u_trace_perfetto_active(&ctx->ds.trace_context)) {
+      start_ts = si_ds_begin_submit(&ctx->ds_queue);
+      submission_id = ctx->ds_queue.submission_id;
+   }
+
    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
 
+   if (u_trace_perfetto_active(&ctx->ds.trace_context) && start_ts > 0) {
+      si_ds_end_submit(&ctx->ds_queue, start_ts);
+   }
+
    tc_driver_internal_flush_notify(ctx->tc);
    if (fence)
       ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -155,6 +166,9 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
    if (ctx->current_saved_cs)
       si_saved_cs_reference(&ctx->current_saved_cs, NULL);
 
+   if (u_trace_perfetto_active(&ctx->ds.trace_context))
+      si_utrace_flush(ctx, submission_id);
+
    si_begin_new_gfx_cs(ctx, false);
    ctx->gfx_flush_in_progress = false;
 }
@@ -352,6 +366,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
 {
    bool is_secure = false;
 
+   if (!first_cs)
+      u_trace_fini(&ctx->trace);
+
    if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
       is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);
 
@@ -566,6 +583,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
    assert(!ctx->gfx_cs.prev_dw);
    ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw;
 
+   u_trace_init(&ctx->trace, &ctx->ds.trace_context);
    /* All buffer references are removed on a flush, so si_check_needs_implicit_sync
     * cannot determine if si_make_CB_shader_coherent() needs to be called.
     * ctx->force_cb_shader_coherent will be cleared by the first call to
@@ -596,7 +614,7 @@ void si_emit_ts(struct si_context *sctx, struct si_resource* buffer, unsigned in
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    uint64_t va = buffer->gpu_address + offset;
    si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
-                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, PIPE_QUERY_TIMESTAMP);
+                        EOP_DATA_SEL_TIMESTAMP, buffer, va, 0, PIPE_QUERY_TIMESTAMP);
 }
 
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
index e5c1e8c..56a9b31 100644 (file)
@@ -58,7 +58,8 @@ struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
    using IncrementalStateType = SIRenderpassIncrementalState;
 };
 
-class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits> {
+class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource, 
+                                                               SIRenderpassTraits> {
 };
 
 PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
@@ -84,10 +85,12 @@ static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_
 
    device->sync_gpu_ts = gpu_ts;
    device->next_clock_sync_ns = cpu_ts + 1000000000ull;
-   MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
+   MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::
+      EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
 }
 
-static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
+static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, 
+                             struct si_ds_device *device)
 {
    PERFETTO_LOG("Sending renderstage descriptors");
 
@@ -131,7 +134,8 @@ static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct s
                 * by si_ds_queue_stage.
                 */
                char name[100];
-               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(), queue->name, s, si_queue_stage_desc[s].name);
+               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(), 
+                        queue->name, s, si_queue_stage_desc[s].name);
 
                auto desc = interned_data->add_gpu_specifications();
                desc->set_iid(queue->stages[s].queue_iid);
@@ -150,7 +154,8 @@ static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct s
    sync_timestamp(ctx, device);
 }
 
-typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
+typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, 
+                                            const void*);
 
 static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
 {
@@ -172,7 +177,9 @@ static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_qu
    queue->stages[stage_id].level++;
 }
 
-static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id, uint32_t submission_id, const char *app_event, const void* payload = nullptr, trace_payload_as_extra_func payload_as_extra = nullptr)
+static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id,
+                      uint32_t submission_id, const char *app_event, const void* payload = nullptr,
+                      trace_payload_as_extra_func payload_as_extra = nullptr)
 {
    PERFETTO_LOG("end event called - ts_ns=%lu", ts_ns);
    struct si_ds_device *device = queue->device;
@@ -208,7 +215,9 @@ static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queu
        * stage_iid if not already seen. Otherwise, it's a driver event and we
        * have use the internal stage_iid.
        */
-      uint64_t stage_iid = app_event ? tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) : stage->stage_iid;
+      uint64_t stage_iid = app_event ? 
+                           tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) : 
+                           stage->stage_iid;
 
       auto packet = tctx.NewTracePacket();
 
@@ -340,7 +349,8 @@ void si_driver_ds_init(void)
    si_gpu_tracepoint_config_variable();
 }
 
-void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo, uint32_t gpu_id, enum amd_ds_api api)
+void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
+                       uint32_t gpu_id, enum amd_ds_api api)
 {
    device->gpu_id = gpu_id;
    device->gpu_clock_id = si_pps_clock_id(gpu_id);
@@ -355,7 +365,9 @@ void si_ds_device_fini(struct si_ds_device *device)
    u_trace_context_fini(&device->trace_context);
 }
 
-struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue, const char *fmt_name, ...)
+struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device, 
+                                             struct si_ds_queue *queue, 
+                                             const char *fmt_name, ...)
 {
    va_list ap;
    queue->device = device;
@@ -374,7 +386,8 @@ struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device, struct
    return queue;
 }
 
-void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue, uint64_t submission_id)
+void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue, 
+                           uint64_t submission_id)
 {
    memset(data, 0, sizeof(*data));
 
index 6417e6a..2c6aac9 100644 (file)
@@ -24,6 +24,7 @@
 #include "util/u_upload_mgr.h"
 #include "util/xmlconfig.h"
 #include "vl/vl_decoder.h"
+#include "si_utrace.h"
 
 #include <xf86drm.h>
 
@@ -204,6 +205,8 @@ static void si_destroy_context(struct pipe_context *context)
       si_destroy_sqtt(sctx);
    }
 
+   si_utrace_fini(sctx);
+
    pipe_resource_reference(&sctx->esgs_ring, NULL);
    pipe_resource_reference(&sctx->gsvs_ring, NULL);
    pipe_resource_reference(&sctx->tess_rings, NULL);
@@ -779,6 +782,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
       sctx->shader.gs.key.ge.opt.prefer_mono = 1;
    }
 
+   si_utrace_init(sctx);
+
    si_begin_new_gfx_cs(sctx, true);
    assert(sctx->gfx_cs.current.cdw == sctx->initial_gfx_cs_size);
 
@@ -850,6 +855,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    }
 
    sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw;
+   sctx->last_timestamp_cmd = NULL;
 
    sctx->cs_blit_shaders = _mesa_hash_table_create_u32_keys(NULL);
    if (!sctx->cs_blit_shaders)
@@ -1522,6 +1528,8 @@ struct pipe_screen *radeonsi_screen_create(int fd, const struct pipe_screen_conf
       break;
    }
 
+   si_driver_ds_init();
+
    drmFreeVersion(version);
    return rw ? rw->screen : NULL;
 }
index c862ff9..5776999 100644 (file)
@@ -14,6 +14,8 @@
 #include "util/u_prim.h"
 #include "util/u_upload_mgr.h"
 #include "ac_rtld.h"
+#include "si_build_pm4.h"
+#include "si_tracepoints.h"
 
 #if (GFX_VER == 6)
 #define GFX(name) name##GFX6
@@ -1985,6 +1987,9 @@ static void si_draw(struct pipe_context *ctx,
 
    si_need_gfx_cs_space(sctx, num_draws);
 
+   if (u_trace_perfetto_active(&sctx->ds.trace_context))
+      trace_si_begin_draw(&sctx->trace);
+   
    unsigned instance_count = info->instance_count;
 
    /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
@@ -2296,6 +2301,10 @@ static void si_draw(struct pipe_context *ctx,
       zstex->depth_cleared_level_mask &= ~BITFIELD_BIT(sctx->framebuffer.state.zsbuf->u.tex.level);
    }
 
+   if (u_trace_perfetto_active(&sctx->ds.trace_context)) {
+      trace_si_end_draw(&sctx->trace, total_direct_count);
+   }
+
    DRAW_CLEANUP;
 }
 
index 9e1a1de..95d7cfa 100644 (file)
 #include "util/hash_table.h"
 
 
-static void si_utrace_record_ts(struct u_trace *trace, void *cs, void *timestamps, unsigned idx, bool end_of_pipe)
+static void si_utrace_record_ts(struct u_trace *trace, void *cs, void *timestamps, 
+                                unsigned idx, bool end_of_pipe)
 {
    struct si_context *ctx = container_of(trace, struct si_context, trace);
    struct pipe_resource *buffer = timestamps;
    struct si_resource *ts_bo = si_resource(buffer);
 
-   if (ctx->gfx_cs.current.buf == ctx->last_timestamp_cmd && ctx->gfx_cs.current.cdw == ctx->last_timestamp_cmd_cdw) {
+   if (ctx->gfx_cs.current.buf == ctx->last_timestamp_cmd && 
+       ctx->gfx_cs.current.cdw == ctx->last_timestamp_cmd_cdw) {
       uint64_t *ts = si_buffer_map(ctx, ts_bo, PIPE_MAP_READ);
       ts[idx] = U_TRACE_NO_TIMESTAMP;
       return;
@@ -31,7 +33,8 @@ static void si_utrace_record_ts(struct u_trace *trace, void *cs, void *timestamp
    ctx->last_timestamp_cmd_cdw = ctx->gfx_cs.current.cdw;
 }
 
-static uint64_t si_utrace_read_ts(struct u_trace_context *utctx, void *timestamps, unsigned idx, void *flush_data)
+static uint64_t si_utrace_read_ts(struct u_trace_context *utctx, void *timestamps, 
+                                  unsigned idx, void *flush_data)
 {
    struct si_context *ctx = container_of(utctx, struct si_context, ds.trace_context);
    struct pipe_resource *buffer = timestamps;
diff --git a/src/tool/pps/cfg/amd.cfg b/src/tool/pps/cfg/amd.cfg
new file mode 100644 (file)
index 0000000..9ba4fd7
--- /dev/null
@@ -0,0 +1,25 @@
+buffers {
+  size_kb: 16384
+  fill_policy: RING_BUFFER
+}
+
+data_sources {
+  config {
+    name: "gpu.renderstages.amd"
+  }
+}
+
+data_sources {
+  config {
+    name: "track_event"
+    track_event_config {
+      enabled_categories: "mesa.default"
+      enabled_categories: "mesa.slow"
+    }
+  }
+}
+
+duration_ms: 2000
+write_into_file: true
+file_write_period_ms: 500
+flush_period_ms: 500
index f875c7f..f48f5f9 100644 (file)
@@ -35,6 +35,12 @@ data_sources {
 
 data_sources {
   config {
+    name: "gpu.renderstages.amd"
+  }
+}
+
+data_sources {
+  config {
     name: "track_event"
     track_event_config {
       #enabled_tags: "slow"