--- /dev/null
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "util/hash_table.h"
+#include "util/u_process.h"
+#include "util/hash_table.h"
+
+#include "si_pipe.h"
+#include "si_perfetto.h"
+#include "si_tracepoints.h"
+
+#ifdef HAVE_PERFETTO
+
+#include "util/perf/u_perfetto.h"
+#include "util/perf/u_perfetto_renderpass.h"
+
+#include "si_tracepoints_perfetto.h"
+
+/* Just naming stages */
+static const struct {
+ const char *name;
+
+ /* The perfetto UI requires that there is a parent-child relationship
+ * within a row of elements. Which means that all children elements must
+ * end within the lifespan of their parent.
+ *
+ * Some elements like stalls and command buffers follow that relationship,
+ * but not all. This tells us in which UI row the elements should live.
+ */
+ enum si_ds_queue_stage draw_stage;
+} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
+ /* Order must match the enum! */
+ {
+ "queue",
+ SI_DS_QUEUE_STAGE_QUEUE,
+ },
+ {
+ "compute",
+ SI_DS_QUEUE_STAGE_COMPUTE,
+ },
+ {
+ "draw",
+ SI_DS_QUEUE_STAGE_DRAW,
+ }
+};
+
+struct SIRenderpassIncrementalState {
+ bool was_cleared = true;
+};
+
+struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
+ using IncrementalStateType = SIRenderpassIncrementalState;
+};
+
+class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits> {
+};
+
+PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
+PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
+
+using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
+
+static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
+{
+ uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
+ uint64_t gpu_ts;
+
+ struct si_context *sctx = container_of(device, struct si_context, ds);
+ gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
+
+
+ cpu_ts = perfetto::base::GetBootTimeNs().count();
+
+ if (cpu_ts < device->next_clock_sync_ns)
+ return;
+
+ PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
+
+ device->sync_gpu_ts = gpu_ts;
+ device->next_clock_sync_ns = cpu_ts + 1000000000ull;
+ MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
+}
+
+static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
+{
+ PERFETTO_LOG("Sending renderstage descriptors");
+
+ device->event_id = 0;
+ list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
+ for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
+ queue->stages[s].start_ns[0] = 0;
+ }
+ }
+
+ {
+ auto packet = ctx.NewTracePacket();
+
+ packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
+ packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
+ packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
+
+ auto interned_data = packet->set_interned_data();
+
+ {
+ auto desc = interned_data->add_graphics_contexts();
+ desc->set_iid(device->iid);
+ desc->set_pid(getpid());
+ switch (device->api) {
+ case AMD_DS_API_OPENGL:
+ desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
+ break;
+ case AMD_DS_API_VULKAN:
+ desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* Emit all the IID picked at device/queue creation. */
+ list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
+ for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
+ {
+ /* We put the stage number in there so that all rows are order
+ * by si_ds_queue_stage.
+ */
+ char name[100];
+ snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(), queue->name, s, si_queue_stage_desc[s].name);
+
+ auto desc = interned_data->add_gpu_specifications();
+ desc->set_iid(queue->stages[s].queue_iid);
+ desc->set_name(name);
+ }
+ {
+ auto desc = interned_data->add_gpu_specifications();
+ desc->set_iid(queue->stages[s].stage_iid);
+ desc->set_name(si_queue_stage_desc[s].name);
+ }
+ }
+ }
+ }
+
+ device->next_clock_sync_ns = 0;
+ sync_timestamp(ctx, device);
+}
+
+typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
+
+static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
+{
+ PERFETTO_LOG("begin event called - ts_ns=%lu", ts_ns);
+ uint32_t level = queue->stages[stage_id].level;
+ /* If we haven't managed to calibrate the alignment between GPU and CPU
+ * timestamps yet, then skip this trace, otherwise perfetto won't know
+ * what to do with it.
+ */
+ if (!queue->device->sync_gpu_ts) {
+ queue->stages[stage_id].start_ns[level] = 0;
+ return;
+ }
+
+ if (level >= (ARRAY_SIZE(queue->stages[stage_id].start_ns) - 1))
+ return;
+
+ queue->stages[stage_id].start_ns[level] = ts_ns;
+ queue->stages[stage_id].level++;
+}
+
+static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id, uint32_t submission_id, const char *app_event, const void* payload = nullptr, trace_payload_as_extra_func payload_as_extra = nullptr)
+{
+ PERFETTO_LOG("end event called - ts_ns=%lu", ts_ns);
+ struct si_ds_device *device = queue->device;
+
+ /* If we haven't managed to calibrate the alignment between GPU and CPU
+ * timestamps yet, then skip this trace, otherwise perfetto won't know
+ * what to do with it.
+ */
+ if (!device->sync_gpu_ts)
+ return;
+
+ if (queue->stages[stage_id].level == 0)
+ return;
+
+ uint32_t level = --queue->stages[stage_id].level;
+ struct si_ds_stage *stage = &queue->stages[stage_id];
+ uint64_t start_ns = stage->start_ns[level];
+ PERFETTO_LOG("end event called - start_ns=%lu ts_ns=%lu", start_ns, ts_ns);
+ if (!start_ns || start_ns > ts_ns)
+ return;
+
+ SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
+ if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
+ send_descriptors(tctx, queue->device);
+ state->was_cleared = false;
+ }
+
+ sync_timestamp(tctx, queue->device);
+
+ uint64_t evt_id = device->event_id++;
+
+ /* If this is an application event, we might need to generate a new
+ * stage_iid if not already seen. Otherwise, it's a driver event and we
+ * have use the internal stage_iid.
+ */
+ uint64_t stage_iid = app_event ? tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) : stage->stage_iid;
+
+ auto packet = tctx.NewTracePacket();
+
+ packet->set_timestamp(start_ns);
+ packet->set_timestamp_clock_id(queue->device->gpu_clock_id);
+
+ assert(ts_ns >= start_ns);
+
+ auto event = packet->set_gpu_render_stage_event();
+ event->set_gpu_id(queue->device->gpu_id);
+
+ event->set_hw_queue_iid(stage->queue_iid);
+ event->set_stage_iid(stage_iid);
+ event->set_context(queue->device->iid);
+ event->set_event_id(evt_id);
+ event->set_duration(ts_ns - start_ns);
+ event->set_submission_id(submission_id);
+
+ if (payload && payload_as_extra) {
+ payload_as_extra(event, payload);
+ }
+ });
+
+ stage->start_ns[level] = 0;
+}
+
+#endif /* HAVE_PERFETTO */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_PERFETTO
+
+/*
+ * Trace callbacks, called from u_trace once the timestamps from GPU have been
+ * collected.
+ */
+
+#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage) \
+void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
+ const void *flush_data, \
+ const struct trace_si_begin_##event_name *payload) \
+{ \
+ const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
+ begin_event(flush->queue, ts_ns, stage); \
+} \
+ \
+void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx, \
+ const void *flush_data, \
+ const struct trace_si_end_##event_name *payload) \
+{ \
+ const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data; \
+ end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload, \
+ (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name); \
+} \
+
+CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
+CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
+
+uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
+{
+ return perfetto::base::GetBootTimeNs().count();
+}
+
+void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
+{
+ if (!u_trace_should_process(&queue->device->trace_context)) {
+ queue->device->sync_gpu_ts = 0;
+ queue->device->next_clock_sync_ns = 0;
+ return;
+ }
+
+ uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
+ uint32_t submission_id = queue->submission_id++;
+
+ SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
+ if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
+ send_descriptors(tctx, queue->device);
+ state->was_cleared = false;
+ }
+
+ sync_timestamp(tctx, queue->device);
+
+ auto packet = tctx.NewTracePacket();
+
+ packet->set_timestamp(start_ts);
+
+ auto event = packet->set_vulkan_api_event();
+ auto submit = event->set_vk_queue_submit();
+
+ submit->set_duration_ns(end_ts - start_ts);
+ submit->set_vk_queue((uintptr_t) queue);
+ submit->set_submission_id(submission_id);
+ });
+}
+
+#endif /* HAVE_PERFETTO */
+
+static void si_driver_ds_init_once(void)
+{
+#ifdef HAVE_PERFETTO
+ util_perfetto_init();
+ perfetto::DataSourceDescriptor dsd;
+ dsd.set_name("gpu.renderstages.amd");
+ SIRenderpassDataSource::Register(dsd);
+#endif
+}
+
+static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
+static uint64_t iid = 1;
+
+static uint64_t get_iid()
+{
+ return iid++;
+}
+
+static uint32_t si_pps_clock_id(uint32_t gpu_id)
+{
+ char buf[40];
+ snprintf(buf, sizeof(buf), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
+
+ return _mesa_hash_string(buf) | 0x80000000;
+}
+
+void si_driver_ds_init(void)
+{
+ call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
+ si_gpu_tracepoint_config_variable();
+}
+
+void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo, uint32_t gpu_id, enum amd_ds_api api)
+{
+ device->gpu_id = gpu_id;
+ device->gpu_clock_id = si_pps_clock_id(gpu_id);
+ device->info = devinfo;
+ device->iid = get_iid();
+ device->api = api;
+ list_inithead(&device->queues);
+}
+
+void si_ds_device_fini(struct si_ds_device *device)
+{
+ u_trace_context_fini(&device->trace_context);
+}
+
+struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue, const char *fmt_name, ...)
+{
+ va_list ap;
+ queue->device = device;
+
+ va_start(ap, fmt_name);
+ vsnprintf(queue->name, sizeof(queue->name), fmt_name, ap);
+ va_end(ap);
+
+ for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
+ queue->stages[s].queue_iid = get_iid();
+ queue->stages[s].stage_iid = get_iid();
+ }
+
+ list_add(&queue->link, &device->queues);
+
+ return queue;
+}
+
+void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue, uint64_t submission_id)
+{
+ memset(data, 0, sizeof(*data));
+
+ data->queue = queue;
+ data->submission_id = submission_id;
+
+ u_trace_init(&data->trace, &queue->device->trace_context);
+}
+
+void si_ds_flush_data_fini(struct si_ds_flush_data *data)
+{
+ u_trace_fini(&data->trace);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- /dev/null
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef SI_PERFETTO_H
+#define SI_PERFETTO_H
+
+#include <stdint.h>
+
+#include "util/macros.h"
+#include "util/perf/u_trace.h"
+#include "util/u_vector.h"
+
+#include "amd/common/ac_gpu_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Perfetto collects TracePackets from the application and/or drivers. It is the root object of a
+ * Perfetto trace. A Perfetto trace is a linear sequence of TracePackets.
+ * TracePackets contains timestamp and timestamp_clock_id along with lots of other data
+ * like gpu_counter_event and gpu_render_stage_event.
+ * gpu_render_stage_event contains data such as event_id, duration, gpu_id, stage_iid, context etc.
+ * So a render stage can be named as "draw" which will collect start timestamp and end timestamp
+ * along with other payload data of each draw call from OpenGL
+ */
+
+enum amd_ds_api {
+ AMD_DS_API_OPENGL,
+ AMD_DS_API_VULKAN,
+};
+
+enum si_ds_queue_stage {
+ SI_DS_QUEUE_STAGE_QUEUE,
+ SI_DS_QUEUE_STAGE_COMPUTE,
+ SI_DS_QUEUE_STAGE_DRAW,
+ SI_DS_QUEUE_STAGE_N_STAGES,
+};
+
+struct si_ds_device {
+ const struct radeon_info *info;
+
+ /* API of this device */
+ enum amd_ds_api api;
+
+ /* GPU identifier domain:bus:device:func:pci_id */
+ uint32_t gpu_id;
+
+ /* Clock identifier for this device. */
+ uint32_t gpu_clock_id;
+
+ /* The timestamp at the point where we first emitted the clock_sync..
+ * this will be a *later* timestamp that the first GPU traces (since
+ * we capture the first clock_sync from the CPU *after* the first GPU
+ * tracepoints happen). To avoid confusing perfetto we need to drop
+ * the GPU traces with timestamps before this.
+ */
+ uint64_t sync_gpu_ts;
+
+ /* Next timestamp after which we should resend a clock correlation. */
+ uint64_t next_clock_sync_ns;
+
+ /* Unique perfetto identifier for the context */
+ uint64_t iid;
+
+ /* Event ID generator (manipulate only inside
+ * SIRenderpassDataSource::Trace)
+ */
+ uint64_t event_id;
+
+ struct u_trace_context trace_context;
+
+ /* List of si_ds_queue */
+ struct list_head queues;
+};
+
+struct si_ds_stage {
+ /* Unique hw_queue IID */
+ uint64_t queue_iid;
+
+ /* Unique stage IID */
+ uint64_t stage_iid;
+
+ /* Start timestamp of the last work element. We have a array indexed by
+ * level so that we can track multi levels of events (like
+ * primary/secondary command buffers).
+ */
+ uint64_t start_ns[5];
+
+ /* Current number of valid elements in start_ns */
+ uint32_t level;
+};
+
+struct si_ds_queue {
+ struct list_head link;
+
+ /* Device this queue belongs to */
+ struct si_ds_device *device;
+
+ /* Unique name of the queue */
+ char name[80];
+
+ /* Counter incremented on each si_ds_end_submit() call */
+ uint64_t submission_id;
+
+ struct si_ds_stage stages[SI_DS_QUEUE_STAGE_N_STAGES];
+};
+
+struct si_ds_flush_data {
+ struct si_ds_queue *queue;
+
+ /* u_trace element in which we copy other traces in case we deal with
+ * reusable command buffers.
+ */
+ struct u_trace trace;
+
+ /* Unique submission ID associated with the trace */
+ uint64_t submission_id;
+};
+
+void si_driver_ds_init(void);
+
+void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
+ uint32_t gpu_id, enum amd_ds_api api);
+void si_ds_device_fini(struct si_ds_device *device);
+
+struct si_ds_queue *si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue,
+ const char *fmt_name, ...);
+
+void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
+ uint64_t submission_id);
+
+void si_ds_flush_data_fini(struct si_ds_flush_data *data);
+
+#ifdef HAVE_PERFETTO
+uint64_t si_ds_begin_submit(struct si_ds_queue *queue);
+void si_ds_end_submit(struct si_ds_queue *queue,
+ uint64_t start_ts);
+
+#else
+static inline uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
+{
+ return 0;
+}
+
+static inline void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
+{
+}
+
+#endif /* HAVE_PERFETTO */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SI_PERFETTO_H */
--- /dev/null
+#
+# Copyright 2023 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+#
+
+import argparse
+import sys
+
+# List of the default tracepoints enabled. By default most tracepoints are
+# enabled, set tp_default=False to disable them by default.
+#
+si_default_tps = []
+
+#
+# Tracepoint definitions:
+#
+def define_tracepoints(args):
+ from u_trace import Header, HeaderScope
+ from u_trace import ForwardDecl
+ from u_trace import Tracepoint
+ from u_trace import TracepointArg as Arg
+ from u_trace import TracepointArgStruct as ArgStruct
+
+ Header('si_perfetto.h', scope=HeaderScope.HEADER)
+
+
+ def begin_end_tp(name, tp_args=[], tp_struct=None, tp_print=None,
+ tp_default_enabled=True, end_pipelined=True,
+ need_cs_param=False):
+ global si_default_tps
+ if tp_default_enabled:
+ si_default_tps.append(name)
+ Tracepoint('si_begin_{0}'.format(name),
+ toggle_name=name,
+ tp_perfetto='si_ds_begin_{0}'.format(name),
+ need_cs_param=need_cs_param)
+ Tracepoint('si_end_{0}'.format(name),
+ toggle_name=name,
+ args=tp_args,
+ tp_struct=tp_struct,
+ tp_perfetto='si_ds_end_{0}'.format(name),
+ tp_print=tp_print,
+ end_of_pipe=end_pipelined,
+ need_cs_param=need_cs_param)
+
+ # Various draws/dispatch, radeonsi
+ begin_end_tp('draw',
+ tp_args=[Arg(type='uint32_t', var='count', c_format='%u')])
+
+ begin_end_tp('compute',
+ tp_args=[Arg(type='uint32_t', var='group_x', c_format='%u'),
+ Arg(type='uint32_t', var='group_y', c_format='%u'),
+ Arg(type='uint32_t', var='group_z', c_format='%u'),],
+ tp_print=['group=%ux%ux%u', '__entry->group_x', '__entry->group_y', '__entry->group_z'])
+
+def generate_code(args):
+ from u_trace import utrace_generate
+ from u_trace import utrace_generate_perfetto_utils
+
+ utrace_generate(cpath=args.src, hpath=args.hdr,
+ ctx_param='struct si_ds_device *dev',
+ trace_toggle_name='si_gpu_tracepoint',
+ trace_toggle_defaults=si_default_tps)
+ utrace_generate_perfetto_utils(hpath=args.perfetto_hdr)
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-p', '--import-path', required=True)
+ parser.add_argument('-C','--src', required=True)
+ parser.add_argument('-H','--hdr', required=True)
+ parser.add_argument('--perfetto-hdr', required=True)
+ args = parser.parse_args()
+ sys.path.insert(0, args.import_path)
+ define_tracepoints(args)
+ generate_code(args)
+
+if __name__ == '__main__':
+ main()