radeonsi: Add perfetto support in radeonsi
authorSaroj Kumar <saroj.kumar@amd.com>
Mon, 17 Jul 2023 14:51:29 +0000 (20:21 +0530)
committerMarge Bot <emma+marge@anholt.net>
Thu, 19 Oct 2023 16:16:15 +0000 (16:16 +0000)
Add the perfetto code in the new files si_perfetto.h/cpp, which add
tracepoint begin and end events and call into the code generated by the
Python script si_tracepoints.py.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23664>

src/gallium/drivers/radeonsi/meson.build
src/gallium/drivers/radeonsi/si_perfetto.cpp [new file with mode: 0644]
src/gallium/drivers/radeonsi/si_perfetto.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_tracepoints.py [new file with mode: 0644]

index ec0b8bf..a45e529 100644 (file)
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+si_tracepoints = custom_target(
+  'si_tracepoints.[ch]',
+  input: 'si_tracepoints.py',
+  output: ['si_tracepoints.c', 'si_tracepoints_perfetto.h', 'si_tracepoints.h'],
+  command: [
+    prog_python, '@INPUT@',
+    '-p', join_paths(dir_source_root, 'src/util/perf/'),
+    '-C', '@OUTPUT0@',
+    '--perfetto-hdr', '@OUTPUT1@',
+    '-H', '@OUTPUT2@'
+  ],
+  depend_files: u_trace_py,
+)
+
 files_libradeonsi = files(
   'driinfo_radeonsi.h',
   'gfx10_shader_ngg.c',
@@ -101,9 +115,17 @@ files_libradeonsi = files(
   'radeon_video.h',
 )
 
+files_libradeonsi += si_tracepoints
+
 radeonsi_include_dirs = [inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_amd_common,
-                         inc_amd_common_llvm, inc_gallium_drivers]
-radeonsi_deps = [dep_llvm, dep_clock, dep_libdrm_radeon, idep_nir_headers, idep_amdgfxregs_h, idep_mesautil, idep_aco]
+                         inc_amd_common_llvm, inc_gallium_drivers, inc_compiler]
+radeonsi_deps = [dep_llvm, dep_clock, dep_libdrm_radeon, idep_nir_headers, idep_amdgfxregs_h, idep_mesautil, idep_aco, idep_u_tracepoints]
+
+if with_perfetto
+  radeonsi_deps += dep_perfetto
+endif
+
+files_libradeonsi += ['si_perfetto.cpp', 'si_perfetto.h']
 
 radeonsi_gfx_libs = []
 foreach ver : ['6', '7', '8', '9', '10', '103', '11']
diff --git a/src/gallium/drivers/radeonsi/si_perfetto.cpp b/src/gallium/drivers/radeonsi/si_perfetto.cpp
new file mode 100644 (file)
index 0000000..e5c1e8c
--- /dev/null
@@ -0,0 +1,394 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "util/hash_table.h"
+#include "util/u_process.h"
+#include "util/hash_table.h"
+
+#include "si_pipe.h"
+#include "si_perfetto.h"
+#include "si_tracepoints.h"
+
+#ifdef HAVE_PERFETTO
+
+#include "util/perf/u_perfetto.h"
+#include "util/perf/u_perfetto_renderpass.h"
+
+#include "si_tracepoints_perfetto.h"
+
+/* Just naming stages */
+static const struct {
+   /* Human-readable stage name, used when naming perfetto rows and stages. */
+   const char *name;
+
+   /* The perfetto UI requires a parent-child relationship within a row of
+    * elements: every child element must end within the lifespan of its
+    * parent.
+    *
+    * Stalls and command buffers follow that relationship, but not every
+    * element does. This field tells us in which UI row the elements of the
+    * stage should live.
+    */
+   enum si_ds_queue_stage draw_stage;
+} si_queue_stage_desc[SI_DS_QUEUE_STAGE_N_STAGES] = {
+   /* Indexed by enum si_ds_queue_stage: keep the entries in enum order! */
+   {"queue", SI_DS_QUEUE_STAGE_QUEUE},
+   {"compute", SI_DS_QUEUE_STAGE_COMPUTE},
+   {"draw", SI_DS_QUEUE_STAGE_DRAW},
+};
+
+/* Incremental state attached to the data source. was_cleared starts out true
+ * so that the first trace callback sends the descriptors (see the
+ * GetIncrementalState() checks in end_event() and si_ds_end_submit()), which
+ * then set it to false.
+ */
+struct SIRenderpassIncrementalState {
+   bool was_cleared = true;
+};
+
+/* Data source traits: the perfetto defaults, with the incremental state type
+ * replaced by SIRenderpassIncrementalState.
+ */
+struct SIRenderpassTraits : public perfetto::DefaultDataSourceTraits {
+   using IncrementalStateType = SIRenderpassIncrementalState;
+};
+
+/* radeonsi render-stage data source. It adds nothing on top of the shared
+ * MesaRenderpassDataSource template.
+ */
+class SIRenderpassDataSource : public MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits> {
+};
+
+/* Static members perfetto requires for each data source type. */
+PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
+PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(SIRenderpassDataSource);
+
+using perfetto::protos::pbzero::InternedGpuRenderStageSpecification_RenderStageCategory;
+
+/* Emit a CPU/GPU clock correlation packet for `device`.
+ *
+ * The GPU timestamp is read through the pipe_screen of the si_context that
+ * embeds `device`, and the CPU boot-time clock is sampled right after it.
+ * Nothing is emitted while the CPU time is still below
+ * device->next_clock_sync_ns; otherwise the sampled GPU timestamp is stored
+ * in device->sync_gpu_ts and the next sync is scheduled one second later.
+ */
+static void sync_timestamp(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
+{
+   struct si_context *sctx = container_of(device, struct si_context, ds);
+
+   /* Sample the GPU clock first and the CPU clock immediately afterwards so
+    * that the two values describe (almost) the same instant. The CPU sample
+    * that used to be taken before the GPU read was always overwritten, so it
+    * has been dropped.
+    */
+   const uint64_t gpu_ts = sctx->screen->b.get_timestamp(&sctx->screen->b);
+   const uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
+
+   if (cpu_ts < device->next_clock_sync_ns)
+      return;
+
+   PERFETTO_LOG("sending clocks gpu=0x%08x", device->gpu_clock_id);
+
+   device->sync_gpu_ts = gpu_ts;
+   device->next_clock_sync_ns = cpu_ts + 1000000000ull;
+   MesaRenderpassDataSource<SIRenderpassDataSource, SIRenderpassTraits>::EmitClockSync(ctx, cpu_ts, gpu_ts, device->gpu_clock_id);
+}
+
+/* Send the interned descriptors (graphics context, hardware queues and
+ * stages) of `device` in a packet flagged SEQ_INCREMENTAL_STATE_CLEARED,
+ * then force a fresh clock sync.
+ *
+ * The per-device event counter and the level-0 start timestamp of every
+ * stage are reset first.
+ */
+static void send_descriptors(SIRenderpassDataSource::TraceContext &ctx, struct si_ds_device *device)
+{
+   PERFETTO_LOG("Sending renderstage descriptors");
+
+   device->event_id = 0;
+   list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
+      for (uint32_t s = 0; s < ARRAY_SIZE(queue->stages); s++) {
+         queue->stages[s].start_ns[0] = 0;
+      }
+   }
+
+   {
+      auto packet = ctx.NewTracePacket();
+
+      packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
+      packet->set_timestamp_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
+      packet->set_sequence_flags(perfetto::protos::pbzero::TracePacket::SEQ_INCREMENTAL_STATE_CLEARED);
+
+      auto interned_data = packet->set_interned_data();
+
+      {
+         /* Describe the graphics context: its IID, our PID and the API. */
+         auto desc = interned_data->add_graphics_contexts();
+         desc->set_iid(device->iid);
+         desc->set_pid(getpid());
+         switch (device->api) {
+         case AMD_DS_API_OPENGL:
+            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::OPEN_GL);
+            break;
+         case AMD_DS_API_VULKAN:
+            desc->set_api(perfetto::protos::pbzero::InternedGraphicsContext_Api::VULKAN);
+            break;
+         default:
+            break;
+         }
+      }
+
+      /* Emit all the IID picked at device/queue creation. */
+      list_for_each_entry_safe(struct si_ds_queue, queue, &device->queues, link) {
+         for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
+            {
+               /* We put the stage number in there so that all rows are
+                * ordered by si_ds_queue_stage.
+                */
+               char name[100];
+               snprintf(name, sizeof(name), "%.10s-%s-%u-%s", util_get_process_name(), queue->name, s, si_queue_stage_desc[s].name);
+
+               auto desc = interned_data->add_gpu_specifications();
+               desc->set_iid(queue->stages[s].queue_iid);
+               desc->set_name(name);
+            }
+            {
+               /* Second specification: the stage itself, under its plain name. */
+               auto desc = interned_data->add_gpu_specifications();
+               desc->set_iid(queue->stages[s].stage_iid);
+               desc->set_name(si_queue_stage_desc[s].name);
+            }
+         }
+      }
+   }
+
+   /* Clearing the deadline makes the sync_timestamp() call below emit. */
+   device->next_clock_sync_ns = 0;
+   sync_timestamp(ctx, device);
+}
+
+/* Type-erased callback that receives a GpuRenderStageEvent together with an
+ * opaque tracepoint payload; end_event() calls it when both are provided.
+ */
+typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
+
+/* Record the start of an event on stage `stage_id` of `queue`.
+ *
+ * `ts_ns` is stored in the stage's start_ns slot for the current nesting
+ * level and the level is incremented. Nothing is pushed when no clock sync
+ * has happened yet or when the nesting stack is full.
+ */
+static void begin_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id)
+{
+   /* uint64_t is not `unsigned long` on every ABI, so cast to a type that
+    * matches the conversion specifier instead of relying on "%lu".
+    */
+   PERFETTO_LOG("begin event called - ts_ns=%llu", static_cast<unsigned long long>(ts_ns));
+   struct si_ds_stage *stage = &queue->stages[stage_id];
+   uint32_t level = stage->level;
+   /* If we haven't managed to calibrate the alignment between GPU and CPU
+    * timestamps yet, then skip this trace, otherwise perfetto won't know
+    * what to do with it.
+    */
+   if (!queue->device->sync_gpu_ts) {
+      stage->start_ns[level] = 0;
+      return;
+   }
+
+   /* The last slot is never pushed to, so `level` always stays a valid index
+    * into start_ns.
+    */
+   if (level >= (ARRAY_SIZE(stage->start_ns) - 1))
+      return;
+
+   stage->start_ns[level] = ts_ns;
+   stage->level++;
+}
+
+/* Close the innermost open event on stage `stage_id` of `queue` and emit it
+ * as a GpuRenderStageEvent.
+ *
+ * queue            queue the event belongs to
+ * ts_ns            end timestamp of the event
+ * stage_id         stage whose nesting stack is popped
+ * submission_id    submission the event is attributed to
+ * app_event        when non-NULL, name of an application event; its stage
+ *                  IID is obtained from the data source instead of the stage
+ * payload          optional tracepoint payload
+ * payload_as_extra optional callback invoked with the event and `payload`
+ *
+ * Nothing is emitted when no clock sync has happened yet, when no event is
+ * open, or when the recorded start timestamp is zero or later than `ts_ns`.
+ */
+static void end_event(struct si_ds_queue *queue, uint64_t ts_ns, enum si_ds_queue_stage stage_id, uint32_t submission_id, const char *app_event, const void* payload = nullptr, trace_payload_as_extra_func payload_as_extra = nullptr)
+{
+   /* uint64_t is not `unsigned long` on every ABI, so the timestamps are cast
+    * to a type that matches the conversion specifier instead of using "%lu".
+    */
+   PERFETTO_LOG("end event called - ts_ns=%llu", static_cast<unsigned long long>(ts_ns));
+   struct si_ds_device *device = queue->device;
+
+   /* If we haven't managed to calibrate the alignment between GPU and CPU
+    * timestamps yet, then skip this trace, otherwise perfetto won't know
+    * what to do with it.
+    */
+   if (!device->sync_gpu_ts)
+      return;
+
+   if (queue->stages[stage_id].level == 0)
+      return;
+
+   uint32_t level = --queue->stages[stage_id].level;
+   struct si_ds_stage *stage = &queue->stages[stage_id];
+   uint64_t start_ns = stage->start_ns[level];
+   PERFETTO_LOG("end event called - start_ns=%llu ts_ns=%llu",
+                static_cast<unsigned long long>(start_ns),
+                static_cast<unsigned long long>(ts_ns));
+   if (!start_ns || start_ns > ts_ns)
+      return;
+
+   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
+      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
+         send_descriptors(tctx, queue->device);
+         state->was_cleared = false;
+      }
+
+      sync_timestamp(tctx, queue->device);
+
+      uint64_t evt_id = device->event_id++;
+
+      /* If this is an application event, we might need to generate a new
+       * stage_iid if not already seen. Otherwise, it's a driver event and we
+       * have to use the internal stage_iid.
+       */
+      uint64_t stage_iid = app_event ? tctx.GetDataSourceLocked()->debug_marker_stage(tctx, app_event) : stage->stage_iid;
+
+      auto packet = tctx.NewTracePacket();
+
+      /* The event is stamped with its start time on the GPU clock. */
+      packet->set_timestamp(start_ns);
+      packet->set_timestamp_clock_id(queue->device->gpu_clock_id);
+
+      assert(ts_ns >= start_ns);
+
+      auto event = packet->set_gpu_render_stage_event();
+      event->set_gpu_id(queue->device->gpu_id);
+
+      event->set_hw_queue_iid(stage->queue_iid);
+      event->set_stage_iid(stage_iid);
+      event->set_context(queue->device->iid);
+      event->set_event_id(evt_id);
+      event->set_duration(ts_ns - start_ns);
+      event->set_submission_id(submission_id);
+
+      if (payload && payload_as_extra) {
+         payload_as_extra(event, payload);
+      }
+   });
+
+   /* Mark the slot as consumed. */
+   stage->start_ns[level] = 0;
+}
+
+#endif /* HAVE_PERFETTO */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_PERFETTO
+
+/*
+ * Trace callbacks, called from u_trace once the timestamps from GPU have been
+ * collected.
+ *
+ * CREATE_DUAL_EVENT_CALLBACK(event_name, stage) defines the pair
+ * si_ds_begin_<event_name>() / si_ds_end_<event_name>(). Both treat
+ * `flush_data` as a struct si_ds_flush_data: the begin callback forwards to
+ * begin_event(), the end callback forwards to end_event() with the flush's
+ * submission id, the tracepoint payload and the matching
+ * trace_payload_as_extra_si_end_<event_name> helper.
+ */
+
+#define CREATE_DUAL_EVENT_CALLBACK(event_name, stage)                                             \
+void si_ds_begin_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,       \
+                              const void *flush_data,                                             \
+                              const struct trace_si_begin_##event_name *payload)                  \
+{                                                                                                 \
+   const struct si_ds_flush_data *flush = (const struct si_ds_flush_data *) flush_data;           \
+   begin_event(flush->queue, ts_ns, stage);                                                       \
+}                                                                                                 \
+                                                                                                  \
+void si_ds_end_##event_name(struct si_ds_device *device, uint64_t ts_ns, uint16_t tp_idx,         \
+                            const void *flush_data,                                               \
+                            const struct trace_si_end_##event_name *payload)                      \
+{                                                                                                 \
+   const struct si_ds_flush_data *flush =  (const struct si_ds_flush_data *) flush_data;          \
+   end_event(flush->queue, ts_ns, stage, flush->submission_id, NULL, payload,                     \
+             (trace_payload_as_extra_func)&trace_payload_as_extra_si_end_##event_name);           \
+}                                                                                                 \
+
+/* Callbacks for the "draw" and "compute" tracepoints. */
+CREATE_DUAL_EVENT_CALLBACK(draw, SI_DS_QUEUE_STAGE_DRAW)
+CREATE_DUAL_EVENT_CALLBACK(compute, SI_DS_QUEUE_STAGE_COMPUTE)
+
+/* Return the current CPU boot-time clock in nanoseconds; the caller hands
+ * the value back to si_ds_end_submit(). `queue` is not used.
+ */
+uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
+{
+   const auto now = perfetto::base::GetBootTimeNs();
+   return now.count();
+}
+
+/* Emit a queue-submit event covering [start_ts, now] for `queue`.
+ *
+ * `start_ts` is the value returned by si_ds_begin_submit(). When u_trace
+ * reports that traces should not be processed, the device's clock-sync state
+ * is reset instead and nothing is emitted (the queue's submission counter is
+ * left untouched in that case).
+ */
+void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
+{
+   if (!u_trace_should_process(&queue->device->trace_context)) {
+      queue->device->sync_gpu_ts = 0;
+      queue->device->next_clock_sync_ns = 0;
+      return;
+   }
+
+   uint64_t end_ts = perfetto::base::GetBootTimeNs().count();
+   uint32_t submission_id = queue->submission_id++;
+
+   SIRenderpassDataSource::Trace([=](SIRenderpassDataSource::TraceContext tctx) {
+      if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
+         send_descriptors(tctx, queue->device);
+         state->was_cleared = false;
+      }
+
+      sync_timestamp(tctx, queue->device);
+
+      auto packet = tctx.NewTracePacket();
+
+      packet->set_timestamp(start_ts);
+
+      /* NOTE(review): the submit is reported through the vulkan_api_event /
+       * vk_queue_submit message regardless of device->api - confirm this is
+       * intended for OpenGL contexts.
+       */
+      auto event = packet->set_vulkan_api_event();
+      auto submit = event->set_vk_queue_submit();
+
+      submit->set_duration_ns(end_ts - start_ts);
+      submit->set_vk_queue((uintptr_t) queue);
+      submit->set_submission_id(submission_id);
+   });
+}
+
+#endif /* HAVE_PERFETTO */
+
+/* One-time setup, run through call_once() from si_driver_ds_init(). With
+ * perfetto support compiled in, it initializes perfetto and registers the
+ * "gpu.renderstages.amd" data source; otherwise it does nothing.
+ */
+static void si_driver_ds_init_once(void)
+{
+#ifdef HAVE_PERFETTO
+   util_perfetto_init();
+   perfetto::DataSourceDescriptor dsd;
+   dsd.set_name("gpu.renderstages.amd");
+   SIRenderpassDataSource::Register(dsd);
+#endif
+}
+
+/* Guards si_driver_ds_init_once(). */
+static once_flag si_driver_ds_once_flag = ONCE_FLAG_INIT;
+/* Next interning ID to hand out; IDs start at 1. */
+static uint64_t iid = 1;
+
+/* Return a fresh interning ID and advance the counter. */
+static uint64_t get_iid()
+{
+   const uint64_t assigned = iid;
+   iid = assigned + 1;
+   return assigned;
+}
+
+/* Derive the perfetto clock ID of GPU `gpu_id`: the hash of a per-GPU clock
+ * name with the top bit forced to 1.
+ */
+static uint32_t si_pps_clock_id(uint32_t gpu_id)
+{
+   char clock_name[40];
+   snprintf(clock_name, sizeof(clock_name), "org.freedesktop.mesa.amd.gpu%u", gpu_id);
+
+   const uint32_t name_hash = _mesa_hash_string(clock_name);
+   return name_hash | 0x80000000;
+}
+
+/* Driver-wide initialization: run the one-time setup (at most once per
+ * process, via call_once), then call si_gpu_tracepoint_config_variable().
+ */
+void si_driver_ds_init(void)
+{
+   call_once(&si_driver_ds_once_flag, si_driver_ds_init_once);
+   si_gpu_tracepoint_config_variable();
+}
+
+/* Fill in `device`: remember the GPU info and API, derive the clock ID from
+ * `gpu_id`, pick a fresh context IID and start with an empty queue list.
+ */
+void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
+                       uint32_t gpu_id, enum amd_ds_api api)
+{
+   /* What the device is. */
+   device->info = devinfo;
+   device->api = api;
+
+   /* How perfetto identifies it. */
+   device->gpu_id = gpu_id;
+   device->gpu_clock_id = si_pps_clock_id(gpu_id);
+   device->iid = get_iid();
+
+   list_inithead(&device->queues);
+}
+
+/* Tear down `device` by finalizing its u_trace context. */
+void si_ds_device_fini(struct si_ds_device *device)
+{
+   u_trace_context_fini(&device->trace_context);
+}
+
+/* Attach `queue` to `device`.
+ *
+ * The queue name is built printf-style from `fmt_name` and the variadic
+ * arguments, every stage receives a fresh queue IID and stage IID, and the
+ * queue is added to the device's queue list. Returns `queue`.
+ */
+struct si_ds_queue * si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue, const char *fmt_name, ...)
+{
+   queue->device = device;
+
+   va_list name_args;
+   va_start(name_args, fmt_name);
+   vsnprintf(queue->name, sizeof(queue->name), fmt_name, name_args);
+   va_end(name_args);
+
+   for (unsigned s = 0; s < SI_DS_QUEUE_STAGE_N_STAGES; s++) {
+      struct si_ds_stage *stage = &queue->stages[s];
+
+      /* IDs are drawn in this order: queue IID first, stage IID second. */
+      stage->queue_iid = get_iid();
+      stage->stage_iid = get_iid();
+   }
+
+   list_add(&queue->link, &device->queues);
+
+   return queue;
+}
+
+/* Prepare `data` for a submission on `queue`: zero it, record the queue and
+ * the submission ID, and initialize its u_trace against the trace context
+ * of the queue's device.
+ */
+void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
+                           uint64_t submission_id)
+{
+   memset(data, 0, sizeof(*data));
+
+   data->submission_id = submission_id;
+   data->queue = queue;
+
+   struct u_trace_context *trace_ctx = &queue->device->trace_context;
+   u_trace_init(&data->trace, trace_ctx);
+}
+
+/* Release the u_trace set up by si_ds_flush_data_init(). */
+void si_ds_flush_data_fini(struct si_ds_flush_data *data)
+{
+   u_trace_fini(&data->trace);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/gallium/drivers/radeonsi/si_perfetto.h b/src/gallium/drivers/radeonsi/si_perfetto.h
new file mode 100644 (file)
index 0000000..1897064
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#ifndef SI_PERFETTO_H
+#define SI_PERFETTO_H
+
+#include <stdint.h>
+
+#include "util/macros.h"
+#include "util/perf/u_trace.h"
+#include "util/u_vector.h"
+
+#include "amd/common/ac_gpu_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Perfetto collects TracePackets from the application and/or drivers. The
+ * TracePacket is the root object of a Perfetto trace: a trace is a linear
+ * sequence of TracePackets.
+ * Each TracePacket contains a timestamp and a timestamp_clock_id along with
+ * other data such as gpu_counter_event and gpu_render_stage_event.
+ * A gpu_render_stage_event contains data such as event_id, duration, gpu_id,
+ * stage_iid, context etc. For example, a render stage named "draw" collects
+ * the start and end timestamps, along with other payload data, of each
+ * OpenGL draw call.
+ */
+
+/* Graphics API a si_ds_device is created for (see si_ds_device_init()). */
+enum amd_ds_api {
+   AMD_DS_API_OPENGL,
+   AMD_DS_API_VULKAN,
+};
+
+/* Stages tracked for every queue. The values index si_ds_queue::stages, and
+ * SI_DS_QUEUE_STAGE_N_STAGES is the number of stages, so it must stay last.
+ */
+enum si_ds_queue_stage {
+   SI_DS_QUEUE_STAGE_QUEUE,
+   SI_DS_QUEUE_STAGE_COMPUTE,
+   SI_DS_QUEUE_STAGE_DRAW,
+   SI_DS_QUEUE_STAGE_N_STAGES,
+};
+
+/* Per-device tracing state. */
+struct si_ds_device {
+   /* GPU information passed to si_ds_device_init() */
+   const struct radeon_info *info;
+
+   /* API of this device */
+   enum amd_ds_api api;
+
+   /* GPU identifier domain:bus:device:func:pci_id */
+   uint32_t gpu_id;
+
+   /* Clock identifier for this device. */
+   uint32_t gpu_clock_id;
+
+   /* The timestamp at the point where we first emitted the clock_sync.
+    * This will be a *later* timestamp than the first GPU traces (since
+    * we capture the first clock_sync from the CPU *after* the first GPU
+    * tracepoints happen).  To avoid confusing perfetto we need to drop
+    * the GPU traces with timestamps before this.
+    */
+   uint64_t sync_gpu_ts;
+
+   /* Next timestamp after which we should resend a clock correlation. */
+   uint64_t next_clock_sync_ns;
+
+   /* Unique perfetto identifier for the context */
+   uint64_t iid;
+
+   /* Event ID generator (manipulate only inside
+    * SIRenderpassDataSource::Trace)
+    */
+   uint64_t event_id;
+
+   /* u_trace context of the device */
+   struct u_trace_context trace_context;
+
+   /* List of si_ds_queue */
+   struct list_head queues;
+};
+
+/* Tracing state of one stage of a queue. */
+struct si_ds_stage {
+   /* Unique hw_queue IID */
+   uint64_t queue_iid;
+
+   /* Unique stage IID */
+   uint64_t stage_iid;
+
+   /* Start timestamp of the last work element. We have an array indexed by
+    * level so that we can track multiple levels of events (like
+    * primary/secondary command buffers).
+    */
+   uint64_t start_ns[5];
+
+   /* Current number of valid elements in start_ns */
+   uint32_t level;
+};
+
+/* Tracing state of one queue of a device. */
+struct si_ds_queue {
+   /* Link in si_ds_device::queues */
+   struct list_head link;
+
+   /* Device this queue belongs to */
+   struct si_ds_device *device;
+
+   /* Unique name of the queue */
+   char name[80];
+
+   /* Counter incremented on each si_ds_end_submit() call */
+   uint64_t submission_id;
+
+   /* Per-stage state, indexed by enum si_ds_queue_stage */
+   struct si_ds_stage stages[SI_DS_QUEUE_STAGE_N_STAGES];
+};
+
+/* Data attached to a flush; set up by si_ds_flush_data_init(). */
+struct si_ds_flush_data {
+   /* Queue the flush is submitted on */
+   struct si_ds_queue *queue;
+
+   /* u_trace element in which we copy other traces in case we deal with
+    * reusable command buffers.
+    */
+   struct u_trace trace;
+
+   /* Unique submission ID associated with the trace */
+   uint64_t submission_id;
+};
+
+/* Driver-wide tracing initialization. */
+void si_driver_ds_init(void);
+
+/* Initialize / tear down the tracing state of a device. */
+void si_ds_device_init(struct si_ds_device *device, const struct radeon_info *devinfo,
+                       uint32_t gpu_id, enum amd_ds_api api);
+void si_ds_device_fini(struct si_ds_device *device);
+
+/* Attach `queue` to `device`; the queue name is built printf-style from
+ * `fmt_name` and the following arguments. Returns `queue`.
+ */
+struct si_ds_queue *si_ds_device_init_queue(struct si_ds_device *device, struct si_ds_queue *queue,
+                                            const char *fmt_name, ...);
+
+/* Initialize the flush data of a submission on `queue`. */
+void si_ds_flush_data_init(struct si_ds_flush_data *data, struct si_ds_queue *queue,
+                           uint64_t submission_id);
+
+/* Release what si_ds_flush_data_init() set up. */
+void si_ds_flush_data_fini(struct si_ds_flush_data *data);
+
+/* Submit bracketing: si_ds_begin_submit() returns a start timestamp that the
+ * caller passes back to si_ds_end_submit(). Without perfetto support both
+ * are no-op stubs and the start timestamp is 0.
+ */
+#ifdef HAVE_PERFETTO
+uint64_t si_ds_begin_submit(struct si_ds_queue *queue);
+void si_ds_end_submit(struct si_ds_queue *queue,
+                         uint64_t start_ts);
+
+#else
+static inline uint64_t si_ds_begin_submit(struct si_ds_queue *queue)
+{
+   return 0;
+}
+
+static inline void si_ds_end_submit(struct si_ds_queue *queue, uint64_t start_ts)
+{
+}
+
+#endif /* HAVE_PERFETTO */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SI_PERFETTO_H */
index af042af..c395084 100644 (file)
@@ -17,6 +17,7 @@
 #include "util/u_vertex_state_cache.h"
 #include "ac_sqtt.h"
 #include "ac_spm.h"
+#include "si_perfetto.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -1361,6 +1362,8 @@ struct si_context {
    /* TODO: move other shaders here too */
    /* Only used for DCC MSAA clears with 4-8 fragments and 4-16 samples. */
    void *cs_clear_dcc_msaa[32][5][2][3][2]; /* [swizzle_mode][log2(bpe)][fragments == 8][log2(samples)-2][is_array] */
+   
+   struct si_ds_device ds;
 };
 
 /* si_blit.c */
diff --git a/src/gallium/drivers/radeonsi/si_tracepoints.py b/src/gallium/drivers/radeonsi/si_tracepoints.py
new file mode 100644 (file)
index 0000000..00320b9
--- /dev/null
@@ -0,0 +1,79 @@
+#
+# Copyright 2023 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+#
+
+import argparse
+import sys
+
+# List of the tracepoints enabled by default. Tracepoints are enabled by
+# default; define them with tp_default_enabled=False to disable them.
+#
+si_default_tps = []
+
+#
+# Tracepoint definitions:
+#
+def define_tracepoints(args):
+    """Declare the radeonsi tracepoints with the u_trace generator.
+
+    `args` is the parsed command line; it is not used here. The u_trace
+    module is imported inside the function because it only becomes importable
+    once main() has added --import-path to sys.path.
+    """
+    from u_trace import Header, HeaderScope
+    from u_trace import ForwardDecl
+    from u_trace import Tracepoint
+    from u_trace import TracepointArg as Arg
+    from u_trace import TracepointArgStruct as ArgStruct
+
+    Header('si_perfetto.h', scope=HeaderScope.HEADER)
+
+    def begin_end_tp(name, tp_args=None, tp_struct=None, tp_print=None,
+                     tp_default_enabled=True, end_pipelined=True,
+                     need_cs_param=False):
+        """Define the si_begin_<name> / si_end_<name> tracepoint pair.
+
+        Only the end tracepoint carries arguments, struct and print format.
+        `name` is recorded in si_default_tps unless tp_default_enabled is
+        False.
+        """
+        global si_default_tps
+        # A fresh list per call: a `[]` default would be one shared object
+        # that every Tracepoint created without arguments would receive.
+        if tp_args is None:
+            tp_args = []
+        if tp_default_enabled:
+            si_default_tps.append(name)
+        Tracepoint('si_begin_{0}'.format(name),
+                   toggle_name=name,
+                   tp_perfetto='si_ds_begin_{0}'.format(name),
+                   need_cs_param=need_cs_param)
+        Tracepoint('si_end_{0}'.format(name),
+                   toggle_name=name,
+                   args=tp_args,
+                   tp_struct=tp_struct,
+                   tp_perfetto='si_ds_end_{0}'.format(name),
+                   tp_print=tp_print,
+                   end_of_pipe=end_pipelined,
+                   need_cs_param=need_cs_param)
+
+    # Various draws/dispatch, radeonsi
+    begin_end_tp('draw',
+                 tp_args=[Arg(type='uint32_t', var='count', c_format='%u')])
+
+    begin_end_tp('compute',
+                 tp_args=[Arg(type='uint32_t', var='group_x', c_format='%u'),
+                          Arg(type='uint32_t', var='group_y', c_format='%u'),
+                          Arg(type='uint32_t', var='group_z', c_format='%u')],
+                 tp_print=['group=%ux%ux%u', '__entry->group_x', '__entry->group_y', '__entry->group_z'])
+
+def generate_code(args):
+    """Write the generated tracepoint sources.
+
+    Emits the C source and header at args.src / args.hdr and the perfetto
+    helper header at args.perfetto_hdr.
+    """
+    import u_trace
+
+    u_trace.utrace_generate(
+        cpath=args.src,
+        hpath=args.hdr,
+        ctx_param='struct si_ds_device *dev',
+        trace_toggle_name='si_gpu_tracepoint',
+        trace_toggle_defaults=si_default_tps,
+    )
+    u_trace.utrace_generate_perfetto_utils(hpath=args.perfetto_hdr)
+
+def main():
+    """Parse the command line, then define the tracepoints and generate code."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('-p', '--import-path', required=True)
+    for short_flag, long_flag in (('-C', '--src'), ('-H', '--hdr')):
+        cli.add_argument(short_flag, long_flag, required=True)
+    cli.add_argument('--perfetto-hdr', required=True)
+    options = cli.parse_args()
+
+    # u_trace lives under --import-path; make it importable before use.
+    sys.path.insert(0, options.import_path)
+    define_tracepoints(options)
+    generate_code(options)
+
+if __name__ == '__main__':
+    main()