tu/perfetto: Allow gpu time to be passed into tu_perfetto_submit
author Danylo Piliaiev <dpiliaiev@igalia.com>
Wed, 9 Aug 2023 14:14:07 +0000 (16:14 +0200)
committer Marge Bot <emma+marge@anholt.net>
Tue, 3 Oct 2023 14:19:24 +0000 (14:19 +0000)
This is in preparation for supporting perfetto on KGSL, where GPU time
is retrieved on submission and requires minimal post-processing.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12805>

src/freedreno/vulkan/tu_device.h
src/freedreno/vulkan/tu_knl_drm_msm.cc
src/freedreno/vulkan/tu_perfetto.cc
src/freedreno/vulkan/tu_perfetto.h

index 4878823..67db9b4 100644 (file)
@@ -521,6 +521,11 @@ struct tu_u_trace_submission_data
    uint32_t cmd_buffer_count;
    uint32_t last_buffer_with_tracepoints;
    struct tu_u_trace_cmd_data *cmd_trace_data;
+
+   /* GPU time is reset on GPU power cycle, so the offset which keeps it
+    * monotonic may change between submissions.
+    */
+   uint64_t gpu_ts_offset;
 };
 
 VkResult
index 720de82..3a5e356 100644 (file)
@@ -919,14 +919,18 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
 
    p_atomic_set(&queue->fence, req.fence);
 
+   uint64_t gpu_offset = 0;
 #if HAVE_PERFETTO
-   tu_perfetto_submit(queue->device, queue->device->submit_count);
+   struct tu_perfetto_clocks clocks =
+      tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
+   gpu_offset = clocks.gpu_ts_offset;
 #endif
 
    if (submit->u_trace_submission_data) {
       struct tu_u_trace_submission_data *submission_data =
          submit->u_trace_submission_data;
       submission_data->submission_id = queue->device->submit_count;
+      submission_data->gpu_ts_offset = gpu_offset;
       /* We have to allocate it here since it is different between drm/kgsl */
       submission_data->syncobj = (struct tu_u_trace_syncobj *)
          vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
index 614b6fe..b417eec 100644 (file)
@@ -247,6 +247,7 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
    auto trace_flush_data =
       (const struct tu_u_trace_submission_data *) flush_data;
    uint32_t submission_id = trace_flush_data->submission_id;
+   uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset;
 
    if (!stage)
       return;
@@ -272,9 +273,9 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id,
 
       auto packet = tctx.NewTracePacket();
 
-      gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_timestamp_offset);
+      gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset);
 
-      packet->set_timestamp(stage->start_ts + gpu_timestamp_offset);
+      packet->set_timestamp(stage->start_ts + gpu_ts_offset);
       packet->set_timestamp_clock_id(gpu_clock_id);
 
       auto event = packet->set_gpu_render_stage_event();
@@ -315,64 +316,13 @@ tu_perfetto_init(void)
 }
 
 static void
-sync_timestamp(struct tu_device *dev)
+emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts) /* gpu_ts: GPU time with the offset already added by the caller */
 {
-   uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
-   uint64_t gpu_ts = 0;
-
-   if (cpu_ts < next_clock_sync_ns)
-      return;
-
-   if (tu_device_get_gpu_timestamp(dev, &gpu_ts)) {
-      PERFETTO_ELOG("Could not sync CPU and GPU clocks");
-      return;
-   }
-
-   /* get cpu timestamp again because tu_device_get_gpu_timestamp can take
-    * >100us
-    */
-   cpu_ts = perfetto::base::GetBootTimeNs().count();
-
-   uint64_t current_suspend_count = 0;
-   /* If we fail to get it we will use a fallback */
-   tu_device_get_suspend_count(dev, &current_suspend_count);
-
-   /* convert GPU ts into ns: */
-   gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
-
-   /* GPU timestamp is being reset after suspend-resume cycle.
-    * Perfetto requires clock snapshots to be monotonic,
-    * so we have to fix-up the time.
-    */
-   if (current_suspend_count != last_suspend_count) {
-      gpu_timestamp_offset = gpu_max_timestamp;
-      last_suspend_count = current_suspend_count;
-   }
-
-   gpu_ts += gpu_timestamp_offset;
-
-   /* Fallback check, detect non-monotonic cases which would happen
-    * if we cannot retrieve suspend count.
-    */
-   if (sync_gpu_ts > gpu_ts) {
-      gpu_ts += (gpu_max_timestamp - gpu_timestamp_offset);
-      gpu_timestamp_offset = gpu_max_timestamp;
-   }
-
-   if (sync_gpu_ts > gpu_ts) {
-      PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
-      return;
-   }
-
    TuRenderpassDataSource::Trace([=](auto tctx) {
       MesaRenderpassDataSource<TuRenderpassDataSource,
                                TuRenderpassTraits>::EmitClockSync(tctx, cpu_ts,
                                                                   gpu_ts, gpu_clock_id);
    });
-
-   gpu_max_timestamp = gpu_ts;
-   sync_gpu_ts = gpu_ts;
-   next_clock_sync_ns = cpu_ts + 30000000;
 }
 
 static void
@@ -390,15 +340,87 @@ emit_submit_id(uint32_t submission_id)
    });
 }
 
-void
-tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
+struct tu_perfetto_clocks
+tu_perfetto_submit(struct tu_device *dev,
+                   uint32_t submission_id,
+                   struct tu_perfetto_clocks *gpu_clocks)
 {
-   /* sync_timestamp isn't free */
+   struct tu_perfetto_clocks clocks {};
+   if (gpu_clocks) {
+      clocks = *gpu_clocks;
+   }
+
    if (!u_trace_perfetto_active(tu_device_get_u_trace(dev)))
-      return;
+      return {};
+
+   clocks.cpu = perfetto::base::GetBootTimeNs().count();
+
+   if (gpu_clocks) {
+      /* TODO: It would be better to use CPU time that comes
+       * together with GPU time from the KGSL, but it's not
+       * equal to GetBootTimeNs.
+       */
+
+      clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset);
+      gpu_timestamp_offset = clocks.gpu_ts_offset;
+      sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
+   } else {
+      clocks.gpu_ts = 0;
+      clocks.gpu_ts_offset = gpu_timestamp_offset;
+
+      if (clocks.cpu < next_clock_sync_ns)
+         return clocks;
+
+      if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) {
+         PERFETTO_ELOG("Could not sync CPU and GPU clocks");
+         return {};
+      }
+
+      clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts);
+
+      /* get cpu timestamp again because tu_device_get_gpu_timestamp can take
+       * >100us
+       */
+      clocks.cpu = perfetto::base::GetBootTimeNs().count();
+
+      uint64_t current_suspend_count = 0;
+      /* If we fail to get it we will use a fallback */
+      tu_device_get_suspend_count(dev, &current_suspend_count);
+
+      /* GPU timestamp is being reset after suspend-resume cycle.
+       * Perfetto requires clock snapshots to be monotonic,
+       * so we have to fix-up the time.
+       */
+      if (current_suspend_count != last_suspend_count) {
+         gpu_timestamp_offset = gpu_max_timestamp;
+         last_suspend_count = current_suspend_count;
+      }
+      clocks.gpu_ts_offset = gpu_timestamp_offset;
+
+      uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset;
+
+      /* Fallback check, detect non-monotonic cases which would happen
+       * if we cannot retrieve suspend count.
+       */
+      if (sync_gpu_ts > gpu_absolute_ts) {
+         gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset);
+         gpu_timestamp_offset = gpu_max_timestamp;
+         clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset;
+      }
+
+      if (sync_gpu_ts > gpu_absolute_ts) {
+         PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out");
+         return {};
+      }
+
+      gpu_max_timestamp = clocks.gpu_ts;
+      sync_gpu_ts = clocks.gpu_ts;
+      next_clock_sync_ns = clocks.cpu + 30000000;
+   }
 
-   sync_timestamp(dev);
+   emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset);
    emit_submit_id(submission_id);
+   return clocks;
 }
 
 /*
index 922cdc6..40b6a68 100644 (file)
@@ -39,7 +39,17 @@ struct tu_perfetto_state {
 
 void tu_perfetto_init(void);
 
-void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id);
+struct tu_perfetto_clocks
+{
+   uint64_t cpu;           /* CPU boot time in ns */
+   uint64_t gpu_ts;        /* GPU time, without gpu_ts_offset */
+   uint64_t gpu_ts_offset; /* added to gpu_ts to keep GPU time monotonic */
+};
+
+struct tu_perfetto_clocks
+tu_perfetto_submit(struct tu_device *dev,
+                   uint32_t submission_id,
+                   struct tu_perfetto_clocks *clocks); /* clocks: NULL if the GPU time has to be queried by this call */
 
 #ifdef __cplusplus
 }