From 7f59e3723380e7ed72588040e4f496733ac5ec83 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Wed, 9 Aug 2023 16:14:07 +0200 Subject: [PATCH] tu/perfetto: Allow gpu time to be passed into tu_perfetto_submit In preparation to support perfetto on KGSL, on KGSL GPU time is retrieved on submission and requires minimal post-processing. Signed-off-by: Danylo Piliaiev Part-of: --- src/freedreno/vulkan/tu_device.h | 5 ++ src/freedreno/vulkan/tu_knl_drm_msm.cc | 6 +- src/freedreno/vulkan/tu_perfetto.cc | 140 +++++++++++++++++++-------------- src/freedreno/vulkan/tu_perfetto.h | 12 ++- 4 files changed, 102 insertions(+), 61 deletions(-) diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h index 4878823..67db9b4 100644 --- a/src/freedreno/vulkan/tu_device.h +++ b/src/freedreno/vulkan/tu_device.h @@ -521,6 +521,11 @@ struct tu_u_trace_submission_data uint32_t cmd_buffer_count; uint32_t last_buffer_with_tracepoints; struct tu_u_trace_cmd_data *cmd_trace_data; + + /* GPU time is reset on GPU power cycle and the GPU time + * offset may change between submissions due to power cycle. 
+ */ + uint64_t gpu_ts_offset; }; VkResult diff --git a/src/freedreno/vulkan/tu_knl_drm_msm.cc b/src/freedreno/vulkan/tu_knl_drm_msm.cc index 720de82..3a5e356 100644 --- a/src/freedreno/vulkan/tu_knl_drm_msm.cc +++ b/src/freedreno/vulkan/tu_knl_drm_msm.cc @@ -919,14 +919,18 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit) p_atomic_set(&queue->fence, req.fence); + uint64_t gpu_offset = 0; #if HAVE_PERFETTO - tu_perfetto_submit(queue->device, queue->device->submit_count); + struct tu_perfetto_clocks clocks = + tu_perfetto_submit(queue->device, queue->device->submit_count, NULL); + gpu_offset = clocks.gpu_ts_offset; #endif if (submit->u_trace_submission_data) { struct tu_u_trace_submission_data *submission_data = submit->u_trace_submission_data; submission_data->submission_id = queue->device->submit_count; + submission_data->gpu_ts_offset = gpu_offset; /* We have to allocate it here since it is different between drm/kgsl */ submission_data->syncobj = (struct tu_u_trace_syncobj *) vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj), diff --git a/src/freedreno/vulkan/tu_perfetto.cc b/src/freedreno/vulkan/tu_perfetto.cc index 614b6fe..b417eec 100644 --- a/src/freedreno/vulkan/tu_perfetto.cc +++ b/src/freedreno/vulkan/tu_perfetto.cc @@ -247,6 +247,7 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id, auto trace_flush_data = (const struct tu_u_trace_submission_data *) flush_data; uint32_t submission_id = trace_flush_data->submission_id; + uint64_t gpu_ts_offset = trace_flush_data->gpu_ts_offset; if (!stage) return; @@ -272,9 +273,9 @@ stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage_id, auto packet = tctx.NewTracePacket(); - gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_timestamp_offset); + gpu_max_timestamp = MAX2(gpu_max_timestamp, ts_ns + gpu_ts_offset); - packet->set_timestamp(stage->start_ts + gpu_timestamp_offset); + packet->set_timestamp(stage->start_ts + 
gpu_ts_offset); packet->set_timestamp_clock_id(gpu_clock_id); auto event = packet->set_gpu_render_stage_event(); @@ -315,64 +316,13 @@ tu_perfetto_init(void) } static void -sync_timestamp(struct tu_device *dev) +emit_sync_timestamp(uint64_t cpu_ts, uint64_t gpu_ts) { - uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count(); - uint64_t gpu_ts = 0; - - if (cpu_ts < next_clock_sync_ns) - return; - - if (tu_device_get_gpu_timestamp(dev, &gpu_ts)) { - PERFETTO_ELOG("Could not sync CPU and GPU clocks"); - return; - } - - /* get cpu timestamp again because tu_device_get_gpu_timestamp can take - * >100us - */ - cpu_ts = perfetto::base::GetBootTimeNs().count(); - - uint64_t current_suspend_count = 0; - /* If we fail to get it we will use a fallback */ - tu_device_get_suspend_count(dev, &current_suspend_count); - - /* convert GPU ts into ns: */ - gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts); - - /* GPU timestamp is being reset after suspend-resume cycle. - * Perfetto requires clock snapshots to be monotonic, - * so we have to fix-up the time. - */ - if (current_suspend_count != last_suspend_count) { - gpu_timestamp_offset = gpu_max_timestamp; - last_suspend_count = current_suspend_count; - } - - gpu_ts += gpu_timestamp_offset; - - /* Fallback check, detect non-monotonic cases which would happen - * if we cannot retrieve suspend count. 
- */ - if (sync_gpu_ts > gpu_ts) { - gpu_ts += (gpu_max_timestamp - gpu_timestamp_offset); - gpu_timestamp_offset = gpu_max_timestamp; - } - - if (sync_gpu_ts > gpu_ts) { - PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out"); - return; - } - TuRenderpassDataSource::Trace([=](auto tctx) { MesaRenderpassDataSource::EmitClockSync(tctx, cpu_ts, gpu_ts, gpu_clock_id); }); - - gpu_max_timestamp = gpu_ts; - sync_gpu_ts = gpu_ts; - next_clock_sync_ns = cpu_ts + 30000000; } static void @@ -390,15 +340,87 @@ emit_submit_id(uint32_t submission_id) }); } -void -tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id) +struct tu_perfetto_clocks +tu_perfetto_submit(struct tu_device *dev, + uint32_t submission_id, + struct tu_perfetto_clocks *gpu_clocks) { - /* sync_timestamp isn't free */ + struct tu_perfetto_clocks clocks {}; + if (gpu_clocks) { + clocks = *gpu_clocks; + } + if (!u_trace_perfetto_active(tu_device_get_u_trace(dev))) - return; + return {}; + + clocks.cpu = perfetto::base::GetBootTimeNs().count(); + + if (gpu_clocks) { + /* TODO: It would be better to use CPU time that comes + * together with GPU time from the KGSL, but it's not + * equal to GetBootTimeNs. 
+ */ + + clocks.gpu_ts_offset = MAX2(gpu_timestamp_offset, clocks.gpu_ts_offset); + gpu_timestamp_offset = clocks.gpu_ts_offset; + sync_gpu_ts = clocks.gpu_ts + clocks.gpu_ts_offset; + } else { + clocks.gpu_ts = 0; + clocks.gpu_ts_offset = gpu_timestamp_offset; + + if (clocks.cpu < next_clock_sync_ns) + return clocks; + + if (tu_device_get_gpu_timestamp(dev, &clocks.gpu_ts)) { + PERFETTO_ELOG("Could not sync CPU and GPU clocks"); + return {}; + } + + clocks.gpu_ts = tu_device_ticks_to_ns(dev, clocks.gpu_ts); + + /* get cpu timestamp again because tu_device_get_gpu_timestamp can take + * >100us + */ + clocks.cpu = perfetto::base::GetBootTimeNs().count(); + + uint64_t current_suspend_count = 0; + /* If we fail to get it we will use a fallback */ + tu_device_get_suspend_count(dev, &current_suspend_count); + + /* GPU timestamp is being reset after suspend-resume cycle. + * Perfetto requires clock snapshots to be monotonic, + * so we have to fix-up the time. + */ + if (current_suspend_count != last_suspend_count) { + gpu_timestamp_offset = gpu_max_timestamp; + last_suspend_count = current_suspend_count; + } + clocks.gpu_ts_offset = gpu_timestamp_offset; + + uint64_t gpu_absolute_ts = clocks.gpu_ts + clocks.gpu_ts_offset; + + /* Fallback check, detect non-monotonic cases which would happen + * if we cannot retrieve suspend count. 
+ */ + if (sync_gpu_ts > gpu_absolute_ts) { + gpu_absolute_ts += (gpu_max_timestamp - gpu_timestamp_offset); + gpu_timestamp_offset = gpu_max_timestamp; + clocks.gpu_ts = gpu_absolute_ts - gpu_timestamp_offset; + } + + if (sync_gpu_ts > gpu_absolute_ts) { + PERFETTO_ELOG("Non-monotonic gpu timestamp detected, bailing out"); + return {}; + } + + gpu_max_timestamp = clocks.gpu_ts; + sync_gpu_ts = clocks.gpu_ts; + next_clock_sync_ns = clocks.cpu + 30000000; + } - sync_timestamp(dev); + emit_sync_timestamp(clocks.cpu, clocks.gpu_ts + clocks.gpu_ts_offset); emit_submit_id(submission_id); + return clocks; } /* diff --git a/src/freedreno/vulkan/tu_perfetto.h b/src/freedreno/vulkan/tu_perfetto.h index 922cdc6..40b6a68 100644 --- a/src/freedreno/vulkan/tu_perfetto.h +++ b/src/freedreno/vulkan/tu_perfetto.h @@ -39,7 +39,17 @@ struct tu_perfetto_state { void tu_perfetto_init(void); -void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id); +struct tu_perfetto_clocks +{ + uint64_t cpu; + uint64_t gpu_ts; + uint64_t gpu_ts_offset; +}; + +struct tu_perfetto_clocks +tu_perfetto_submit(struct tu_device *dev, + uint32_t submission_id, + struct tu_perfetto_clocks *clocks); #ifdef __cplusplus } -- 2.7.4