From 49eabb9ea6704a8a3c4ff1149ecc1bff9a81b530 Mon Sep 17 00:00:00 2001 From: Sagar Ghuge Date: Wed, 19 Jul 2023 19:04:21 -0700 Subject: [PATCH] anv: Add GPU breakpoint before/after specific draw call This change allows us to insert an MI_SEMAPHORE_WAIT before/after a specific draw call. With the GTX tool, we can always update the memory address to unblock the spinning wait. v2: - Make sure draw_call_count is thread-safe (Lionel) - Add static inline helper (Lionel) Signed-off-by: Sagar Ghuge Reviewed-by: Lionel Landwerlin Part-of: --- src/intel/vulkan/anv_device.c | 5 +++ src/intel/vulkan/anv_genX.h | 13 ++++++++ src/intel/vulkan/anv_private.h | 3 ++ src/intel/vulkan/genX_cmd_buffer.c | 60 +++++++++++++++++++++++++++++++++++ src/intel/vulkan/genX_gpu_memcpy.c | 2 ++ src/intel/vulkan/genX_simple_shader.h | 2 ++ 6 files changed, 85 insertions(+) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index a5152a1..ed6d3ab 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -3417,6 +3417,10 @@ VkResult anv_CreateDevice( device->vk.enabled_features.robustBufferAccess || device->vk.enabled_features.nullDescriptor; + device->breakpoint = anv_state_pool_alloc(&device->dynamic_state_pool, 4, + 4); + p_atomic_set(&device->draw_call_count, 0); + anv_device_init_blorp(device); anv_device_init_border_colors(device); @@ -3532,6 +3536,7 @@ void anv_DestroyDevice( anv_state_pool_free(&device->dynamic_state_pool, device->border_colors); anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash); anv_state_pool_free(&device->dynamic_state_pool, device->cps_states); + anv_state_pool_free(&device->dynamic_state_pool, device->breakpoint); #endif for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) { diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index c75981b..345920e 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -235,3 +235,16 @@ 
genX(batch_emit_pipe_control_write)(struct anv_batch *batch, struct anv_address address, uint32_t imm_data, enum anv_pipe_bits bits); + +void genX(batch_emit_breakpoint)(struct anv_batch *batch, + struct anv_device *device, + bool emit_before_draw); + +static inline void +genX(emit_breakpoint)(struct anv_batch *batch, + struct anv_device *device, + bool emit_before_draw) +{ + if (INTEL_DEBUG(DEBUG_DRAW_BKP)) + genX(batch_emit_breakpoint)(batch, device, emit_before_draw); +} diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index f16230d..30a21ca 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1295,6 +1295,9 @@ struct anv_device { struct intel_ds_device ds; nir_shader *fp64_nir; + + uint32_t draw_call_count; + struct anv_state breakpoint; }; static inline struct anv_state diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 3395bbe..8646c37 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -4306,6 +4306,33 @@ void genX(CmdPipelineBarrier2)( cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier"); } +void +genX(batch_emit_breakpoint)(struct anv_batch *batch, + struct anv_device *device, + bool emit_before_draw) +{ + /* Update draw call count once */ + uint32_t draw_count = emit_before_draw ? 
+ p_atomic_inc_return(&device->draw_call_count) : + p_atomic_read(&device->draw_call_count); + + if (((draw_count == intel_debug_bkp_before_draw_count && + emit_before_draw) || + (draw_count == intel_debug_bkp_after_draw_count && + !emit_before_draw))) { + struct anv_address wait_addr = + anv_state_pool_state_address(&device->dynamic_state_pool, + device->breakpoint); + + anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) { + sem.WaitMode = PollingMode; + sem.CompareOperation = COMPARE_SAD_EQUAL_SDD; + sem.SemaphoreDataDword = 0x1; + sem.SemaphoreAddress = wait_addr; + }; + } +} + #if GFX_VER >= 11 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED) #else @@ -4351,6 +4378,8 @@ void genX(CmdDraw)( genX(cmd_buffer_flush_gfx_state)(cmd_buffer); genX(emit_ds)(cmd_buffer); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); + anv_batch_emit(&cmd_buffer->batch, #if GFX_VER < 11 GENX(3DPRIMITIVE), @@ -4374,6 +4403,8 @@ void genX(CmdDraw)( #endif } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); + #if GFX_VERx10 == 125 genX(emit_dummy_post_sync_op)(cmd_buffer, vertexCount); #endif @@ -4417,6 +4448,8 @@ void genX(CmdDrawMultiEXT)( "draw multi", count); trace_intel_begin_draw_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = SEQUENTIAL; @@ -4427,6 +4460,8 @@ void genX(CmdDrawMultiEXT)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = 0; } + + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); trace_intel_end_draw_multi(&cmd_buffer->trace, count); } #else @@ -4445,6 +4480,8 @@ void genX(CmdDrawMultiEXT)( "draw multi", count); trace_intel_begin_draw_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); + anv_batch_emit(&cmd_buffer->batch, 
GENX(3DPRIMITIVE_EXTENDED), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; prim.VertexAccessType = SEQUENTIAL; @@ -4458,6 +4495,8 @@ void genX(CmdDrawMultiEXT)( prim.ExtendedParameter1 = firstInstance; prim.ExtendedParameter2 = i; } + + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); trace_intel_end_draw_multi(&cmd_buffer->trace, count); } #endif @@ -4509,6 +4548,7 @@ void genX(CmdDrawIndexed)( #endif genX(cmd_buffer_flush_gfx_state)(cmd_buffer); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, #if GFX_VER < 11 @@ -4533,6 +4573,8 @@ void genX(CmdDrawIndexed)( #endif } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); + #if GFX_VERx10 == 125 genX(emit_dummy_post_sync_op)(cmd_buffer, indexCount); #endif @@ -4591,6 +4633,8 @@ void genX(CmdDrawMultiIndexedEXT)( "draw indexed multi", count); trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, + true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -4602,6 +4646,8 @@ void genX(CmdDrawMultiIndexedEXT)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = *pVertexOffset; } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, + false); trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); emitted = false; } @@ -4622,6 +4668,8 @@ void genX(CmdDrawMultiIndexedEXT)( "draw indexed multi", count); trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, + true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -4633,6 +4681,8 @@ void genX(CmdDrawMultiIndexedEXT)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = *pVertexOffset; } + 
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, + false); trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); } } @@ -4649,6 +4699,7 @@ void genX(CmdDrawMultiIndexedEXT)( "draw indexed multi", count); trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -4660,6 +4711,7 @@ void genX(CmdDrawMultiIndexedEXT)( prim.StartInstanceLocation = firstInstance; prim.BaseVertexLocation = draw->vertexOffset; } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); } } @@ -4680,6 +4732,7 @@ void genX(CmdDrawMultiIndexedEXT)( "draw indexed multi", count); trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) { prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled; @@ -4695,6 +4748,7 @@ void genX(CmdDrawMultiIndexedEXT)( prim.ExtendedParameter1 = firstInstance; prim.ExtendedParameter2 = i; } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count); } #endif @@ -4804,6 +4858,7 @@ void genX(CmdDrawIndirectByteCountEXT)( mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0)); #endif + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, #if GFX_VER < 11 GENX(3DPRIMITIVE), @@ -4819,6 +4874,7 @@ void genX(CmdDrawIndirectByteCountEXT)( #endif } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); #if GFX_VERx10 == 125 genX(emit_dummy_post_sync_op)(cmd_buffer, 1); #endif @@ -4937,6 +4993,7 @@ emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer, genX(emit_hs)(cmd_buffer); 
genX(emit_ds)(cmd_buffer); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, #if GFX_VER < 11 GENX(3DPRIMITIVE), @@ -4952,6 +5009,7 @@ emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer, #endif } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); #if GFX_VERx10 == 125 genX(emit_dummy_post_sync_op)(cmd_buffer, 1); #endif @@ -5165,6 +5223,7 @@ emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer, genX(emit_hs)(cmd_buffer); genX(emit_ds)(cmd_buffer); + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); anv_batch_emit(&cmd_buffer->batch, #if GFX_VER < 11 GENX(3DPRIMITIVE), @@ -5180,6 +5239,7 @@ emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer, #endif } + genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); #if GFX_VERx10 == 125 genX(emit_dummy_post_sync_op)(cmd_buffer, 1); #endif diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c index 266af55..46f97df 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -227,6 +227,7 @@ emit_so_memcpy(struct anv_batch *batch, struct anv_device *device, so.Buffer0SurfacePitch = bs; } + genX(emit_breakpoint)(batch, device, true); anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; prim.VertexCountPerInstance = size / bs; @@ -235,6 +236,7 @@ emit_so_memcpy(struct anv_batch *batch, struct anv_device *device, prim.StartInstanceLocation = 0; prim.BaseVertexLocation = 0; } + genX(emit_breakpoint)(batch, device, false); #if GFX_VERx10 == 125 genX(batch_emit_dummy_post_sync_op)(batch, device, _3DPRIM_POINTLIST, diff --git a/src/intel/vulkan/genX_simple_shader.h b/src/intel/vulkan/genX_simple_shader.h index ac16fbe..78cbb88 100644 --- a/src/intel/vulkan/genX_simple_shader.h +++ b/src/intel/vulkan/genX_simple_shader.h @@ -450,12 +450,14 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state, } 
#endif + genX(emit_breakpoint)(batch, device, true); anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = _3DPRIM_RECTLIST; prim.VertexCountPerInstance = 3; prim.InstanceCount = 1; } + genX(emit_breakpoint)(batch, device, false); } else { const struct intel_device_info *devinfo = device->info; const struct brw_cs_prog_data *prog_data = -- 2.7.4