From: Bas Nieuwenhuizen
Date: Tue, 1 Sep 2020 19:28:16 +0000 (+0200)
Subject: radv: Record cache flushes for RGP.
X-Git-Tag: upstream/21.0.0~4968
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=78165ea3e2085491abd91df340dc822071d389b5;p=platform%2Fupstream%2Fmesa.git

radv: Record cache flushes for RGP.

Not doing the EOP TS cacheflush event because that breaks wave counting
in RGP for some reason. But the rest looks to be all there.

Reviewed-by: Samuel Pitoiset
Part-of:
---

diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 5518026..5f3de07 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -300,7 +300,7 @@ struct rgp_sqtt_marker_barrier_end {
     union {
         struct {
             uint32_t sync_cp_dma : 1;
-            uint32_t inval_ccp : 1;
+            uint32_t inval_tcp : 1;
             uint32_t inval_sqI : 1;
             uint32_t inval_sqK : 1;
             uint32_t flush_tcc : 1;
@@ -526,6 +526,38 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
     marker.num_layout_transitions = cmd_buffer->state.num_layout_transitions;
 
     /* TODO: fill pipeline stalls, cache flushes, etc */
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_WAIT_ON_EOP_TS)
+        marker.wait_on_eop_ts = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_VS_PARTIAL_FLUSH)
+        marker.vs_partial_flush = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PS_PARTIAL_FLUSH)
+        marker.ps_partial_flush = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_CS_PARTIAL_FLUSH)
+        marker.cs_partial_flush = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PFP_SYNC_ME)
+        marker.pfp_sync_me = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_SYNC_CP_DMA)
+        marker.sync_cp_dma = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_VMEM_L0)
+        marker.inval_tcp = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_ICACHE)
+        marker.inval_sqI = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_SMEM_L0)
+        marker.inval_sqK = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_L2)
+        marker.flush_tcc = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L2)
+        marker.inval_tcc = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_CB)
+        marker.flush_cb = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_CB)
+        marker.inval_cb = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_DB)
+        marker.flush_db = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_DB)
+        marker.inval_db = true;
+    if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
+        marker.inval_gl1 = true;
 
     radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker,
                                     sizeof(marker) / 4);
@@ -543,6 +575,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
         return;
 
     radv_describe_barrier_end_delayed(cmd_buffer);
+    cmd_buffer->state.sqtt_flush_bits = 0;
 
     marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
     marker.cb_id = 0;
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 243810f..70f9bea 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -606,6 +606,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
     }
 
     if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
+        enum rgp_flush_bits sqtt_flush_bits = 0;
         assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                         RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
 
@@ -617,7 +618,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
                                &cmd_buffer->gfx9_fence_idx,
                                cmd_buffer->gfx9_fence_va,
                                radv_cmd_buffer_uses_mec(cmd_buffer),
-                               flags, cmd_buffer->gfx9_eop_bug_va);
+                               flags, &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
     }
 
     if (unlikely(cmd_buffer->device->trace_bo))
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 421cac4..133a375 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -3767,6 +3767,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
     }
 
     for(int i = 0; i < 3; ++i) {
+        enum rgp_flush_bits sqtt_flush_bits = 0;
         struct radeon_cmdbuf *cs = NULL;
         cs = queue->device->ws->cs_create(queue->device->ws,
                                           queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -3832,7 +3833,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                    RADV_CMD_FLAG_INV_SCACHE |
                                    RADV_CMD_FLAG_INV_VCACHE |
                                    RADV_CMD_FLAG_INV_L2 |
-                                   RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                                   RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
         } else if (i == 1) {
             si_cs_emit_cache_flush(cs,
                                    queue->device->physical_device->rad_info.chip_class,
@@ -3843,7 +3844,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                    RADV_CMD_FLAG_INV_SCACHE |
                                    RADV_CMD_FLAG_INV_VCACHE |
                                    RADV_CMD_FLAG_INV_L2 |
-                                   RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                                   RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
         }
 
         if (queue->device->ws->cs_finalize(cs) != VK_SUCCESS)
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4de20b0..1643650 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1301,6 +1301,25 @@ struct radv_subpass_sample_locs_state {
     struct radv_sample_locations_state sample_location;
 };
 
+enum rgp_flush_bits {
+    RGP_FLUSH_WAIT_ON_EOP_TS = 0x1,
+    RGP_FLUSH_VS_PARTIAL_FLUSH = 0x2,
+    RGP_FLUSH_PS_PARTIAL_FLUSH = 0x4,
+    RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
+    RGP_FLUSH_PFP_SYNC_ME = 0x10,
+    RGP_FLUSH_SYNC_CP_DMA = 0x20,
+    RGP_FLUSH_INVAL_VMEM_L0 = 0x40,
+    RGP_FLUSH_INVAL_ICACHE = 0x80,
+    RGP_FLUSH_INVAL_SMEM_L0 = 0x100,
+    RGP_FLUSH_FLUSH_L2 = 0x200,
+    RGP_FLUSH_INVAL_L2 = 0x400,
+    RGP_FLUSH_FLUSH_CB = 0x800,
+    RGP_FLUSH_INVAL_CB = 0x1000,
+    RGP_FLUSH_FLUSH_DB = 0x2000,
+    RGP_FLUSH_INVAL_DB = 0x4000,
+    RGP_FLUSH_INVAL_L1 = 0x8000,
+};
+
 struct radv_cmd_state {
     /* Vertex descriptors */
     uint64_t vb_va;
@@ -1370,6 +1389,7 @@ struct radv_cmd_state {
     uint32_t num_events;
     uint32_t num_layout_transitions;
     bool pending_sqtt_barrier_end;
+    enum rgp_flush_bits sqtt_flush_bits;
 };
 
 struct radv_cmd_pool {
@@ -1487,6 +1507,7 @@ void si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                             uint32_t *fence_ptr, uint64_t va,
                             bool is_mec,
                             enum radv_cmd_flush_bits flush_bits,
+                            enum rgp_flush_bits *sqtt_flush_bits,
                             uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c
index a1de66e..345637c 100644
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@@ -390,6 +390,7 @@ static void
 radv_emit_wait_for_idle(struct radv_device *device,
                         struct radeon_cmdbuf *cs, int family)
 {
+    enum rgp_flush_bits sqtt_flush_bits = 0;
     si_cs_emit_cache_flush(cs,
                            device->physical_device->rad_info.chip_class,
                            NULL, 0,
                            family == RING_COMPUTE &&
@@ -400,7 +401,7 @@ radv_emit_wait_for_idle(struct radv_device *device,
                            RADV_CMD_FLAG_INV_ICACHE |
                            RADV_CMD_FLAG_INV_SCACHE |
                            RADV_CMD_FLAG_INV_VCACHE |
-                           RADV_CMD_FLAG_INV_L2, 0);
+                           RADV_CMD_FLAG_INV_L2, &sqtt_flush_bits, 0);
 }
 
 static void
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index d840457..5f77211 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1040,6 +1040,7 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                           uint64_t flush_va,
                           bool is_mec,
                           enum radv_cmd_flush_bits flush_bits,
+                          enum rgp_flush_bits *sqtt_flush_bits,
                           uint64_t gfx9_eop_bug_va)
 {
     uint32_t gcr_cntl = 0;
@@ -1048,26 +1049,38 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
     /* We don't need these. */
     assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));
 
-    if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+    if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
         gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+    }
     if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
         /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
          * to FORWARD when both L1 and L2 are written out (WB or INV).
          */
         gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
     }
-    if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+    if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
         gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;
+    }
     if (flush_bits & RADV_CMD_FLAG_INV_L2) {
         /* Writeback and invalidate everything in L2. */
         gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
                     S_586_GLM_INV(1) | S_586_GLM_WB(1);
+
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
     } else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
         /* Writeback but do not invalidate.
          * GLM doesn't support WB alone. If WB is set, INV must be set too.
          */
         gcr_cntl |= S_586_GL2_WB(1) |
                     S_586_GLM_WB(1) | S_586_GLM_INV(1);
+
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2;
     }
 
     /* TODO: Implement this new flag for GFX9+.
@@ -1082,6 +1095,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
                         EVENT_INDEX(0));
+
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
     }
 
     /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
@@ -1090,6 +1105,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
                         EVENT_INDEX(0));
+
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
     }
 
     /* First flush CB/DB, then L1/L2. */
@@ -1110,15 +1127,21 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
             radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
             radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+            *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
         } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
             radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
             radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+            *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
         }
     }
 
     if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
+
+        *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
     }
 
     if (cb_db_event) {
@@ -1197,6 +1220,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         /* We need to ensure that PFP waits as well. */
         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
         radeon_emit(cs, 0);
+
+        *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
     }
 
     if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
@@ -1217,6 +1242,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits,
+                       enum rgp_flush_bits *sqtt_flush_bits,
                        uint64_t gfx9_eop_bug_va)
 {
     unsigned cp_coher_cntl = 0;
@@ -1226,14 +1252,19 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
     if (chip_class >= GFX10) {
         /* GFX10 cache flush handling is quite different. */
         gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
-                                  is_mec, flush_bits, gfx9_eop_bug_va);
+                                  is_mec, flush_bits, sqtt_flush_bits,
+                                  gfx9_eop_bug_va);
         return;
     }
 
-    if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+    if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
         cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-    if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+    }
+    if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
         cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
+    }
 
     if (chip_class <= GFX8) {
         if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
@@ -1259,34 +1290,48 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                            0, 0,
                                            gfx9_eop_bug_va);
             }
+
+            *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
         }
         if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
             cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                              S_0085F0_DB_DEST_BASE_ENA(1);
+
+            *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
         }
     }
 
     if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
     }
 
     if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
     }
 
     if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+        *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
     } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+        *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
     }
 
     if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
         radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+        *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
     }
 
     if (chip_class == GFX9 && flush_cb_db) {
@@ -1310,6 +1355,9 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         tc_flags = EVENT_TC_ACTION_ENA |
                    EVENT_TC_MD_ACTION_ENA;
 
+        *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB |
+                            RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
+
         /* Ideally flush TC together with CB/DB. */
         if (flush_bits & RADV_CMD_FLAG_INV_L2) {
             /* Writeback and invalidate everything in L2 & L1. */
@@ -1321,6 +1369,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
             flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
                             RADV_CMD_FLAG_WB_L2 |
                             RADV_CMD_FLAG_INV_VCACHE);
+
+            *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
         }
         assert(flush_cnt);
         (*flush_cnt)++;
@@ -1357,6 +1407,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
         !is_mec) {
         radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
         radeon_emit(cs, 0);
+
+        *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
     }
 
     if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
@@ -1367,6 +1419,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                             S_0085F0_TCL1_ACTION_ENA(1) |
                             S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
         cp_coher_cntl = 0;
+
+        *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
     } else {
         if(flush_bits & RADV_CMD_FLAG_WB_L2) {
             /* WB = write-back
@@ -1381,6 +1435,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                 S_0301F0_TC_WB_ACTION_ENA(1) |
                                 S_0301F0_TC_NC_ACTION_ENA(1));
             cp_coher_cntl = 0;
+
+            *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
         }
         if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
             si_emit_acquire_mem(cs, is_mec,
@@ -1388,6 +1444,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                 cp_coher_cntl |
                                 S_0085F0_TCL1_ACTION_ENA(1));
             cp_coher_cntl = 0;
+
+            *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
         }
     }
 
@@ -1437,6 +1495,7 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                            cmd_buffer->gfx9_fence_va,
                            radv_cmd_buffer_uses_mec(cmd_buffer),
                            cmd_buffer->state.flush_bits,
+                           &cmd_buffer->state.sqtt_flush_bits,
                            cmd_buffer->gfx9_eop_bug_va);
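For readers unfamiliar with the RADV internals, the bookkeeping pattern the patch introduces can be condensed into a small standalone C sketch: the cache-flush emitter reports what it actually did through an sqtt_flush_bits out-parameter, the command buffer accumulates those bits, and the RGP barrier-end marker is filled from (and the barrier-start path clears) the accumulated value. The names emit_cache_flush, fill_barrier_end and the 0x1/0x2 driver flags below are simplified stand-ins, not RADV functions or flags; only the RGP_FLUSH_* values mirror the patch.

    /* Standalone illustration of the out-parameter bookkeeping pattern.
     * Not part of the patch; names other than RGP_FLUSH_* are hypothetical.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum rgp_flush_bits {
        RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
        RGP_FLUSH_INVAL_ICACHE = 0x80,
        RGP_FLUSH_INVAL_L2 = 0x400,
    };

    /* Stand-in for si_cs_emit_cache_flush(): emit the flush and report
     * what was flushed through the out-parameter. */
    static void emit_cache_flush(uint32_t driver_flush_flags,
                                 enum rgp_flush_bits *sqtt_flush_bits)
    {
        if (driver_flush_flags & 0x1) /* e.g. an INV_ICACHE driver flag */
            *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
        if (driver_flush_flags & 0x2) /* e.g. an INV_L2 driver flag */
            *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
    }

    /* Stand-in for radv_describe_barrier_end_delayed(): translate the
     * accumulated bits into RGP marker fields. */
    static void fill_barrier_end(enum rgp_flush_bits accumulated)
    {
        bool inval_sqI = accumulated & RGP_FLUSH_INVAL_ICACHE;
        bool inval_tcc = accumulated & RGP_FLUSH_INVAL_L2;
        printf("marker: inval_sqI=%d inval_tcc=%d\n", inval_sqI, inval_tcc);
    }

    int main(void)
    {
        /* Plays the role of cmd_buffer->state.sqtt_flush_bits. */
        enum rgp_flush_bits accumulated = 0;

        emit_cache_flush(0x1 | 0x2, &accumulated);
        fill_barrier_end(accumulated);

        /* Barrier start resets the accumulator, as
         * radv_describe_barrier_start() does in the patch. */
        accumulated = 0;
        return 0;
    }

Callers that do not record markers (the preamble command streams, the SQTT wait-for-idle helper, the debug sync path) pass a throwaway local in the patch, while si_emit_cache_flush() threads &cmd_buffer->state.sqtt_flush_bits so the next barrier-end marker can report the flushes.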