radv: Record cache flushes for RGP.
author    Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tue, 1 Sep 2020 19:28:16 +0000 (21:28 +0200)
committer Marge Bot <eric+marge@anholt.net>
Mon, 28 Sep 2020 15:46:08 +0000 (15:46 +0000)
Not recording the EOP TS cache-flush event because that breaks wave counting
in RGP for some reason, but the rest should all be there.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6550>
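
The change follows an accumulate-then-consume pattern: each cache-flush
emitter ORs the flushes it actually emits into a caller-provided mask, and
the delayed barrier-end marker later translates the accumulated mask into
RGP marker fields before the next barrier clears it. A minimal sketch of
that flow (names are taken from the patch; control flow is simplified for
illustration):

    /* Emitter side (si_cmd_buffer.c): record each flush as it is emitted. */
    enum rgp_flush_bits recorded = 0;
    if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
            recorded |= RGP_FLUSH_INVAL_ICACHE;

    /* Consumer side (radv_sqtt_layer.c): translate the accumulated bits
     * into the barrier-end marker that RGP parses. */
    if (recorded & RGP_FLUSH_INVAL_ICACHE)
            marker.inval_sqI = true;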

src/amd/vulkan/layers/radv_sqtt_layer.c
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_sqtt.c
src/amd/vulkan/si_cmd_buffer.c

diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index 5518026..5f3de07 100644
@@ -300,7 +300,7 @@ struct rgp_sqtt_marker_barrier_end {
        union {
                struct {
                        uint32_t sync_cp_dma : 1;
-                       uint32_t inval_ccp : 1;
+                       uint32_t inval_tcp : 1;
                        uint32_t inval_sqI : 1;
                        uint32_t inval_sqK : 1;
                        uint32_t flush_tcc : 1;
@@ -526,6 +526,38 @@ radv_describe_barrier_end_delayed(struct radv_cmd_buffer *cmd_buffer)
        marker.num_layout_transitions = cmd_buffer->state.num_layout_transitions;
 
        /* TODO: fill pipeline stalls, cache flushes, etc */
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_WAIT_ON_EOP_TS)
+               marker.wait_on_eop_ts = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_VS_PARTIAL_FLUSH)
+               marker.vs_partial_flush = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PS_PARTIAL_FLUSH)
+               marker.ps_partial_flush = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_CS_PARTIAL_FLUSH)
+               marker.cs_partial_flush = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PFP_SYNC_ME)
+               marker.pfp_sync_me = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_SYNC_CP_DMA)
+               marker.sync_cp_dma = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_VMEM_L0)
+               marker.inval_tcp = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_ICACHE)
+               marker.inval_sqI = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_SMEM_L0)
+               marker.inval_sqK = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_L2)
+               marker.flush_tcc = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L2)
+               marker.inval_tcc = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_CB)
+               marker.flush_cb = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_CB)
+               marker.inval_cb = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_DB)
+               marker.flush_db = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_DB)
+               marker.inval_db = true;
+       if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
+               marker.inval_gl1 = true;
 
        radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
 
@@ -543,6 +575,7 @@ radv_describe_barrier_start(struct radv_cmd_buffer *cmd_buffer,
                return;
 
        radv_describe_barrier_end_delayed(cmd_buffer);
+       cmd_buffer->state.sqtt_flush_bits = 0;
 
        marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
        marker.cb_id = 0;
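
Since every branch in the if-chain above tests one input bit and sets one
marker field, the mapping could also be condensed with a small helper
macro; a hypothetical sketch (the macro is not part of the patch, the flag
and field pairs are):

    #define RADV_SET_IF_FLUSHED(rgp_bit, field)                        \
            do {                                                       \
                    if (cmd_buffer->state.sqtt_flush_bits & (rgp_bit)) \
                            marker.field = true;                       \
            } while (0)

    RADV_SET_IF_FLUSHED(RGP_FLUSH_WAIT_ON_EOP_TS, wait_on_eop_ts);
    RADV_SET_IF_FLUSHED(RGP_FLUSH_INVAL_VMEM_L0, inval_tcp);
    /* ...one invocation per rgp_flush_bits value... */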
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 243810f..70f9bea 100644
@@ -606,6 +606,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
        }
 
        if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
+               enum rgp_flush_bits sqtt_flush_bits = 0;
                assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
                                RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
 
@@ -617,7 +618,7 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer,
                                       &cmd_buffer->gfx9_fence_idx,
                                       cmd_buffer->gfx9_fence_va,
                                       radv_cmd_buffer_uses_mec(cmd_buffer),
-                                      flags, cmd_buffer->gfx9_eop_bug_va);
+                                      flags, &sqtt_flush_bits, cmd_buffer->gfx9_eop_bug_va);
        }
 
        if (unlikely(cmd_buffer->device->trace_bo))
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 421cac4..133a375 100644
@@ -3767,6 +3767,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
        }
 
        for(int i = 0; i < 3; ++i) {
+               enum rgp_flush_bits sqtt_flush_bits = 0;
                struct radeon_cmdbuf *cs = NULL;
                cs = queue->device->ws->cs_create(queue->device->ws,
                                                  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
@@ -3832,7 +3833,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                               RADV_CMD_FLAG_INV_SCACHE |
                                               RADV_CMD_FLAG_INV_VCACHE |
                                               RADV_CMD_FLAG_INV_L2 |
-                                              RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                                              RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
                } else if (i == 1) {
                        si_cs_emit_cache_flush(cs,
                                               queue->device->physical_device->rad_info.chip_class,
@@ -3843,7 +3844,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
                                               RADV_CMD_FLAG_INV_SCACHE |
                                               RADV_CMD_FLAG_INV_VCACHE |
                                               RADV_CMD_FLAG_INV_L2 |
-                                              RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
+                                              RADV_CMD_FLAG_START_PIPELINE_STATS, &sqtt_flush_bits, 0);
                }
 
                if (queue->device->ws->cs_finalize(cs) != VK_SUCCESS)
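
In both hunks above (the RADV_DEBUG_SYNC_SHADERS path and the queue
preambles), the new mask is a local that is never read back, so
driver-internal flushes are deliberately not attributed to application
barriers. A sketch of that discard pattern, mirroring the calls above:

    /* Internal flush: record into a throwaway local so these flushes do
     * not leak into the next barrier-end marker. */
    enum rgp_flush_bits sqtt_flush_bits = 0;
    si_cs_emit_cache_flush(cs, chip_class, NULL, 0, is_mec,
                           RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_L2,
                           &sqtt_flush_bits, 0);
    /* sqtt_flush_bits is intentionally discarded here. */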
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4de20b0..1643650 100644
@@ -1301,6 +1301,25 @@ struct radv_subpass_sample_locs_state {
        struct radv_sample_locations_state sample_location;
 };
 
+enum rgp_flush_bits {
+       RGP_FLUSH_WAIT_ON_EOP_TS   = 0x1,
+       RGP_FLUSH_VS_PARTIAL_FLUSH = 0x2,
+       RGP_FLUSH_PS_PARTIAL_FLUSH = 0x4,
+       RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
+       RGP_FLUSH_PFP_SYNC_ME      = 0x10,
+       RGP_FLUSH_SYNC_CP_DMA      = 0x20,
+       RGP_FLUSH_INVAL_VMEM_L0    = 0x40,
+       RGP_FLUSH_INVAL_ICACHE     = 0x80,
+       RGP_FLUSH_INVAL_SMEM_L0    = 0x100,
+       RGP_FLUSH_FLUSH_L2         = 0x200,
+       RGP_FLUSH_INVAL_L2         = 0x400,
+       RGP_FLUSH_FLUSH_CB         = 0x800,
+       RGP_FLUSH_INVAL_CB         = 0x1000,
+       RGP_FLUSH_FLUSH_DB         = 0x2000,
+       RGP_FLUSH_INVAL_DB         = 0x4000,
+       RGP_FLUSH_INVAL_L1         = 0x8000,
+};
+
 struct radv_cmd_state {
        /* Vertex descriptors */
        uint64_t                                      vb_va;
@@ -1370,6 +1389,7 @@ struct radv_cmd_state {
        uint32_t num_events;
        uint32_t num_layout_transitions;
        bool pending_sqtt_barrier_end;
+       enum rgp_flush_bits sqtt_flush_bits;
 };
 
 struct radv_cmd_pool {
@@ -1487,6 +1507,7 @@ void si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                            uint32_t *fence_ptr, uint64_t va,
                            bool is_mec,
                            enum radv_cmd_flush_bits flush_bits,
+                           enum rgp_flush_bits *sqtt_flush_bits,
                            uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
 void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
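
The RGP bits are finer-grained than the driver's flush flags, so a single
RADV_CMD_FLAG_* value can set several of them; on GFX10, for instance,
invalidating the vector cache also invalidates GL1 (see the si_cmd_buffer.c
hunks below). An illustrative combination:

    /* One driver-level flush can stand for multiple RGP-level ones; this
     * pairing matches the GFX10 INV_VCACHE path below. */
    enum rgp_flush_bits bits = 0;
    bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;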
diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c
index a1de66e..345637c 100644
@@ -390,6 +390,7 @@ static void
 radv_emit_wait_for_idle(struct radv_device *device,
                        struct radeon_cmdbuf *cs, int family)
 {
+       enum rgp_flush_bits sqtt_flush_bits = 0;
        si_cs_emit_cache_flush(cs, device->physical_device->rad_info.chip_class,
                               NULL, 0,
                               family == RING_COMPUTE &&
@@ -400,7 +401,7 @@ radv_emit_wait_for_idle(struct radv_device *device,
                               RADV_CMD_FLAG_INV_ICACHE |
                               RADV_CMD_FLAG_INV_SCACHE |
                               RADV_CMD_FLAG_INV_VCACHE |
-                              RADV_CMD_FLAG_INV_L2, 0);
+                              RADV_CMD_FLAG_INV_L2, &sqtt_flush_bits, 0);
 }
 
 static void
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index d840457..5f77211 100644
@@ -1040,6 +1040,7 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                          uint64_t flush_va,
                          bool is_mec,
                          enum radv_cmd_flush_bits flush_bits,
+                         enum rgp_flush_bits *sqtt_flush_bits,
                          uint64_t gfx9_eop_bug_va)
 {
        uint32_t gcr_cntl = 0;
@@ -1048,26 +1049,38 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
        /* We don't need these. */
        assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));
 
-       if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+       if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
                gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+       }
        if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
                /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
                 * to FORWARD when both L1 and L2 are written out (WB or INV).
                 */
                gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
        }
-       if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+       if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
                gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;
+       }
        if (flush_bits & RADV_CMD_FLAG_INV_L2) {
                /* Writeback and invalidate everything in L2. */
                gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
                            S_586_GLM_INV(1) | S_586_GLM_WB(1);
+
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
        } else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
                /* Writeback but do not invalidate.
                 * GLM doesn't support WB alone. If WB is set, INV must be set too.
                 */
                gcr_cntl |= S_586_GL2_WB(1) |
                            S_586_GLM_WB(1) | S_586_GLM_INV(1);
+
+               *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2;
        }
 
        /* TODO: Implement this new flag for GFX9+.
@@ -1082,6 +1095,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
                                        EVENT_INDEX(0));
+
+                       *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
                }
 
                /* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
@@ -1090,6 +1105,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
                                        EVENT_INDEX(0));
+
+                       *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
                }
 
                /* First flush CB/DB, then L1/L2. */
@@ -1110,15 +1127,21 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+                       *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
                } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
                        radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                        radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+                       *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
                }
        }
 
        if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+               *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
        }
 
        if (cb_db_event) {
@@ -1197,6 +1220,8 @@ gfx10_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                /* We need to ensure that PFP waits as well. */
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
+
+               *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
        }
 
        if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
@@ -1217,6 +1242,7 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                       uint64_t flush_va,
                        bool is_mec,
                        enum radv_cmd_flush_bits flush_bits,
+                      enum rgp_flush_bits *sqtt_flush_bits,
                       uint64_t gfx9_eop_bug_va)
 {
        unsigned cp_coher_cntl = 0;
@@ -1226,14 +1252,19 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
        if (chip_class >= GFX10) {
                /* GFX10 cache flush handling is quite different. */
                gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
-                                         is_mec, flush_bits, gfx9_eop_bug_va);
+                                         is_mec, flush_bits, sqtt_flush_bits,
+                                         gfx9_eop_bug_va);
                return;
        }
 
-       if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+       if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
                cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-       if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+       }
+       if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
                cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
+       }
 
        if (chip_class <= GFX8) {
                if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
@@ -1259,34 +1290,48 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                                           0, 0,
                                                           gfx9_eop_bug_va);
                        }
+
+                       *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
                }
                if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
                        cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
                                S_0085F0_DB_DEST_BASE_ENA(1);
+
+                       *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
                }
        }
 
        if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+
+               *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
        }
 
        if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+
+               *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
        }
 
        if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+               *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
        } else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+               *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
        }
 
        if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
                radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+               *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
        }
 
        if (chip_class == GFX9 && flush_cb_db) {
@@ -1310,6 +1355,9 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                tc_flags = EVENT_TC_ACTION_ENA |
                           EVENT_TC_MD_ACTION_ENA;
 
+               *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB |
+                                   RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
+
                /* Ideally flush TC together with CB/DB. */
                if (flush_bits & RADV_CMD_FLAG_INV_L2) {
                        /* Writeback and invalidate everything in L2 & L1. */
@@ -1321,6 +1369,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
                                         RADV_CMD_FLAG_WB_L2 |
                                         RADV_CMD_FLAG_INV_VCACHE);
+
+                       *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
                }
                assert(flush_cnt);
                (*flush_cnt)++;
@@ -1357,6 +1407,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
            !is_mec) {
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
+
+               *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
        }
 
        if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
@@ -1367,6 +1419,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                    S_0085F0_TCL1_ACTION_ENA(1) |
                                    S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
                cp_coher_cntl = 0;
+
+               *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
        } else {
                if(flush_bits & RADV_CMD_FLAG_WB_L2) {
                        /* WB = write-back
@@ -1381,6 +1435,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                            S_0301F0_TC_WB_ACTION_ENA(1) |
                                            S_0301F0_TC_NC_ACTION_ENA(1));
                        cp_coher_cntl = 0;
+
+                       *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
                }
                if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
                        si_emit_acquire_mem(cs, is_mec,
@@ -1388,6 +1444,8 @@ si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                                            cp_coher_cntl |
                                            S_0085F0_TCL1_ACTION_ENA(1));
                        cp_coher_cntl = 0;
+
+                       *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
                }
        }
 
@@ -1437,6 +1495,7 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
                               cmd_buffer->gfx9_fence_va,
                               radv_cmd_buffer_uses_mec(cmd_buffer),
                               cmd_buffer->state.flush_bits,
+                              &cmd_buffer->state.sqtt_flush_bits,
                               cmd_buffer->gfx9_eop_bug_va);
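
Pieced together, the per-command-buffer mask has a simple lifecycle across
the hunks above: radv_describe_barrier_start() resets
cmd_buffer->state.sqtt_flush_bits to zero, si_emit_cache_flush() ORs in
RGP_FLUSH_* bits as the corresponding packets are emitted, and
radv_describe_barrier_end_delayed() copies the accumulated bits into the
rgp_sqtt_marker_barrier_end marker that RGP reads back.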