union {
struct {
uint32_t sync_cp_dma : 1;
- uint32_t inval_ccp : 1;
+ uint32_t inval_tcp : 1;
uint32_t inval_sqI : 1;
uint32_t inval_sqK : 1;
uint32_t flush_tcc : 1;
marker.num_layout_transitions = cmd_buffer->state.num_layout_transitions;
- /* TODO: fill pipeline stalls, cache flushes, etc */
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_WAIT_ON_EOP_TS)
+ marker.wait_on_eop_ts = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_VS_PARTIAL_FLUSH)
+ marker.vs_partial_flush = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PS_PARTIAL_FLUSH)
+ marker.ps_partial_flush = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_CS_PARTIAL_FLUSH)
+ marker.cs_partial_flush = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_PFP_SYNC_ME)
+ marker.pfp_sync_me = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_SYNC_CP_DMA)
+ marker.sync_cp_dma = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_VMEM_L0)
+ marker.inval_tcp = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_ICACHE)
+ marker.inval_sqI = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_SMEM_L0)
+ marker.inval_sqK = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_L2)
+ marker.flush_tcc = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L2)
+ marker.inval_tcc = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_CB)
+ marker.flush_cb = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_CB)
+ marker.inval_cb = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_FLUSH_DB)
+ marker.flush_db = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_DB)
+ marker.inval_db = true;
+ if (cmd_buffer->state.sqtt_flush_bits & RGP_FLUSH_INVAL_L1)
+ marker.inval_gl1 = true;
radv_emit_thread_trace_userdata(cmd_buffer->device, cs, &marker, sizeof(marker) / 4);
return;
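/* A minimal editorial sketch, not part of the patch: the flag-to-field
 * translation above is mechanical, so a hypothetical helper macro
 * (RGP_SET_IF below is invented for illustration; every other name is
 * taken from this change) could express the same mapping more compactly:
 */
#define RGP_SET_IF(flag, field) do {                      \
      if (cmd_buffer->state.sqtt_flush_bits & (flag))     \
         marker.field = true;                             \
   } while (0)

RGP_SET_IF(RGP_FLUSH_WAIT_ON_EOP_TS, wait_on_eop_ts);
RGP_SET_IF(RGP_FLUSH_VS_PARTIAL_FLUSH, vs_partial_flush);
RGP_SET_IF(RGP_FLUSH_SYNC_CP_DMA, sync_cp_dma);
/* ...and likewise for the remaining RGP_FLUSH_* bits. */
#undef RGP_SET_IF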
radv_describe_barrier_end_delayed(cmd_buffer);
+ cmd_buffer->state.sqtt_flush_bits = 0;
marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
marker.cb_id = 0;
struct radv_sample_locations_state sample_location;
};
+enum rgp_flush_bits {
+ RGP_FLUSH_WAIT_ON_EOP_TS = 0x1,
+ RGP_FLUSH_VS_PARTIAL_FLUSH = 0x2,
+ RGP_FLUSH_PS_PARTIAL_FLUSH = 0x4,
+ RGP_FLUSH_CS_PARTIAL_FLUSH = 0x8,
+ RGP_FLUSH_PFP_SYNC_ME = 0x10,
+ RGP_FLUSH_SYNC_CP_DMA = 0x20,
+ RGP_FLUSH_INVAL_VMEM_L0 = 0x40,
+ RGP_FLUSH_INVAL_ICACHE = 0x80,
+ RGP_FLUSH_INVAL_SMEM_L0 = 0x100,
+ RGP_FLUSH_FLUSH_L2 = 0x200,
+ RGP_FLUSH_INVAL_L2 = 0x400,
+ RGP_FLUSH_FLUSH_CB = 0x800,
+ RGP_FLUSH_INVAL_CB = 0x1000,
+ RGP_FLUSH_FLUSH_DB = 0x2000,
+ RGP_FLUSH_INVAL_DB = 0x4000,
+ RGP_FLUSH_INVAL_L1 = 0x8000,
+};
+
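/* Each enumerator above is a distinct power of two, so any combination of
 * flush events can be OR-ed into one accumulator and tested independently.
 * A minimal standalone illustration (the local variable is invented for
 * the example):
 */
enum rgp_flush_bits bits = 0;
bits |= RGP_FLUSH_INVAL_ICACHE | RGP_FLUSH_CS_PARTIAL_FLUSH;
assert(bits & RGP_FLUSH_INVAL_ICACHE);  /* recorded */
assert(!(bits & RGP_FLUSH_FLUSH_L2));   /* not recorded */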
struct radv_cmd_state {
/* Vertex descriptors */
uint64_t vb_va;
uint32_t num_events;
uint32_t num_layout_transitions;
bool pending_sqtt_barrier_end;
+ enum rgp_flush_bits sqtt_flush_bits;
};
struct radv_cmd_pool {
uint32_t *fence_ptr, uint64_t va,
bool is_mec,
enum radv_cmd_flush_bits flush_bits,
+ enum rgp_flush_bits *sqtt_flush_bits,
uint64_t gfx9_eop_bug_va);
void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
uint64_t flush_va,
bool is_mec,
enum radv_cmd_flush_bits flush_bits,
+ enum rgp_flush_bits *sqtt_flush_bits,
uint64_t gfx9_eop_bug_va)
{
uint32_t gcr_cntl = 0;
/* We don't need these. */
assert(!(flush_bits & (RADV_CMD_FLAG_VGT_STREAMOUT_SYNC)));
- if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+ if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+ }
if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
/* TODO: When writing to the SMEM L1 cache, we need to set SEQ
* to FORWARD when both L1 and L2 are written out (WB or INV).
*/
gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
}
- if (flush_bits & RADV_CMD_FLAG_INV_VCACHE)
+ if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0 | RGP_FLUSH_INVAL_L1;
+ }
if (flush_bits & RADV_CMD_FLAG_INV_L2) {
/* Writeback and invalidate everything in L2. */
gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) |
S_586_GLM_INV(1) | S_586_GLM_WB(1);
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
} else if (flush_bits & RADV_CMD_FLAG_WB_L2) {
/* Writeback but do not invalidate.
* GLM doesn't support WB alone. If WB is set, INV must be set too.
*/
gcr_cntl |= S_586_GL2_WB(1) |
S_586_GLM_WB(1) | S_586_GLM_INV(1);
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2;
}
/* TODO: Implement this new flag for GFX9+.
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
EVENT_INDEX(0));
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
}
/* TODO: trigger on RADV_CMD_FLAG_FLUSH_AND_INV_DB_META ? */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
EVENT_INDEX(0));
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
}
/* First flush CB/DB, then L1/L2. */
if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
}
}
if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
}
if (cb_db_event) {
/* We need to ensure that PFP waits as well. */
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
+
+ *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
}
if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
uint64_t flush_va,
bool is_mec,
enum radv_cmd_flush_bits flush_bits,
+ enum rgp_flush_bits *sqtt_flush_bits,
uint64_t gfx9_eop_bug_va)
{
unsigned cp_coher_cntl = 0;
if (chip_class >= GFX10) {
/* GFX10 cache flush handling is quite different. */
gfx10_cs_emit_cache_flush(cs, chip_class, flush_cnt, flush_va,
- is_mec, flush_bits, gfx9_eop_bug_va);
+ is_mec, flush_bits, sqtt_flush_bits,
+ gfx9_eop_bug_va);
return;
}
- if (flush_bits & RADV_CMD_FLAG_INV_ICACHE)
+ if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) {
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
- if (flush_bits & RADV_CMD_FLAG_INV_SCACHE)
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_ICACHE;
+ }
+ if (flush_bits & RADV_CMD_FLAG_INV_SCACHE) {
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_SMEM_L0;
+ }
if (chip_class <= GFX8) {
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
0, 0,
gfx9_eop_bug_va);
}
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
S_0085F0_DB_DEST_BASE_ENA(1);
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
}
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB;
}
if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
}
if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_PS_PARTIAL_FLUSH;
} else if (flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_VS_PARTIAL_FLUSH;
}
if (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+
+ *sqtt_flush_bits |= RGP_FLUSH_CS_PARTIAL_FLUSH;
}
if (chip_class == GFX9 && flush_cb_db) {
tc_flags = EVENT_TC_ACTION_ENA |
EVENT_TC_MD_ACTION_ENA;
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_CB | RGP_FLUSH_INVAL_CB |
+ RGP_FLUSH_FLUSH_DB | RGP_FLUSH_INVAL_DB;
+
/* Ideally flush TC together with CB/DB. */
if (flush_bits & RADV_CMD_FLAG_INV_L2) {
/* Writeback and invalidate everything in L2 & L1. */
flush_bits &= ~(RADV_CMD_FLAG_INV_L2 |
RADV_CMD_FLAG_WB_L2 |
RADV_CMD_FLAG_INV_VCACHE);
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2;
}
assert(flush_cnt);
(*flush_cnt)++;
!is_mec) {
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
+
+ *sqtt_flush_bits |= RGP_FLUSH_PFP_SYNC_ME;
}
if ((flush_bits & RADV_CMD_FLAG_INV_L2) ||
S_0085F0_TCL1_ACTION_ENA(1) |
S_0301F0_TC_WB_ACTION_ENA(chip_class >= GFX8));
cp_coher_cntl = 0;
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_L2 | RGP_FLUSH_INVAL_VMEM_L0;
} else {
if (flush_bits & RADV_CMD_FLAG_WB_L2) {
/* WB = write-back
S_0301F0_TC_WB_ACTION_ENA(1) |
S_0301F0_TC_NC_ACTION_ENA(1));
cp_coher_cntl = 0;
+
+ *sqtt_flush_bits |= RGP_FLUSH_FLUSH_L2 | RGP_FLUSH_INVAL_VMEM_L0;
}
if (flush_bits & RADV_CMD_FLAG_INV_VCACHE) {
si_emit_acquire_mem(cs, is_mec,
cp_coher_cntl |
S_0085F0_TCL1_ACTION_ENA(1));
cp_coher_cntl = 0;
+
+ *sqtt_flush_bits |= RGP_FLUSH_INVAL_VMEM_L0;
}
}
cmd_buffer->gfx9_fence_va,
radv_cmd_buffer_uses_mec(cmd_buffer),
cmd_buffer->state.flush_bits,
+ &cmd_buffer->state.sqtt_flush_bits,
cmd_buffer->gfx9_eop_bug_va);
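/* A rough sketch of the overall flow this change creates, assuming the
 * usual RADV barrier path (radv_describe_barrier_start is not shown in
 * this excerpt, and barrier-end markers can be delayed until the flushes
 * are actually emitted):
 *
 *   vkCmdPipelineBarrier()
 *     -> radv_describe_barrier_start()          BARRIER_START marker
 *     -> si_emit_cache_flush()
 *          -> si_cs_emit_cache_flush(..., &cmd_buffer->state.sqtt_flush_bits, ...)
 *             ORs one RGP_FLUSH_* bit per stall/flush it programs
 *     -> radv_describe_barrier_end()
 *          -> radv_describe_barrier_end_delayed() copies the accumulated
 *             bits into the BARRIER_END marker's bitfields
 *     -> cmd_buffer->state.sqtt_flush_bits = 0, so the next barrier
 *        starts from a clean accumulator
 */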