From c2fbeb7ca057b3bee8c8cd0f7076af2b90d28111 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 6 Jun 2017 09:01:48 +1000 Subject: [PATCH] radv: add GFX9 cache flushing support. GFX9 needs to write event EOP to a fence buffer, allocate some space for this, and just write an ever increasing number to it, this isn't exactly what radeonsi does, but it seems to work. Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_cmd_buffer.c | 8 ++ src/amd/vulkan/radv_device.c | 3 + src/amd/vulkan/radv_private.h | 10 ++- src/amd/vulkan/si_cmd_buffer.c | 175 ++++++++++++++++++++++++++++----------- 4 files changed, 145 insertions(+), 51 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index d66f897..d078421 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -234,6 +234,14 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) cmd_buffer->record_fail = false; cmd_buffer->ring_offsets_idx = -1; + + if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) { + void *fence_ptr; + radv_cmd_buffer_upload_alloc(cmd_buffer, 8, 0, + &cmd_buffer->gfx9_fence_offset, + &fence_ptr); + cmd_buffer->gfx9_fence_bo = cmd_buffer->upload.upload_bo; + } } static bool diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index ca42ab8..9d510ea 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1103,6 +1103,7 @@ VkResult radv_CreateDevice( case RADV_QUEUE_COMPUTE: si_cs_emit_cache_flush(device->flush_cs[family], device->physical_device->rad_info.chip_class, + NULL, 0, family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SMEM_L1 | @@ -1118,6 +1119,7 @@ VkResult radv_CreateDevice( case RADV_QUEUE_COMPUTE: si_cs_emit_cache_flush(device->flush_shader_cs[family], device->physical_device->rad_info.chip_class, + NULL, 0, family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= CIK, family == RADV_QUEUE_COMPUTE ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH) | RADV_CMD_FLAG_INV_ICACHE | @@ -1763,6 +1765,7 @@ radv_get_preamble_cs(struct radv_queue *queue, if (!i) { si_cs_emit_cache_flush(cs, queue->device->physical_device->rad_info.chip_class, + NULL, 0, queue->queue_family_index == RING_COMPUTE && queue->device->physical_device->rad_info.chip_class >= CIK, RADV_CMD_FLAG_INV_ICACHE | diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index e1b9a29..6a6c1e2 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -822,6 +822,9 @@ struct radv_cmd_buffer { bool record_fail; int ring_offsets_idx; /* just used for verification */ + uint32_t gfx9_fence_offset; + struct radeon_winsys_bo *gfx9_fence_bo; + uint32_t gfx9_fence_idx; }; struct radv_image; @@ -854,9 +857,10 @@ void si_emit_wait_fence(struct radeon_winsys_cs *cs, uint64_t va, uint32_t ref, uint32_t mask); void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, - enum chip_class chip_class, - bool is_mec, - enum radv_cmd_flush_bits flush_bits); + enum chip_class chip_class, + uint32_t *fence_ptr, uint64_t va, + bool is_mec, + enum radv_cmd_flush_bits flush_bits); void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer); void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dest_va, diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index eda24be..3e0b8ee 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -823,15 +823,18 @@ void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs, unsigned op = EVENT_TYPE(event) | EVENT_INDEX(5) | event_flags; + unsigned is_gfx8_mec = is_mec && chip_class < GFX9; - if (is_mec) { - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0)); + if (chip_class >= GFX9 || is_gfx8_mec) { + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, 0)); radeon_emit(cs, op); radeon_emit(cs, EOP_DATA_SEL(data_sel)); radeon_emit(cs, va); /* address lo */ radeon_emit(cs, va >> 32); /* address hi */ radeon_emit(cs, new_fence); /* immediate data lo */ radeon_emit(cs, 0); /* immediate data hi */ + if (!is_gfx8_mec) + radeon_emit(cs, 0); /* unused */ } else { if (chip_class == CIK || chip_class == VI) { @@ -872,15 +875,16 @@ si_emit_wait_fence(struct radeon_winsys_cs *cs, static void si_emit_acquire_mem(struct radeon_winsys_cs *cs, - bool is_mec, + bool is_mec, bool is_gfx9, unsigned cp_coher_cntl) { - if (is_mec) { + if (is_mec || is_gfx9) { + uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff; radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0) | - PKT3_SHADER_TYPE_S(1)); + PKT3_SHADER_TYPE_S(is_mec)); radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, hi_val); /* CP_COHER_SIZE_HI */ radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ @@ -897,40 +901,45 @@ si_emit_acquire_mem(struct radeon_winsys_cs *cs, void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, enum chip_class chip_class, + uint32_t *flush_cnt, + uint64_t flush_va, bool is_mec, enum radv_cmd_flush_bits flush_bits) { unsigned cp_coher_cntl = 0; - + uint32_t flush_cb_db = flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | + RADV_CMD_FLAG_FLUSH_AND_INV_DB); + if (flush_bits & RADV_CMD_FLAG_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); if (flush_bits & RADV_CMD_FLAG_INV_SMEM_L1) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { - cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | - S_0085F0_CB0_DEST_BASE_ENA(1) | - S_0085F0_CB1_DEST_BASE_ENA(1) | - S_0085F0_CB2_DEST_BASE_ENA(1) | - S_0085F0_CB3_DEST_BASE_ENA(1) | - S_0085F0_CB4_DEST_BASE_ENA(1) | - S_0085F0_CB5_DEST_BASE_ENA(1) | - S_0085F0_CB6_DEST_BASE_ENA(1) | - S_0085F0_CB7_DEST_BASE_ENA(1); - - /* Necessary for DCC */ - if (chip_class >= VI) { - si_cs_emit_write_event_eop(cs, - chip_class, - is_mec, - V_028A90_FLUSH_AND_INV_CB_DATA_TS, - 0, 0, 0, 0, 0); + if (chip_class <= VI) { + if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | + S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | + S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | + S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | + S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + + /* Necessary for DCC */ + if (chip_class >= VI) { + si_cs_emit_write_event_eop(cs, + chip_class, + is_mec, + V_028A90_FLUSH_AND_INV_CB_DATA_TS, + 0, 0, 0, 0, 0); + } + } + if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | + S_0085F0_DB_DEST_BASE_ENA(1); } - } - - if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) { - cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | - S_0085F0_DB_DEST_BASE_ENA(1); } if (flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) { @@ -943,8 +952,7 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); } - if (!(flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | - RADV_CMD_FLAG_FLUSH_AND_INV_DB))) { + if (!flush_cb_db) { if (flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); @@ -959,6 +967,54 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); } + if (chip_class >= GFX9 && flush_cb_db) { + unsigned cb_db_event, tc_flags; + + /* Set the CB/DB flush event. */ + switch (flush_cb_db) { + case RADV_CMD_FLAG_FLUSH_AND_INV_CB: + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + break; + case RADV_CMD_FLAG_FLUSH_AND_INV_DB: + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + break; + default: + /* both CB & DB */ + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } + + /* TC | TC_WB = invalidate L2 data + * TC_MD | TC_WB = invalidate L2 metadata + * TC | TC_WB | TC_MD = invalidate L2 data & metadata + * + * The metadata cache must always be invalidated for coherency + * between CB/DB and shaders. (metadata = HTILE, CMASK, DCC) + * + * TC must be invalidated on GFX9 only if the CB/DB surface is + * not pipe-aligned. If the surface is RB-aligned, it might not + * strictly be pipe-aligned since RB alignment takes precendence. + */ + tc_flags = EVENT_TC_WB_ACTION_ENA | + EVENT_TC_MD_ACTION_ENA; + + /* Ideally flush TC together with CB/DB. */ + if (flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) { + tc_flags |= EVENT_TC_ACTION_ENA | + EVENT_TCL1_ACTION_ENA; + + /* Clear the flags. */ + flush_bits &= ~(RADV_CMD_FLAG_INV_GLOBAL_L2 | + RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2 | + RADV_CMD_FLAG_INV_VMEM_L1); + } + assert(flush_cnt); + uint32_t old_fence = (*flush_cnt)++; + + si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags, 1, + flush_va, old_fence, *flush_cnt); + si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff); + } + /* VGT state sync */ if (flush_bits & RADV_CMD_FLAG_VGT_FLUSH) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); @@ -968,7 +1024,11 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, /* Make sure ME is idle (it executes most packets) before continuing. * This prevents read-after-write hazards between PFP and ME. */ - if ((cp_coher_cntl || (flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) && + if ((cp_coher_cntl || + (flush_bits & (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | + RADV_CMD_FLAG_INV_VMEM_L1 | + RADV_CMD_FLAG_INV_GLOBAL_L2 | + RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) && !is_mec) { radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); @@ -976,34 +1036,46 @@ si_cs_emit_cache_flush(struct radeon_winsys_cs *cs, if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) || (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) { - cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); - if (chip_class >= VI) - cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1); - } else if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) { - cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1) | - S_0301F0_TC_NC_ACTION_ENA(1); - - /* L2 writeback doesn't combine with L1 invalidate */ - si_emit_acquire_mem(cs, is_mec, cp_coher_cntl); - + si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, + cp_coher_cntl | + S_0085F0_TC_ACTION_ENA(1) | + S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(chip_class >= VI)); cp_coher_cntl = 0; + } else { + if(flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2) { + /* WB = write-back + * NC = apply to non-coherent MTYPEs + * (i.e. MTYPE <= 1, which is what we use everywhere) + * + * WB doesn't work without NC. + */ + si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, + cp_coher_cntl | + S_0301F0_TC_WB_ACTION_ENA(1) | + S_0301F0_TC_NC_ACTION_ENA(1)); + cp_coher_cntl = 0; + } + if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) { + si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, + cp_coher_cntl | + S_0085F0_TCL1_ACTION_ENA(1)); + cp_coher_cntl = 0; + } } - if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) - cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - /* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle. * Therefore, it should be last. Done in PFP. */ if (cp_coher_cntl) - si_emit_acquire_mem(cs, is_mec, cp_coher_cntl); + si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl); } void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) { bool is_compute = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE; - + enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; if (is_compute) cmd_buffer->state.flush_bits &= ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | @@ -1015,8 +1087,15 @@ si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer) radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128); + uint32_t *ptr = NULL; + uint64_t va = 0; + if (chip_class == GFX9) { + va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->gfx9_fence_bo) + cmd_buffer->gfx9_fence_offset; + ptr = &cmd_buffer->gfx9_fence_idx; + } si_cs_emit_cache_flush(cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.chip_class, + ptr, va, radv_cmd_buffer_uses_mec(cmd_buffer), cmd_buffer->state.flush_bits); -- 2.7.4