From 5ea5ed60500a8612166853975b42abd40a459216 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 26 May 2016 22:00:03 +0200 Subject: [PATCH] r600g: fix CP DMA hazard with index buffer fetches (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit v3: use PFP_SYNC_ME on EG-CM only when supported by the kernel, otherwise use MEM_WRITE + WAIT_REG_MEM to emulate that Reviewed-by: Alex Deucher Tested-by: Grazvydas Ignotas Tested-by: Dieter Nützel --- src/gallium/drivers/r600/evergreen_hw_context.c | 16 ++++-- src/gallium/drivers/r600/evergreend.h | 1 + src/gallium/drivers/r600/r600_blit.c | 2 +- src/gallium/drivers/r600/r600_hw_context.c | 69 ++++++++++++++++++++++++- src/gallium/drivers/r600/r600_pipe.h | 5 +- src/gallium/drivers/r600/r600d.h | 5 ++ src/gallium/drivers/radeonsi/sid.h | 2 +- 7 files changed, 93 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index f456696..2feb801 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, - unsigned size, uint32_t clear_value) + unsigned size, uint32_t clear_value, + enum r600_coherency coher) { struct radeon_winsys_cs *cs = rctx->b.gfx.cs; @@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned reloc; - r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE); + r600_need_cs_space(rctx, + 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { @@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, offset += byte_count; } + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + if (coher == R600_COHERENCY_SHADER) + r600_emit_pfp_sync_me(rctx); + /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | R600_CONTEXT_INV_TEX_CACHE; } - diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index c1c6169..a81b6c5 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -88,6 +88,7 @@ #define WAIT_REG_MEM_EQUAL 3 #define PKT3_MEM_WRITE 0x3D #define PKT3_INDIRECT_BUFFER 0x32 +#define PKT3_PFP_SYNC_ME 0x42 #define PKT3_SURFACE_SYNC 0x43 #define PKT3_ME_INITIALIZE 0x44 #define PKT3_COND_WRITE 0x45 diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index 282645f..76c3364 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds if (rctx->screen->b.has_cp_dma && rctx->b.chip_class >= EVERGREEN && offset % 4 == 0 && size % 4 == 0) { - evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value); + evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher); } else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) { union pipe_color_union clear_value; clear_value.ui[0] = value; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 808bd27..3ba723d 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -364,6 +364,66 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->current.cdw; } +void r600_emit_pfp_sync_me(struct r600_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + + if (rctx->b.chip_class >= EVERGREEN && + rctx->b.screen->info.drm_minor >= 46) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } else { + /* Emulate PFP_SYNC_ME by writing a value to memory in ME and + * waiting for it in PFP. + */ + struct r600_resource *buf = NULL; + unsigned offset, reloc; + uint64_t va; + + /* 16-byte address alignment is required by WAIT_REG_MEM. */ + u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16, + &offset, (struct pipe_resource**)&buf); + if (!buf) { + /* This is too heavyweight, but will work. */ + rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + return; + } + + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf, + RADEON_USAGE_READWRITE, + RADEON_PRIO_FENCE); + + va = buf->gpu_address + offset; + assert(va % 16 == 0); + + /* Write 1 to memory in ME. */ + radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); + radeon_emit(cs, va); + radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); + radeon_emit(cs, 1); + radeon_emit(cs, 0); + + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + + /* Wait in PFP (PFP can only do GEQUAL against memory). */ + radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0)); + radeon_emit(cs, WAIT_REG_MEM_GEQUAL | + WAIT_REG_MEM_MEMORY | + WAIT_REG_MEM_PFP); + radeon_emit(cs, va); + radeon_emit(cs, va >> 32); + radeon_emit(cs, 1); /* reference value */ + radeon_emit(cs, 0xffffffff); /* mask */ + radeon_emit(cs, 4); /* poll interval */ + + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, reloc); + + r600_resource_reference(&buf, NULL); + } +} + /* The max number of bytes to copy per packet. */ #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) @@ -407,7 +467,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) + - 3, FALSE); + 3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE); /* Flush the caches for the first copy only. */ if (rctx->b.flags) { @@ -447,6 +507,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_CP_DMA_IDLE(1)); + /* CP DMA is executed in ME, but index buffers are read by PFP. + * This ensures that ME (CP DMA) is idle before PFP starts fetching + * indices. If we wanted to execute CP DMA in PFP, this packet + * should precede it. + */ + r600_emit_pfp_sync_me(rctx); + /* Invalidate the read caches. */ rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE | R600_CONTEXT_INV_VERTEX_CACHE | diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 76178c2..313bf69 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -57,6 +57,7 @@ /* the number of CS dwords for flushing and drawing */ #define R600_MAX_FLUSH_CS_DWORDS 18 #define R600_MAX_DRAW_CS_DWORDS 58 +#define R600_MAX_PFP_SYNC_ME_DWORDS 16 #define R600_MAX_USER_CONST_BUFFERS 13 #define R600_MAX_DRIVER_CONST_BUFFERS 3 @@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags, void r600_begin_new_cs(struct r600_context *ctx); void r600_flush_emit(struct r600_context *ctx); void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in); +void r600_emit_pfp_sync_me(struct r600_context *rctx); void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t dst_offset, struct pipe_resource *src, uint64_t src_offset, unsigned size); void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, - unsigned size, uint32_t clear_value); + unsigned size, uint32_t clear_value, + enum r600_coherency coher); void r600_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *dst, struct pipe_resource *src, diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 24f599e..75d64c1 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -96,8 +96,13 @@ #define COPY_DW_DST_IS_MEM (1 << 1) #define PKT3_WAIT_REG_MEM 0x3C #define WAIT_REG_MEM_EQUAL 3 +#define WAIT_REG_MEM_GEQUAL 5 +#define WAIT_REG_MEM_MEMORY (1 << 4) +#define WAIT_REG_MEM_PFP (1 << 8) #define PKT3_MEM_WRITE 0x3D +#define MEM_WRITE_32_BITS (1 << 18) #define PKT3_INDIRECT_BUFFER 0x32 +#define PKT3_PFP_SYNC_ME 0x42 /* EG+ */ #define PKT3_SURFACE_SYNC 0x43 #define PKT3_ME_INITIALIZE 0x44 #define PKT3_COND_WRITE 0x45 diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 1b466aa..a6d5c05 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -143,7 +143,7 @@ #define COPY_DATA_DST_SEL(x) (((unsigned)(x) & 0xf) << 8) #define COPY_DATA_COUNT_SEL (1 << 16) #define COPY_DATA_WR_CONFIRM (1 << 20) -#define PKT3_PFP_SYNC_ME 0x42 /* r7xx+ */ +#define PKT3_PFP_SYNC_ME 0x42 #define PKT3_SURFACE_SYNC 0x43 /* deprecated on CIK, use ACQUIRE_MEM */ #define PKT3_ME_INITIALIZE 0x44 /* not on CIK */ #define PKT3_COND_WRITE 0x45 -- 2.7.4