From f68c6951b86ac38ebdb89bc6b5a6285433e684a6 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sat, 5 Mar 2016 16:53:11 -0500 Subject: [PATCH] freedreno/a4xx: hw binning Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a4xx/fd4_draw.c | 39 +++++++ src/gallium/drivers/freedreno/a4xx/fd4_emit.c | 6 +- src/gallium/drivers/freedreno/a4xx/fd4_gmem.c | 127 +++++++++++++++++++++-- src/gallium/drivers/freedreno/a4xx/fd4_program.c | 71 ++++++++----- 4 files changed, 210 insertions(+), 33 deletions(-) diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index bc9cfae..c34f944 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -176,6 +176,43 @@ reset_viewport(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb) OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-half_height)); } +/* TODO maybe we should just migrate u_blitter for clear and do it in + * core (so we get normal draw pass state mgmt and binning).. That should + * work well enough for a3xx/a4xx (but maybe not a2xx?) 
+ */ + /* fd4_clear_binning(): records the clear as a draw in ctx->binning_ring. It emits state and vertex bufs for ctx->solid_prog / solid_vbuf_state with the binning_pass and half_precision key bits set, calls reset_viewport() for ctx->framebuffer, and then issues a DI_PT_RECTLIST auto-index draw with IGNORE_VISIBILITY. 'dirty' is passed through unchanged as the emit's dirty mask. Called from fd4_clear(). */ +static void +fd4_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + struct fd4_emit emit = { + .vtx = &fd4_ctx->solid_vbuf_state, + .prog = &ctx->solid_prog, + .key = { + .binning_pass = true, + .half_precision = true, + }, + .dirty = dirty, + }; + + fd4_emit_state(ctx, ring, &emit); + fd4_emit_vertex_bufs(ring, &emit); + reset_viewport(ring, &ctx->framebuffer); + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_VAROUT(0) | + A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES)); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, 0x00000002); /* NOTE(review): raw register value; the meaning of this bit is not visible in this patch */ + + fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); +} + static void fd4_clear(struct fd_context *ctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) @@ -198,6 +235,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers, dirty |= FD_DIRTY_PROG; emit.dirty = dirty; + fd4_clear_binning(ctx, dirty); /* also record this clear in the binning ring, using the same dirty mask */ + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index 58ddf2a..81ed16c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -606,7 +606,11 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs); + unsigned n = pfb->nr_cbufs; + /* if we have depth/stencil, we need at least one MRT (so n is raised to 1 when a zsbuf is bound): */ + if (pfb->zsbuf) + n = MAX2(1, n); + 
fd4_program_emit(ring, emit, n, pfb->cbufs); } if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index bb74088..ed2307d 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -130,6 +130,19 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } } +static bool +use_hw_binning(struct fd_context *ctx) +{ /* use_hw_binning(): returns true when the hw binning pass should be used for the current gmem layout, i.e. fd_binning_enabled is set and nbins_x * nbins_y is greater than 2. Always returns false when the framebuffer exceeds 4096 in both width and height. */ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + + /* this seems to be a hw bug.. but this hack fixes piglit fbo-maxsize (note: only triggers when BOTH dimensions are > 4096): */ + if ((pfb->width > 4096) && (pfb->height > 4096)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + /* transfer from gmem to system memory (ie. normal RAM) */ static void @@ -575,6 +588,70 @@ update_vsc_pipe(struct fd_context *ctx) } } +static void +emit_binning_pass(struct fd_context *ctx) +{ /* emit_binning_pass(): writes the binning pass into ctx->ring. It enables binning and the RB_TILING_PASS render mode, programs frame-buffer dimension, bin offset and screen scissor from the gmem bounds, sets every MRT control, runs the recorded binning commands through ctx->emit_ib(), and finally disables binning again and switches back to RB_RENDERING_PASS. */ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + int i; + + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; /* x2/y2: min + size - 1, used as the scissor bottom-right */ + uint32_t y2 = gmem->miny + gmem->height - 1; + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, A4XX_PC_BINNING_COMMAND_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + /* setup scissor/offset for whole screen (x1,y1)-(x2,y2) taken from the gmem state rather than a single tile: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, 
A4XX_RB_BIN_OFFSET_X(x1) | + A4XX_RB_BIN_OFFSET_Y(y1)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { /* same MRT_CONTROL value is written for every render-target slot */ + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + } + + /* emit IB to binning drawcmds (the range binning_start..binning_end recorded in ctx): */ + ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end); + + fd_reset_wfi(ctx); + fd_wfi(ctx, ring); + + /* and then put stuff back the way it was (binning command cleared, render mode back to RB_RENDERING_PASS): */ + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + fd_event_write(ctx, ring, CACHE_FLUSH); + fd_wfi(ctx, ring); +} + /* before first tile */ static void fd4_emit_tile_init(struct fd_context *ctx) @@ -588,16 +665,30 @@ fd4_emit_tile_init(struct fd_context *ctx) OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_BINNING_PASS | + A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + 0x8); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | 
A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | - 0x00010000); /* XXX */ - - update_vsc_pipe(ctx); - patch_draws(ctx, IGNORE_VISIBILITY); - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, 0x8); + A4XX_RB_MODE_CONTROL_ENABLE_GMEM); } /* before mem2gmem */ @@ -659,6 +750,7 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd4_context *fd4_ctx = fd4_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -668,6 +760,27 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + fd_event_write(ctx, ring, HLSQ_FLUSH); + fd_wfi(ctx, ring); + + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A4XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A4XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 082313f..d782b94 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -217,6 +217,9 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, debug_assert(nr <= ARRAY_SIZE(color_regid)); + if 
(emit->key.binning_pass) + nr = 0; + setup_stages(emit, s); fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS; @@ -373,31 +376,49 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff)); OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | - A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | - A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | - A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | - 0x80000000 | /* XXX */ - COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | - COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); - if (emit->key.binning_pass) + if (emit->key.binning_pass) { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE); + OUT_RING(ring, 
A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RING(ring, 0x00000000); - else + } else { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000 | /* XXX */ + COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | + COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1); OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) | @@ -421,11 +442,11 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, 
A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); -- 2.7.4