freedreno: a2xx: clear fixes and fast clear path
author Jonathan Marek <jonathan@marek.ca>
Mon, 28 Jan 2019 17:49:54 +0000 (12:49 -0500)
committer Rob Clark <robdclark@gmail.com>
Mon, 28 Jan 2019 23:21:16 +0000 (18:21 -0500)
This fixes the depth/stencil clear on a20x, and adds a fast clear path.

The fast clear path is currently only used on a20x; its performance still needs to be tested on a22x.

Signed-off-by: Jonathan Marek <jonathan@marek.ca>
src/gallium/drivers/freedreno/a2xx/fd2_context.c
src/gallium/drivers/freedreno/a2xx/fd2_draw.c
src/gallium/drivers/freedreno/a2xx/fd2_draw.h
src/gallium/drivers/freedreno/a2xx/fd2_emit.c
src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
src/gallium/drivers/freedreno/freedreno_batch.c
src/gallium/drivers/freedreno/freedreno_batch.h
src/gallium/drivers/freedreno/freedreno_gmem.c

index 760ad17..28073b0 100644 (file)
@@ -54,6 +54,8 @@ create_solid_vertexbuf(struct pipe_context *pctx)
                        +0.000000, +0.000000,
                        +1.000000, +0.000000,
                        +0.000000, +1.000000,
+                       /* SCREEN_SCISSOR_BR value (must be at 60 byte offset in page) */
+                       0.0,
        };
        struct pipe_resource *prsc = pipe_buffer_create(pctx->screen,
                        PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(init_shader_const));
index c857c11..05c4cd5 100644 (file)
@@ -208,23 +208,13 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo,
        return true;
 }
 
-
-static bool
-fd2_clear(struct fd_context *ctx, unsigned buffers,
-               const union pipe_color_union *color, double depth, unsigned stencil)
+static void
+clear_state(struct fd_batch *batch, struct fd_ringbuffer *ring,
+       unsigned buffers, bool fast_clear)
 {
+       struct fd_context *ctx = batch->ctx;
        struct fd2_context *fd2_ctx = fd2_context(ctx);
-       struct fd_ringbuffer *ring = ctx->batch->draw;
-       struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer;
-       uint32_t reg, colr = 0;
-
-       if ((buffers & PIPE_CLEAR_COLOR) && fb->nr_cbufs)
-               colr = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f);
-
-       /* emit generic state now: */
-       fd2_emit_state(ctx, ctx->dirty &
-                       (FD_DIRTY_BLEND | FD_DIRTY_VIEWPORT |
-                                       FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR));
+       uint32_t reg;
 
        fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) {
                        { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 },
@@ -234,96 +224,28 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
        OUT_RING(ring, 0);
 
-       if (!is_a20x(ctx->screen)) {
-               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-               OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
-               OUT_RING(ring, 0x0000028f);
-       }
-
        fd2_program_emit(ctx, ring, &ctx->solid_prog);
 
        OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
        OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
 
-       if (is_a20x(ctx->screen)) {
-               OUT_PKT3(ring, CP_SET_CONSTANT, 5);
-               OUT_RING(ring, 0x00000480);
-               OUT_RING(ring, color->ui[0]);
-               OUT_RING(ring, color->ui[1]);
-               OUT_RING(ring, color->ui[2]);
-               OUT_RING(ring, color->ui[3]);
-       } else {
-               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-               OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
-               OUT_RING(ring, colr);
-       }
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
-       OUT_RING(ring, 0x00000084);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
-       reg = 0;
        if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-               reg |= A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE;
-               switch (fd_pipe2depth(fb->zsbuf->format)) {
-               case DEPTHX_24_8:
-                       if (buffers & PIPE_CLEAR_DEPTH)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xe);
-                       if (buffers & PIPE_CLEAR_STENCIL)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0x1);
-                       break;
-               case DEPTHX_16:
-                       if (buffers & PIPE_CLEAR_DEPTH)
-                               reg |= A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf);
-                       break;
-               default:
-                       debug_assert(0);
-                       break;
-               }
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
-       reg = 0;
-       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-               switch (fd_pipe2depth(fb->zsbuf->format)) {
-               case DEPTHX_24_8:
-                       reg = (((uint32_t)(0xffffff * depth)) << 8) |
-                               (stencil & 0xff);
-                       break;
-               case DEPTHX_16:
-                       reg = (uint32_t)(0xffffffff * depth);
-                       break;
-               default:
-                       debug_assert(0);
-                       break;
-               }
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL));
-       reg = 0;
-       if (buffers & PIPE_CLEAR_DEPTH) {
-               reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) |
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL));
+               reg = 0;
+               if (buffers & PIPE_CLEAR_DEPTH) {
+                       reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) |
                                A2XX_RB_DEPTHCONTROL_Z_ENABLE |
                                A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE |
                                A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE;
+               }
+               if (buffers & PIPE_CLEAR_STENCIL) {
+                       reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) |
+                                       A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE |
+                                       A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE);
+               }
+               OUT_RING(ring, reg);
        }
-       if (buffers & PIPE_CLEAR_STENCIL) {
-               reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) |
-                               A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE |
-                               A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE);
-       }
-       OUT_RING(ring, reg);
-
-       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
-       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
-       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL));
@@ -338,18 +260,19 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
        OUT_RING(ring, 0x00000000);        /* PA_CL_CLIP_CNTL */
        OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST |  /* PA_SU_SC_MODE_CNTL */
                        A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) |
-                       A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES));
+                       A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES) |
+                       (fast_clear ? A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE : 0));
+
+       if (fast_clear) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG));
+               OUT_RING(ring, A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(3));
+       }
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK));
        OUT_RING(ring, 0x0000ffff);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
-       OUT_RING(ring, xy2d(0,0));              /* PA_SC_WINDOW_SCISSOR_TL */
-       OUT_RING(ring, xy2d(fb->width,      /* PA_SC_WINDOW_SCISSOR_BR */
-                       fb->height));
-
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK));
        if (buffers & PIPE_CLEAR_COLOR) {
@@ -361,30 +284,326 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
                OUT_RING(ring, 0x0);
        }
 
-       if (!is_a20x(ctx->screen)) {
-               OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-               OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-               OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
-               OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
-       }
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL));
+       OUT_RING(ring, 0);
 
-       fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
-                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+       if (is_a20x(batch->ctx->screen))
+               return;
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+       OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
+       OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+       OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
-       OUT_RING(ring, 0x00000000);
+       OUT_RING(ring, 0x00000084);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+       OUT_RING(ring, 0x0000028f);
+}
+
+static void
+clear_state_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
+{ /* reset the registers that the clear code overrides (emits nothing on a20x) */
+       if (is_a20x(ctx->screen))
+               return; /* the clear code only sets the registers below when !is_a20x */
 
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
        OUT_RING(ring, 0x00000000);
 
-       if (!is_a20x(ctx->screen)) {
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
+       OUT_RING(ring, 0x00000000); /* clear_state() set this to 0x84 */
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+       OUT_RING(ring, 0x0000003b); /* clear_state() set this to 0x28f */
+}
+
+static void
+clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring,
+       uint32_t color_clear, uint32_t depth_clear, unsigned patch_type)
+{
+       /* Emit one fast clear draw with the given packed fill values. The dwords
+        * patched later per patch_type (see fd2_emit_tile_init) are cs[0] =
+        * SCREEN_SCISSOR_BR and cs[4]/cs[5] = color/depth info of packet 2. */
+       BEGIN_RING(ring, 8); /* preallocate next 2 packets, 3 + 5 dwords (for patching) */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
+       OUT_RINGP(ring, patch_type, &batch->gmem_patches);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 4);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
+       OUT_RING(ring, 0x8000 | 32);
+       OUT_RING(ring, 0); /* patched in: RB_COLOR_INFO (cs[4]) */
+       OUT_RING(ring, 0); /* patched in: RB_DEPTH_INFO (cs[5]) */
+
+       /* set fill values */
+       if (!is_a20x(batch->ctx->screen)) {
                 OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-               OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
-               OUT_RING(ring, 0x0000003b);
+               OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
+               OUT_RING(ring, color_clear);
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
+               OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE |
+                       A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
+               OUT_RING(ring, depth_clear);
+       } else {
+               const float sc = 1.0f / 255.0f; /* 8-bit channel -> float scale */
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+               OUT_RING(ring, 0x00000480); /* C0 used by fragment shader */
+               OUT_RING(ring, fui((float) (color_clear >>  0 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >>  8 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >> 16 & 0xff) * sc));
+               OUT_RING(ring, fui((float) (color_clear >> 24 & 0xff) * sc));
+
+               // XXX if using float the rounding error breaks it..
+               float depth = ((double) (depth_clear >> 8)) * (1.0/(double) 0xffffff);
+               assert((unsigned) (((double) depth * (double) 0xffffff)) ==
+                       (depth_clear >> 8));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 3); /* use viewport to set depth value */
+               OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(depth));
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+               OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+               OUT_RING(ring, 0xff000000 |
+                       A2XX_RB_STENCILREFMASK_BF_STENCILREF(depth_clear & 0xff) |
+                       A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+               OUT_RING(ring, 0xff000000 |
+                       A2XX_RB_STENCILREFMASK_STENCILREF(depth_clear & 0xff) |
+                       A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
         }
+
+       fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+}
+
+static bool
+fd2_clear_fast(struct fd_context *ctx, unsigned buffers,
+               const union pipe_color_union *color, double depth, unsigned stencil)
+{
+       /* Try the fast clear path. Returns true when the clear was emitted here,
+        * false when the caller must fall back to the regular clear draw.
+        *
+        * Using 4x MSAA allows clearing ~2x faster; on top of that a higher bpp
+        * clear can be used to clear lower bpp buffers: 1 "pixel" can clear
+        * 64 bits (rgba8+depth24+stencil8).
+        * note: it's possible to clear with 32_32_32_32 format but it's not faster
+        * note: fast clear doesn't work with sysmem rendering
+        * (sysmem rendering is disabled when clear is used)
+        *
+        * We only have 16-bit / 32-bit color formats and 16-bit / 32-bit depth
+        * formats, so there are only a few possible combinations. If the bpp of
+        * the color/depth doesn't match we clear with depth/color individually.
+        */
+       struct fd2_context *fd2_ctx = fd2_context(ctx);
+       struct fd_batch *batch = ctx->batch;
+       struct fd_ringbuffer *ring = batch->draw;
+       struct pipe_framebuffer_state *pfb = &batch->framebuffer;
+       uint32_t color_clear = 0, depth_clear = 0;
+       enum pipe_format format = pipe_surface_format(pfb->cbufs[0]);
+       int depth_size = -1; /* -1: no clear, 0: clear 16-bit, 1: clear 32-bit */
+       int color_size = -1;
+
+       /* TODO: need to test performance on a22x */
+       if (!is_a20x(ctx->screen))
+               return false;
+
+       if (buffers & PIPE_CLEAR_COLOR)
+               color_size = util_format_get_blocksizebits(format) == 32;
+
+       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))
+               depth_size = fd_pipe2depth(pfb->zsbuf->format) == DEPTHX_24_8;
+
+       assert(color_size >= 0 || depth_size >= 0);
+
+       /* when clearing 24_8, depth/stencil must be both cleared
+        * TODO: if buffer isn't attached we can clear it anyway
+        */
+       if (depth_size == 1 && !(buffers & PIPE_CLEAR_STENCIL) != !(buffers & PIPE_CLEAR_DEPTH))
+               return false;
+
+       if (color_size == 0) {
+               color_clear = pack_rgba(format, color->f);
+               color_clear = (color_clear << 16) | (color_clear & 0xffff); /* replicate the 16-bit value */
+       } else if (color_size == 1) {
+               color_clear = pack_rgba(format, color->f);
+       }
+
+       if (depth_size == 0) {
+               depth_clear = (uint32_t)(0xffff * depth);
+               depth_clear |= depth_clear << 16; /* replicate the 16-bit value */
+       } else if (depth_size == 1) {
+               depth_clear = (((uint32_t)(0xffffff * depth)) << 8);
+               depth_clear |= (stencil & 0xff);
+       }
+
+       /* disable "window" scissor.. */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
+       OUT_RING(ring, xy2d(0, 0));
+       OUT_RING(ring, xy2d(0x7fff, 0x7fff));
+
+       /* make sure we fill all "pixels" (in SCREEN_SCISSOR) */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+       OUT_RING(ring, fui(4096.0));
+
+       clear_state(batch, ring, ~0u, true); /* buffers = all bits set, fast_clear = true */
+
+       if (color_size >= 0 && depth_size != color_size)
+               clear_fast(batch, ring, color_clear, color_clear, GMEM_PATCH_FASTCLEAR_COLOR);
+
+       if (depth_size >= 0 && depth_size != color_size)
+               clear_fast(batch, ring, depth_clear, depth_clear, GMEM_PATCH_FASTCLEAR_DEPTH);
+
+       if (depth_size == color_size)
+               clear_fast(batch, ring, color_clear, depth_clear, GMEM_PATCH_FASTCLEAR_COLOR_DEPTH);
+
+       clear_state_restore(ctx, ring);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG));
+       OUT_RING(ring, 0);
+
+       /* can't patch in SCREEN_SCISSOR_BR as it can be different for each tile.
+        * MEM_WRITE the value in tile_renderprep, and use CP_LOAD_CONSTANT_CONTEXT
+        * the value is read from byte offset 60 in the given bo
+        */
+       OUT_PKT3(ring, CP_LOAD_CONSTANT_CONTEXT, 3);
+       OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 0, 0, 0);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR));
+       OUT_RING(ring, 1);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 4);
+       OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO));
+       OUT_RINGP(ring, GMEM_PATCH_RESTORE_INFO, &batch->gmem_patches);
+       OUT_RING(ring, 0);
+       OUT_RING(ring, 0);
+       return true;
+}
+
+static bool
+fd2_clear(struct fd_context *ctx, unsigned buffers,
+               const union pipe_color_union *color, double depth, unsigned stencil)
+{
+       struct fd_ringbuffer *ring = ctx->batch->draw;
+       struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer;
+
+       if (fd2_clear_fast(ctx, buffers, color, depth, stencil))
+               goto dirty;
+
+       /* set clear value */
+       if (is_a20x(ctx->screen)) {
+               if (buffers & PIPE_CLEAR_COLOR) {
+                       /* C0 used by fragment shader */
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+                       OUT_RING(ring, 0x00000480);
+                       OUT_RING(ring, color->ui[0]);
+                       OUT_RING(ring, color->ui[1]);
+                       OUT_RING(ring, color->ui[2]);
+                       OUT_RING(ring, color->ui[3]);
+               }
+
+               if (buffers & PIPE_CLEAR_DEPTH) {
+                       /* use viewport to set depth value */
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+                       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE));
+                       OUT_RING(ring, fui(0.0f));
+                       OUT_RING(ring, fui(depth));
+               }
+
+               if (buffers & PIPE_CLEAR_STENCIL) {
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF));
+                       OUT_RING(ring, 0xff000000 |
+                               A2XX_RB_STENCILREFMASK_BF_STENCILREF(stencil) |
+                               A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff));
+                       OUT_RING(ring, 0xff000000 |
+                               A2XX_RB_STENCILREFMASK_STENCILREF(stencil) |
+                               A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff));
+               }
+       } else {
+               if (buffers & PIPE_CLEAR_COLOR) {
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
+                       OUT_RING(ring, pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f));
+               }
+
+               if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+                       uint32_t clear_mask, depth_clear;
+                       if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
+                               switch (fd_pipe2depth(fb->zsbuf->format)) {
+                               case DEPTHX_24_8:
+                                       clear_mask = ((buffers & PIPE_CLEAR_DEPTH) ? 0xe : 0) |
+                                               ((buffers & PIPE_CLEAR_STENCIL) ? 0x1 : 0);
+                                       depth_clear = (((uint32_t)(0xffffff * depth)) << 8) |
+                                               (stencil & 0xff);
+                                       break;
+                               case DEPTHX_16:
+                                       clear_mask = 0xf;
+                                       depth_clear = (uint32_t)(0xffffffff * depth);
+                                       break;
+                               default:
+                                       debug_assert(0);
+                                       break;
+                               }
+                       }
+
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL));
+                       OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE |
+                               A2XX_RB_COPY_CONTROL_CLEAR_MASK(clear_mask));
+
+                       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+                       OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR));
+                       OUT_RING(ring, depth_clear);
+               }
        }
 
+       /* scissor state */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL));
+       OUT_RING(ring, xy2d(0, 0));
+       OUT_RING(ring, xy2d(fb->width, fb->height));
+
+       /* viewport state */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+       OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE));
+       OUT_RING(ring, fui((float) fb->width / 2.0));
+       OUT_RING(ring, fui((float) fb->width / 2.0));
+       OUT_RING(ring, fui((float) fb->height / 2.0));
+       OUT_RING(ring, fui((float) fb->height / 2.0));
+
+       /* common state */
+       clear_state(ctx->batch, ring, buffers, false);
+
+       fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
+                       DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
+
+       clear_state_restore(ctx, ring);
+
+dirty:
        ctx->dirty |= FD_DIRTY_ZSA |
                        FD_DIRTY_VIEWPORT |
                        FD_DIRTY_RASTERIZER |
@@ -392,7 +611,8 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
                        FD_DIRTY_PROG |
                        FD_DIRTY_CONST |
                        FD_DIRTY_BLEND |
-                       FD_DIRTY_FRAMEBUFFER;
+                       FD_DIRTY_FRAMEBUFFER |
+                       FD_DIRTY_SCISSOR;
 
        ctx->dirty_shader[PIPE_SHADER_VERTEX]   |= FD_DIRTY_SHADER_PROG;
        ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST;
index 1dd67e0..c796475 100644 (file)
 
 void fd2_draw_init(struct pipe_context *pctx);
 
+enum { /* kinds of batch->gmem_patches entries, resolved in fd2_gmem.c */
+       GMEM_PATCH_FASTCLEAR_COLOR,       /* fast clear draw for the color buffer only */
+       GMEM_PATCH_FASTCLEAR_DEPTH,       /* fast clear draw for depth/stencil only */
+       GMEM_PATCH_FASTCLEAR_COLOR_DEPTH, /* single fast clear draw for color + depth/stencil */
+       GMEM_PATCH_RESTORE_INFO,          /* restore surface/color/depth info after a fast clear */
+};
+
 #endif /* FD2_DRAW_H_ */
index 18d6944..805a4cf 100644 (file)
@@ -360,7 +360,7 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
        if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) {
                OUT_PKT3(ring, CP_SET_CONSTANT, 2);
                OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL));
-               OUT_RING(ring, blend ? zsa->rb_colorcontrol | blend->rb_colorcontrol : 0);
+               OUT_RING(ring, zsa->rb_colorcontrol | blend->rb_colorcontrol);
        }
 
        if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
@@ -370,13 +370,13 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
 
                OUT_PKT3(ring, CP_SET_CONSTANT, 2);
                OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL));
-               OUT_RING(ring, blend ? blend->rb_blendcontrol_alpha |
+               OUT_RING(ring, blend->rb_blendcontrol_alpha |
                        COND(has_alpha, blend->rb_blendcontrol_rgb) |
-                       COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb) : 0);
+                       COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb));
 
                OUT_PKT3(ring, CP_SET_CONSTANT, 2);
                OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK));
-               OUT_RING(ring, blend ? blend->rb_colormask : 0xf);
+               OUT_RING(ring, blend->rb_colormask);
        }
 
        if (dirty & FD_DIRTY_BLEND_COLOR) {
index 6a066a6..17d6d6e 100644 (file)
@@ -39,6 +39,7 @@
 #include "fd2_program.h"
 #include "fd2_util.h"
 #include "fd2_zsa.h"
+#include "fd2_draw.h"
 #include "instr-a2xx.h"
 
 static uint32_t fmt2swap(enum pipe_format format)
@@ -473,6 +474,58 @@ fd2_emit_tile_init(struct fd_batch *batch)
                reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format));
        OUT_RING(ring, reg);                         /* RB_DEPTH_INFO */
 
+       /* fast clear patches */
+       int depth_size = -1;
+       int color_size = -1;
+
+       if (pfb->cbufs[0])
+               color_size = util_format_get_blocksizebits(format) == 32 ? 4 : 2;
+
+       if (pfb->zsbuf)
+               depth_size = fd_pipe2depth(pfb->zsbuf->format) == 1 ? 4 : 2;
+
+       for (int i = 0; i < fd_patch_num_elements(&batch->gmem_patches); i++) {
+               struct fd_cs_patch *patch = fd_patch_element(&batch->gmem_patches, i);
+               uint32_t color_base = 0, depth_base = gmem->zsbuf_base[0];
+               uint32_t size, lines;
+
+               /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */
+               switch (patch->val) {
+               case GMEM_PATCH_FASTCLEAR_COLOR:
+                       size = align(gmem->bin_w * gmem->bin_h * color_size, 0x4000);
+                       lines = size / 1024;
+                       depth_base = size / 2;
+                       break;
+               case GMEM_PATCH_FASTCLEAR_DEPTH:
+                       size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x4000);
+                       lines = size / 1024;
+                       color_base = depth_base;
+                       depth_base = depth_base + size / 2;
+                       break;
+               case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH:
+                       lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x4000) / 1024;
+                       break;
+               case GMEM_PATCH_RESTORE_INFO:
+                       patch->cs[0] = gmem->bin_w;
+                       patch->cs[1] = A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) |
+                                       A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format));
+                       patch->cs[2] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]);
+                       if (pfb->zsbuf)
+                               patch->cs[2] |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format));
+                       continue;
+               default:
+                       continue;
+               }
+
+               patch->cs[0] = A2XX_PA_SC_SCREEN_SCISSOR_BR_X(32) |
+                       A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(lines);
+               patch->cs[4] = A2XX_RB_COLOR_INFO_BASE(color_base) |
+                       A2XX_RB_COLOR_INFO_FORMAT(COLORX_8_8_8_8);
+               patch->cs[5] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base) |
+                       A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(1);
+       }
+       util_dynarray_resize(&batch->gmem_patches, 0);
+
        /* set to zero, for some reason hardware doesn't like certain values */
        OUT_PKT3(ring, CP_SET_CONSTANT, 2);
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN));
@@ -607,6 +660,7 @@ static void
 fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
 {
        struct fd_context *ctx = batch->ctx;
+       struct fd2_context *fd2_ctx = fd2_context(ctx);
        struct fd_ringbuffer *ring = batch->gmem;
        struct pipe_framebuffer_state *pfb = &batch->framebuffer;
        enum pipe_format format = pipe_surface_format(pfb->cbufs[0]);
@@ -624,6 +678,12 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
        OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) |
                        A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff));
 
+       /* write SCISSOR_BR to memory so fast clear path can restore from it */
+       OUT_PKT3(ring, CP_MEM_WRITE, 2);
+       OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 60, 0, 0);
+       OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) |
+                       A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h));
+
        /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */
        if (is_a20x(batch->ctx->screen)) {
                OUT_PKT3(ring, CP_SET_CONSTANT, 5);
index a852494..a157850 100644 (file)
@@ -90,8 +90,10 @@ batch_init(struct fd_batch *batch)
 
        util_dynarray_init(&batch->draw_patches, NULL);
 
-       if (is_a2xx(ctx->screen))
+       if (is_a2xx(ctx->screen)) {
                util_dynarray_init(&batch->shader_patches, NULL);
+               util_dynarray_init(&batch->gmem_patches, NULL);
+       }
 
        if (is_a3xx(ctx->screen))
                util_dynarray_init(&batch->rbrc_patches, NULL);
@@ -167,8 +169,10 @@ batch_fini(struct fd_batch *batch)
 
        util_dynarray_fini(&batch->draw_patches);
 
-       if (is_a2xx(batch->ctx->screen))
+       if (is_a2xx(batch->ctx->screen)) {
                util_dynarray_fini(&batch->shader_patches);
+               util_dynarray_fini(&batch->gmem_patches);
+       }
 
        if (is_a3xx(batch->ctx->screen))
                util_dynarray_fini(&batch->rbrc_patches);
index 428a027..7b723db 100644 (file)
@@ -145,6 +145,11 @@ struct fd_batch {
         */
        struct util_dynarray rbrc_patches;
 
+       /* Keep track of GMEM related values that need to be patched up once we
+        * know the gmem layout:
+        */
+       struct util_dynarray gmem_patches;
+
        /* Keep track of pointer to start of MEM exports for a20x binning shaders
         *
         * this is so the end of the shader can be cut off at the right point
index d0420b2..dd35dfa 100644 (file)
@@ -77,24 +77,25 @@ static uint32_t bin_width(struct fd_screen *screen)
 
 static uint32_t
 total_size(uint8_t cbuf_cpp[], uint8_t zsbuf_cpp[2],
-                  uint32_t bin_w, uint32_t bin_h, struct fd_gmem_stateobj *gmem)
+                  uint32_t bin_w, uint32_t bin_h, uint32_t gmem_align,
+                  struct fd_gmem_stateobj *gmem)
 {
        uint32_t total = 0, i;
 
        for (i = 0; i < MAX_RENDER_TARGETS; i++) {
                if (cbuf_cpp[i]) {
-                       gmem->cbuf_base[i] = align(total, 0x4000);
+                       gmem->cbuf_base[i] = align(total, gmem_align);
                        total = gmem->cbuf_base[i] + cbuf_cpp[i] * bin_w * bin_h;
                }
        }
 
        if (zsbuf_cpp[0]) {
-               gmem->zsbuf_base[0] = align(total, 0x4000);
+               gmem->zsbuf_base[0] = align(total, gmem_align);
                total = gmem->zsbuf_base[0] + zsbuf_cpp[0] * bin_w * bin_h;
        }
 
        if (zsbuf_cpp[1]) {
-               gmem->zsbuf_base[1] = align(total, 0x4000);
+               gmem->zsbuf_base[1] = align(total, gmem_align);
                total = gmem->zsbuf_base[1] + zsbuf_cpp[1] * bin_w * bin_h;
        }
 
@@ -116,6 +117,7 @@ calculate_tiles(struct fd_batch *batch)
        uint32_t minx, miny, width, height;
        uint32_t nbins_x = 1, nbins_y = 1;
        uint32_t bin_w, bin_h;
+       uint32_t gmem_align = 0x4000;
        uint32_t max_width = bin_width(screen);
        uint8_t cbuf_cpp[MAX_RENDER_TARGETS] = {0}, zsbuf_cpp[2] = {0};
        uint32_t i, j, t, xoff, yoff;
@@ -178,10 +180,18 @@ calculate_tiles(struct fd_batch *batch)
                                zsbuf_cpp[0], width, height);
        }
 
+       if (is_a20x(screen) && batch->cleared) {
+               /* under normal circumstances the requirement would be 4K
+                * but the fast clear path requires an alignment of 32K
+                */
+               gmem_align = 0x8000;
+       }
+
        /* then find a bin width/height that satisfies the memory
         * constraints:
         */
-       while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem) > gmem_size) {
+       while (total_size(cbuf_cpp, zsbuf_cpp, bin_w, bin_h, gmem_align, gmem) >
+                  gmem_size) {
                if (bin_w > bin_h) {
                        nbins_x++;
                        bin_w = align(width / nbins_x, gmem_alignw);