freedreno/a6xx: move const emit to state group
authorRob Clark <robdclark@gmail.com>
Sun, 7 Oct 2018 17:59:27 +0000 (13:59 -0400)
committerRob Clark <robdclark@gmail.com>
Wed, 17 Oct 2018 16:44:48 +0000 (12:44 -0400)
Eventually we want to move nearly everything, but no other state depends
on const state, so this is the easiest one to move first.

For webgl aquarium, this reduces GPU load by about 10%, since for each
fish it does a uniform upload plus draw.  Fish are frequently visible in
only a single tile, so this skips the uniform uploads for other tiles.

The additional step of avoiding WFI's when using CP_SET_DRAW_STATE seems
to be worth an additional 10% gain for aquarium.

Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/a6xx/fd6_emit.c
src/gallium/drivers/freedreno/a6xx/fd6_emit.h
src/gallium/drivers/freedreno/a6xx/fd6_gmem.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c

index fc4a53f..93f6a26 100644 (file)
@@ -359,7 +359,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
        if (tex->num_samplers > 0) {
                struct fd_ringbuffer *state =
-                       fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4);
+                       fd_ringbuffer_new_flags(ctx->pipe, tex->num_samplers * 4 * 4,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
                for (unsigned i = 0; i < tex->num_samplers; i++) {
                        static const struct fd6_sampler_stateobj dummy_sampler = {};
                        const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -389,7 +390,8 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
        if (tex->num_textures > 0) {
                struct fd_ringbuffer *state =
-                       fd_ringbuffer_new_object(ctx->pipe, tex->num_textures * 16);
+                       fd_ringbuffer_new_flags(ctx->pipe, tex->num_textures * 16 * 4,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
                for (unsigned i = 0; i < tex->num_textures; i++) {
                        static const struct fd6_pipe_sampler_view dummy_view = {};
                        const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@@ -791,9 +793,29 @@ fd6_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
                OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
        }
 
-       ir3_emit_vs_consts(vp, ring, ctx, emit->info);
-       if (!emit->key.binning_pass)
-               ir3_emit_fs_consts(fp, ring, ctx);
+#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
+                                        FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
+
+       if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
+               struct fd_ringbuffer *vsconstobj =
+                       fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+               ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
+               fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
+               fd_ringbuffer_del(vsconstobj);
+       }
+
+       if ((ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) &&
+                       !emit->key.binning_pass) {
+               struct fd_ringbuffer *fsconstobj =
+                       fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
+                                       FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+
+               ir3_emit_fs_consts(fp, fsconstobj, ctx);
+               fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x7);
+               fd_ringbuffer_del(fsconstobj);
+       }
 
        struct pipe_stream_output_info *info = &vp->shader->stream_output;
        if (info->num_outputs) {
index a2117a1..4e27597 100644 (file)
@@ -43,7 +43,8 @@ struct fd_ringbuffer;
  * need to be emit'd.
  */
 enum fd6_state_id {
-       FD6_GROUP_CONST,
+       FD6_GROUP_VS_CONST,
+       FD6_GROUP_FS_CONST,
 };
 
 struct fd6_state_group {
@@ -116,7 +117,7 @@ fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj,
        if (fd_ringbuffer_size(stateobj) == 0)
                return;
        struct fd6_state_group *g = &emit->groups[emit->num_groups++];
-       g->stateobj = stateobj;
+       g->stateobj = fd_ringbuffer_ref(stateobj);
        g->group_id = group_id;
        g->enable_mask = enable_mask;
 }
index 0c96250..1167399 100644 (file)
@@ -751,6 +751,13 @@ fd6_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
                OUT_RING(ring, A2XX_CP_SET_MARKER_0_MODE(0x5) | 0x10);
        }
 
+       OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+       OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
+                       CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
+                       CP_SET_DRAW_STATE__0_GROUP_ID(0));
+       OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
+       OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
+
        OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
        OUT_RING(ring, 0x0);
 
index 5532a7f..ee063f8 100644 (file)
@@ -552,6 +552,18 @@ ir3_shader_outputs(const struct ir3_shader *so)
 
 #include "freedreno_resource.h"
 
+static inline void
+ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
+{
+       /* when we emit const state via ring (IB2) we need a WFI, but when
+        * it is emit'd via stateobj, we don't
+        */
+       if (ring->flags & FD_RINGBUFFER_OBJECT)
+               return;
+
+       fd_wfi(batch, ring);
+}
+
 static void
 emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
                struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
@@ -579,7 +591,7 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
                size = MIN2(size, 4 * max_const);
 
                if (size > 0) {
-                       fd_wfi(ctx->batch, ring);
+                       ring_wfi(ctx->batch, ring);
                        ctx->emit_const(ring, v->type, 0,
                                        cb->buffer_offset, size,
                                        cb->user_buffer, cb->buffer);
@@ -611,7 +623,7 @@ emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
        }
 }
@@ -631,7 +643,7 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        sizes[off] = sb->sb[index].buffer_size;
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, offset * 4,
                        0, ARRAY_SIZE(sizes), sizes, NULL);
        }
@@ -673,7 +685,7 @@ emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, offset * 4,
                        0, ARRAY_SIZE(dims), dims, NULL);
        }
@@ -696,7 +708,7 @@ emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
        size *= 4;
 
        if (size > 0) {
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const(ring, v->type, base,
                        0, size, v->immediates[0].val, NULL);
        }
@@ -729,7 +741,7 @@ emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
                        }
                }
 
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
                ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
        }
 }
@@ -787,6 +799,19 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 {
        enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];
 
+       /* When we use CP_SET_DRAW_STATE objects to emit constant state,
+        * if we emit any of it we need to emit all.  This is because
+        * we are using the same state-group-id each time for uniform
+        * state, and if previous update is never evaluated (due to no
+        * visible primitives in the current tile) then the new stateobj
+        * completely replaces the old one.
+        *
+        * Possibly if we split up different parts of the const state to
+        * different state-objects we could avoid this.
+        */
+       if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
+               dirty = ~0;
+
        if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
                struct fd_constbuf_stateobj *constbuf;
                bool shader_dirty;
@@ -846,7 +871,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
                                vertex_params_size = ARRAY_SIZE(vertex_params);
                        }
 
-                       fd_wfi(ctx->batch, ring);
+                       ring_wfi(ctx->batch, ring);
 
                        bool needs_vtxid_base =
                                ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);
@@ -918,7 +943,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
        /* emit compute-shader driver-params: */
        uint32_t offset = v->constbase.driver_param;
        if (v->constlen > offset) {
-               fd_wfi(ctx->batch, ring);
+               ring_wfi(ctx->batch, ring);
 
                if (info->indirect) {
                        struct pipe_resource *indirect = NULL;