From 893425a607a63a83e8a4c13fd963367c8d174678 Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Tue, 26 Mar 2019 10:31:54 -0700 Subject: [PATCH] freedreno/ir3: Push UBOs to constant file We have a rather big constant file and it seems that the best way to use it is to upload all UBOs and lower UBO accesses to load_uniform. Signed-off-by: Kristian H. Kristensen Reviewed-by: Rob Clark --- src/freedreno/ir3/ir3_context.c | 2 +- src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c | 111 +++++++++++++++++++++--- src/freedreno/ir3/ir3_shader.h | 17 ++++ src/gallium/drivers/freedreno/a6xx/fd6_emit.c | 15 +++- src/gallium/drivers/freedreno/ir3/ir3_gallium.c | 16 ++++ 5 files changed, 145 insertions(+), 16 deletions(-) diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 7c35b9b..d626716 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -124,7 +124,7 @@ ir3_context_init(struct ir3_compiler *compiler, * Immediates go last mostly because they are inserted in the CP pass * after the nir -> ir3 frontend. 
*/ - unsigned constoff = align(ctx->s->num_uniforms, 4); + unsigned constoff = align(ctx->so->shader->ubo_state.size / 16, 4); unsigned ptrsz = ir3_pointer_size(ctx); memset(&so->constbase, ~0, sizeof(so->constbase)); diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c index 35b9219..aaa2a86 100644 --- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c +++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c @@ -27,9 +27,38 @@ #include "util/u_dynarray.h" #include "mesa/main/macros.h" -struct ir3_ubo_analysis_state { - unsigned lower_count; -}; +static inline struct ir3_ubo_range +get_ubo_load_range(nir_intrinsic_instr *instr) +{ + struct ir3_ubo_range r; + + const int bytes = nir_intrinsic_dest_components(instr) * + (nir_dest_bit_size(instr->dest) / 8); + + r.start = ROUND_DOWN_TO(nir_src_as_uint(instr->src[1]), 16 * 4); + r.end = ALIGN(r.start + bytes, 16 * 4); + + return r; +} + +static void +gather_ubo_ranges(nir_intrinsic_instr *instr, + struct ir3_ubo_analysis_state *state) +{ + if (!nir_src_is_const(instr->src[0])) + return; + + if (!nir_src_is_const(instr->src[1])) + return; + + const struct ir3_ubo_range r = get_ubo_load_range(instr); + const uint32_t block = nir_src_as_uint(instr->src[0]); + + if (r.start < state->range[block].start) + state->range[block].start = r.start; + if (state->range[block].end < r.end) + state->range[block].end = r.end; +} static void lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, @@ -43,15 +72,37 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, return; const uint32_t block = nir_src_as_uint(instr->src[0]); - if (block > 0) - return; + + if (block > 0) { + /* We don't lower dynamic array indexing either, but we definitely should. + * We don't have a good way of determining the range of the dynamic + * access, so for now just fall back to pulling. 
+ */ + if (!nir_src_is_const(instr->src[1])) + return; + + /* After gathering the UBO access ranges, we limit the total + * upload. Reject if we're now outside the range. + */ + const struct ir3_ubo_range r = get_ubo_load_range(instr); + if (!(state->range[block].start <= r.start && + r.end <= state->range[block].end)) + return; + } b->cursor = nir_before_instr(&instr->instr); nir_ssa_def *ubo_offset = nir_ssa_for_src(b, instr->src[1], 1); - nir_ssa_def *uniform_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2); - if (uniform_offset == NULL) - uniform_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2)); + nir_ssa_def *new_offset = ir3_nir_try_propagate_bit_shift(b, ubo_offset, -2); + if (new_offset) + ubo_offset = new_offset; + else + ubo_offset = nir_ushr(b, ubo_offset, nir_imm_int(b, 2)); + + const int range_offset = + (state->range[block].offset - state->range[block].start) / 4; + nir_ssa_def *uniform_offset = + nir_iadd(b, ubo_offset, nir_imm_int(b, range_offset)); nir_intrinsic_instr *uniform = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); @@ -72,7 +123,45 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b, bool ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) { - struct ir3_ubo_analysis_state state = { 0 }; + struct ir3_ubo_analysis_state *state = &shader->ubo_state; + + memset(state, 0, sizeof(*state)); + state->range[0].end = nir->num_uniforms * 16; + + nir_foreach_function(function, nir) { + if (function->impl) { + nir_foreach_block(block, function->impl) { + nir_foreach_instr(instr, block) { + if (instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo) + gather_ubo_ranges(nir_instr_as_intrinsic(instr), state); + } + } + } + } + + /* For now, everything we upload is accessed statically and thus will be + * used by the shader. 
Once we can upload dynamically indexed data, we may + * upload sparsely accessed arrays, at which point we probably want to + * give priority to smaller UBOs, on the assumption that big UBOs will be + * accessed dynamically. Alternatively, we can track statically and + * dynamically accessed ranges separately and upload static ranges + * first. + */ + const uint32_t max_upload = 16 * 1024; + uint32_t offset = 0; + for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + uint32_t range_size = state->range[i].end - state->range[i].start; + + debug_assert(offset <= max_upload); + state->range[i].offset = offset; + if (offset + range_size > max_upload) { + range_size = max_upload - offset; + state->range[i].end = state->range[i].start + range_size; + } + offset += range_size; + } + state->size = offset; nir_foreach_function(function, nir) { if (function->impl) { @@ -82,7 +171,7 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) nir_foreach_instr_safe(instr, block) { if (instr->type == nir_instr_type_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo) - lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, &state); + lower_ubo_load_to_uniform(nir_instr_as_intrinsic(instr), &builder, state); } } @@ -91,5 +180,5 @@ ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader *shader) } } - return state.lower_count > 0; + return state->lower_count > 0; } diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 647651c..58d1419 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -67,6 +67,8 @@ enum ir3_driver_param { #define IR3_MAX_SHADER_IMAGES 32 #define IR3_MAX_SO_BUFFERS 4 #define IR3_MAX_SO_OUTPUTS 64 +#define IR3_MAX_CONSTANT_BUFFERS 32 + /** * For consts needed to pass internal values to shader which may or may not @@ -474,6 +476,19 @@ struct ir3_shader_variant { struct ir3_shader *shader; }; +struct ir3_ubo_range { + uint32_t offset; /* 
start offset of this block in const register file */ + uint32_t start, end; /* range of block that's actually used */ +}; + +struct ir3_ubo_analysis_state +{ + struct ir3_ubo_range range[IR3_MAX_CONSTANT_BUFFERS]; + uint32_t size; + uint32_t lower_count; +}; + + struct ir3_shader { gl_shader_stage type; @@ -486,6 +501,8 @@ struct ir3_shader { struct ir3_compiler *compiler; + struct ir3_ubo_analysis_state ubo_state; + struct nir_shader *nir; struct ir3_stream_output_info stream_output; diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index b48a0d6..75c8c91 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -72,11 +72,10 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type, uint32_t regid, uint32_t offset, uint32_t sizedwords, const uint32_t *dwords, struct pipe_resource *prsc) { - uint32_t i, sz; + uint32_t i, sz, align_sz; enum a6xx_state_src src; debug_assert((regid % 4) == 0); - debug_assert((sizedwords % 4) == 0); if (prsc) { sz = 0; @@ -86,12 +85,14 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type, src = SS6_DIRECT; } - OUT_PKT7(ring, shader_t_to_opcode(type), 3 + sz); + align_sz = align(sz, 4); + + OUT_PKT7(ring, shader_t_to_opcode(type), 3 + align_sz); OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) | CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | CP_LOAD_STATE6_0_STATE_SRC(src) | CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) | - CP_LOAD_STATE6_0_NUM_UNIT(sizedwords/4)); + CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4))); if (prsc) { struct fd_bo *bo = fd_resource(prsc)->bo; OUT_RELOC(ring, bo, offset, 0, 0); @@ -100,9 +101,15 @@ fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type, OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); dwords = (uint32_t *)&((uint8_t *)dwords)[offset]; } + for (i = 0; i < sz; i++) { OUT_RING(ring, dwords[i]); } + + /* Zero-pad to multiple of 4 dwords 
*/ + for (i = sz; i < align_sz; i++) { + OUT_RING(ring, 0); + } } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 4481c54..2d9516a 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -254,6 +254,22 @@ emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v, cb->user_buffer, cb->buffer); } } + + struct ir3_ubo_analysis_state *state; + state = &v->shader->ubo_state; + + for (uint32_t i = 1; i < ARRAY_SIZE(state->range); i++) { + struct pipe_constant_buffer *cb = &constbuf->cb[i]; + + if (state->range[i].start < state->range[i].end && + constbuf->enabled_mask & (1 << i)) { + + ctx->emit_const(ring, v->type, state->range[i].offset / 4, + cb->buffer_offset + state->range[i].start, + (state->range[i].end - state->range[i].start) / 4, + cb->user_buffer, cb->buffer); + } + } } static void -- 2.7.4