From 6d28c6e52cfd76855c1368560dd90f12493e2580 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sat, 2 Jan 2016 03:21:28 -0800 Subject: [PATCH] i965: Select ranges of UBO data to be uploaded as push constants. This adds a NIR pass that decides which portions of UBOS we should upload as push constants, rather than pull constants. v2: Switch to uint16_t for the UBO block number, because we may have a lot of them in Vulkan (suggested by Jason). Add more comments about bitfield trickery (requested by Matt). v3: Skip vec4 stages for now...I haven't finished wiring up support in the vec4 backend, and so pushing the data but not using it will just be wasteful. Reviewed-by: Matt Turner --- src/intel/Makefile.sources | 1 + src/intel/compiler/brw_compiler.h | 9 + src/intel/compiler/brw_nir.h | 4 + src/intel/compiler/brw_nir_analyze_ubo_ranges.c | 298 ++++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_gs.c | 2 + src/mesa/drivers/dri/i965/brw_tcs.c | 2 + src/mesa/drivers/dri/i965/brw_tes.c | 2 + src/mesa/drivers/dri/i965/brw_vs.c | 2 + src/mesa/drivers/dri/i965/brw_wm.c | 2 + 9 files changed, 322 insertions(+) create mode 100644 src/intel/compiler/brw_nir_analyze_ubo_ranges.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 30643e0..17027cf 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -74,6 +74,7 @@ COMPILER_FILES = \ compiler/brw_nir.h \ compiler/brw_nir.c \ compiler/brw_nir_analyze_boolean_resolves.c \ + compiler/brw_nir_analyze_ubo_ranges.c \ compiler/brw_nir_attribute_workarounds.c \ compiler/brw_nir_intrinsics.c \ compiler/brw_nir_opt_peephole_ffma.c \ diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index e4c22e3..bebd244 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -468,6 +468,13 @@ struct brw_image_param { */ #define BRW_SHADER_TIME_STRIDE 64 +struct brw_ubo_range +{ + uint16_t block; + uint8_t start; + uint8_t length; +}; + struct brw_stage_prog_data { struct { /** size of our binding table. */ @@ -488,6 +495,8 @@ struct brw_stage_prog_data { /** @} */ } binding_table; + struct brw_ubo_range ubo_ranges[4]; + GLuint nr_params; /**< number of float params/constants */ GLuint nr_pull_params; unsigned nr_image_params; diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 5d866b8..560027c 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -142,6 +142,10 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader, void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog, struct brw_stage_prog_data *stage_prog_data); +void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, + nir_shader *nir, + struct brw_ubo_range out_ranges[4]); + bool brw_nir_opt_peephole_ffma(nir_shader *shader); #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0 diff --git a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c new file mode 100644 index 0000000..097aa8e --- /dev/null +++ b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c @@ -0,0 +1,298 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_nir.h" +#include "compiler/nir/nir.h" +#include "util/u_dynarray.h" + +/** + * \file brw_nir_analyze_ubo_ranges.c + * + * This pass decides which portions of UBOs to upload as push constants, + * so shaders can access them as part of the thread payload, rather than + * having to issue expensive memory reads to pull the data. + * + * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different + * buffers, in GRF (256-bit/32-byte) units. + * + * To do this, we examine NIR load_ubo intrinsics, recording the number of + * loads at each offset. We track offsets at a 32-byte granularity, so even + * fields with a bit of padding between them tend to fall into contiguous + * ranges. We build a list of these ranges, tracking their "cost" (number + * of registers required) and "benefit" (number of pull loads eliminated + * by pushing the range). We then sort the list to obtain the four best + * ranges (most benefit for the least cost). + */ + +struct ubo_range_entry +{ + struct brw_ubo_range range; + int benefit; +}; + +static int +score(const struct ubo_range_entry *entry) +{ + return 2 * entry->benefit - entry->range.length; +} + +/** + * Compares score for two UBO range entries. + * + * For a descending qsort(). + */ +static int +cmp_ubo_range_entry(const void *va, const void *vb) +{ + const struct ubo_range_entry *a = va; + const struct ubo_range_entry *b = vb; + + /* Rank based on scores */ + int delta = score(b) - score(a); + + /* Then use the UBO block index as a tie-breaker */ + if (delta == 0) + delta = b->range.block - a->range.block; + + /* Finally use the UBO offset as a second tie-breaker */ + if (delta == 0) + delta = b->range.block - a->range.block; + + return delta; +} + +struct ubo_block_info +{ + /* Each bit in the offsets bitfield represents a 32-byte section of data. + * If it's set to one, there is interesting UBO data at that offset. If + * not, there's a "hole" - padding between data - or just nothing at all. + */ + uint64_t offsets; + uint8_t uses[64]; +}; + +struct ubo_analysis_state +{ + struct hash_table *blocks; + bool uses_regular_uniforms; +}; + +static struct ubo_block_info * +get_block_info(struct ubo_analysis_state *state, int block) +{ + uint32_t hash = block + 1; + void *key = (void *) (uintptr_t) hash; + + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(state->blocks, hash, key); + + if (entry) + return (struct ubo_block_info *) entry->data; + + struct ubo_block_info *info = + rzalloc(state->blocks, struct ubo_block_info); + _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info); + + return info; +} + +static void +analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block) +{ + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_load_uniform) + state->uses_regular_uniforms = true; + + if (intrin->intrinsic != nir_intrinsic_load_ubo) + continue; + + nir_const_value *block_const = nir_src_as_const_value(intrin->src[0]); + nir_const_value *offset_const = nir_src_as_const_value(intrin->src[1]); + + if (block_const && offset_const) { + const int block = block_const->u32[0]; + const int offset = offset_const->u32[0] / 32; + + /* Won't fit in our bitfield */ + if (offset >= 64) + continue; + + /* TODO: should we count uses in loops as higher benefit? */ + + struct ubo_block_info *info = get_block_info(state, block); + info->offsets |= 1ull << offset; + info->uses[offset]++; + } + } +} + +static void +print_ubo_entry(FILE *file, + const struct ubo_range_entry *entry, + struct ubo_analysis_state *state) +{ + struct ubo_block_info *info = get_block_info(state, entry->range.block); + + fprintf(file, + "block %2d, start %2d, length %2d, bits = %zx, " + "benefit %2d, cost %2d, score = %2d\n", + entry->range.block, entry->range.start, entry->range.length, + info->offsets, entry->benefit, entry->range.length, score(entry)); +} + +void +brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, + nir_shader *nir, + struct brw_ubo_range out_ranges[4]) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + + if ((devinfo->gen <= 7 && !devinfo->is_haswell) || + !compiler->scalar_stage[nir->stage]) { + memset(out_ranges, 0, 4 * sizeof(struct brw_ubo_range)); + return; + } + + void *mem_ctx = ralloc_context(NULL); + + struct ubo_analysis_state state = { + .uses_regular_uniforms = false, + .blocks = + _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal), + }; + + /* Walk the IR, recording how many times each UBO block/offset is used. */ + nir_foreach_function(function, nir) { + if (function->impl) { + nir_foreach_block(block, function->impl) { + analyze_ubos_block(&state, block); + } + } + } + + /* Find ranges: a block, starting 32-byte offset, and length. */ + struct util_dynarray ranges; + util_dynarray_init(&ranges, mem_ctx); + + struct hash_entry *entry; + hash_table_foreach(state.blocks, entry) { + const int b = entry->hash - 1; + const struct ubo_block_info *info = entry->data; + uint64_t offsets = info->offsets; + + /* Walk through the offsets bitfield, finding contiguous regions of + * set bits: + * + * 0000000001111111111111000000000000111111111111110000000011111100 + * ^^^^^^^^^^^^^ ^^^^^^^^^^^^^^ ^^^^^^ + * + * Each of these will become a UBO range. + */ + while (offsets != 0) { + /* Find the first 1 in the offsets bitfield. This represents the + * start of a range of interesting UBO data. Make it zero-indexed. + */ + int first_bit = ffsll(offsets) - 1; + + /* Find the first 0 bit in offsets beyond first_bit. To find the + * first zero bit, we find the first 1 bit in the complement. In + * order to ignore bits before first_bit, we mask off those bits. + */ + int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1; + + if (first_hole == -1) { + /* If we didn't find a hole, then set it to the end of the + * bitfield. There are no more ranges to process. + */ + first_hole = 64; + offsets = 0; + } else { + /* We've processed all bits before first_hole. Mask them off. */ + offsets &= ~((1ull << first_hole) - 1); + } + + struct ubo_range_entry *entry = + util_dynarray_grow(&ranges, sizeof(struct ubo_range_entry)); + + entry->range.block = b; + entry->range.start = first_bit; + /* first_hole is one beyond the end, so we don't need to add 1 */ + entry->range.length = first_hole - first_bit; + entry->benefit = 0; + + for (int i = 0; i < entry->range.length; i++) + entry->benefit += info->uses[first_bit + i]; + } + } + + int nr_entries = ranges.size / sizeof(struct ubo_range_entry); + + if (0) { + util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) { + print_ubo_entry(stderr, entry, &state); + } + } + + /* TODO: Consider combining ranges. + * + * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS. If there are + * more ranges, and two are close by with only a small hole, it may be + * worth combining them. The holes will waste register space, but the + * benefit of removing pulls may outweigh that cost. + */ + + /* Sort the list so the most beneficial ranges are at the front. */ + qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry), + cmp_ubo_range_entry); + + struct ubo_range_entry *entries = ranges.data; + + /* Return the top 4 or so. We drop by one if regular uniforms are in + * use, assuming one push buffer will be dedicated to those. We may + * also only get 3 on Haswell if we can't write INSTPM. + * + * The backend may need to shrink these ranges to ensure that they + * don't exceed the maximum push constant limits. It can simply drop + * the tail of the list, as that's the least valuable portion. We + * unfortunately can't truncate it here, because we don't know what + * the backend is planning to do with regular uniforms. + */ + const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) - + state.uses_regular_uniforms; + nr_entries = MIN2(nr_entries, max_ubos); + + for (int i = 0; i < nr_entries; i++) { + out_ranges[i] = entries[i].range; + } + for (int i = nr_entries; i < 4; i++) { + out_ranges[i].block = 0; + out_ranges[i].start = 0; + out_ranges[i].length = 0; + } + + ralloc_free(ranges.mem_ctx); +} diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 0c04ef0..bd8f993 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -112,6 +112,8 @@ brw_codegen_gs_prog(struct brw_context *brw, brw_nir_setup_glsl_uniforms(gp->program.nir, &gp->program, &prog_data.base.base, compiler->scalar_stage[MESA_SHADER_GEOMETRY]); + brw_nir_analyze_ubo_ranges(compiler, gp->program.nir, + prog_data.base.base.ubo_ranges); uint64_t outputs_written = gp->program.info.outputs_written; diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c index 72c5872..1ed622e 100644 --- a/src/mesa/drivers/dri/i965/brw_tcs.c +++ b/src/mesa/drivers/dri/i965/brw_tcs.c @@ -205,6 +205,8 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp, brw_nir_setup_glsl_uniforms(nir, &tcp->program, &prog_data.base.base, compiler->scalar_stage[MESA_SHADER_TESS_CTRL]); + brw_nir_analyze_ubo_ranges(compiler, tcp->program.nir, + prog_data.base.base.ubo_ranges); } else { /* Upload the Patch URB Header as the first two uniforms. * Do the annoying scrambling so the shader doesn't have to. diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c index 372ef51..20ce1f4 100644 --- a/src/mesa/drivers/dri/i965/brw_tes.c +++ b/src/mesa/drivers/dri/i965/brw_tes.c @@ -102,6 +102,8 @@ brw_codegen_tes_prog(struct brw_context *brw, brw_nir_setup_glsl_uniforms(nir, &tep->program, &prog_data.base.base, compiler->scalar_stage[MESA_SHADER_TESS_EVAL]); + brw_nir_analyze_ubo_ranges(compiler, tep->program.nir, + prog_data.base.base.ubo_ranges); int st_index = -1; if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 33f2ac1..c0a0a13 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -203,6 +203,8 @@ brw_codegen_vs_prog(struct brw_context *brw, brw_nir_setup_glsl_uniforms(vp->program.nir, &vp->program, &prog_data.base.base, compiler->scalar_stage[MESA_SHADER_VERTEX]); + brw_nir_analyze_ubo_ranges(compiler, vp->program.nir, + prog_data.base.base.ubo_ranges); } else { brw_nir_setup_arb_uniforms(vp->program.nir, &vp->program, &prog_data.base.base); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 71118c1..3a5fcf5 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -165,6 +165,8 @@ brw_codegen_wm_prog(struct brw_context *brw, if (!fp->program.is_arb_asm) { brw_nir_setup_glsl_uniforms(fp->program.nir, &fp->program, &prog_data.base, true); + brw_nir_analyze_ubo_ranges(brw->screen->compiler, fp->program.nir, + prog_data.base.ubo_ranges); } else { brw_nir_setup_arb_uniforms(fp->program.nir, &fp->program, &prog_data.base); -- 2.7.4