From b2da1238012c751a8ad36c9a51d3fec46a292b0d Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 29 Nov 2016 05:20:20 -0800 Subject: [PATCH] i965: Use pushed UBO data in the scalar backend. This actually takes advantage of the newly pushed UBO data, avoiding pull loads. Improves performance in GLBenchmark Manhattan 3.1 by: HSW: ~1%, BDW/SKL/KBL GT2: 3-4%, SKL GT4: 7-8%, APL: 4-5%. (thanks to Eero Tamminen for these numbers) shader-db results on Skylake, ignoring programs with spill/fill changes: total instructions in shared programs: 13963994 -> 13651893 (-2.24%) instructions in affected programs: 4250328 -> 3938227 (-7.34%) helped: 28527 HURT: 0 total cycles in shared programs: 179808608 -> 172535170 (-4.05%) cycles in affected programs: 79720410 -> 72446972 (-9.12%) helped: 26951 HURT: 1248 LOST: 46 GAINED: 21 Many "Deus Ex: Mankind Divided" shaders which already spilled end up spilling a lot more (about 240 programs hurt, 9 helped). The cycle estimator suggests this is still overall a win (-0.23% in cycle counts) presumably because we trade pull loads for fills. v2: Drop "PULL" environment variable left in for initial debugging (caught by Matt). 
Reviewed-by: Matt Turner --- src/intel/compiler/brw_fs.cpp | 35 ++++++++++++++++++++++++++++++++++- src/intel/compiler/brw_fs.h | 2 ++ src/intel/compiler/brw_fs_nir.cpp | 28 ++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d631bd0..ab9e955 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1386,7 +1386,9 @@ fs_visitor::assign_curb_setup() unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); unsigned ubo_push_length = 0; + unsigned ubo_push_start[4]; for (int i = 0; i < 4; i++) { + ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length); ubo_push_length += stage_prog_data->ubo_ranges[i].length; } @@ -1398,7 +1400,11 @@ fs_visitor::assign_curb_setup() if (inst->src[i].file == UNIFORM) { int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; int constant_nr; - if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { + if (inst->src[i].nr >= UBO_START) { + /* constant_nr is in 32-bit units, the rest are in bytes */ + constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] + + inst->src[i].offset / 4; + } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { constant_nr = push_constant_loc[uniform_nr]; } else { /* Section 5.11 of the OpenGL 4.1 spec says: @@ -2069,6 +2075,20 @@ fs_visitor::assign_constant_locations() stage_prog_data->nr_params = num_push_constants; stage_prog_data->nr_pull_params = num_pull_constants; + /* Now that we know how many regular uniforms we'll push, reduce the + * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits. 
+ */ + unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + for (int i = 0; i < 4; i++) { + struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + + if (push_length + range->length > 64) + range->length = 64 - push_length; + + push_length += range->length; + } + assert(push_length <= 64); + /* Up until now, the param[] array has been indexed by reg + offset * of UNIFORM registers. Move pull constants into pull_param[] and * condense param[] to only contain the uniforms we chose to push. @@ -2103,6 +2123,19 @@ fs_visitor::get_pull_locs(const fs_reg &src, { assert(src.file == UNIFORM); + if (src.nr >= UBO_START) { + const struct brw_ubo_range *range = + &prog_data->ubo_ranges[src.nr - UBO_START]; + + /* If this access is in our (reduced) range, use the push data. */ + if (src.offset / 32 < range->length) + return false; + + *out_surf_index = prog_data->binding_table.ubo_start + range->block; + *out_pull_index = (32 * range->start + src.offset) / 4; + return true; + } + const unsigned location = src.nr + src.offset / 4; if (location < uniforms && pull_constant_loc[location] != -1) { diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index beb0d97..f1ba193 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -50,6 +50,8 @@ offset(const fs_reg ®, const brw::fs_builder &bld, unsigned delta) return offset(reg, bld.dispatch_width(), delta); } +#define UBO_START ((1 << 16) - 4) + /** * The fragment shader front-end. * diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index aea2c62..b6fa0f0 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3822,6 +3822,34 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * and we have to split it if necessary. 
*/ const unsigned type_size = type_sz(dest.type); + + /* See if we've selected this as a push constant candidate */ + if (const_index) { + const unsigned ubo_block = const_index->u32[0]; + const unsigned offset_256b = const_offset->u32[0] / 32; + + fs_reg push_reg; + for (int i = 0; i < 4; i++) { + const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; + if (range->block == ubo_block && + offset_256b >= range->start && + offset_256b < range->start + range->length) { + + push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); + push_reg.offset = const_offset->u32[0] - 32 * range->start; + break; + } + } + + if (push_reg.file != BAD_FILE) { + for (unsigned i = 0; i < instr->num_components; i++) { + bld.MOV(offset(dest, bld, i), + byte_offset(push_reg, i * type_size)); + } + break; + } + } + const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); -- 2.7.4