From a25f96c00cfe4f4ad7f5f079ba3e11d4da2e1994 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Mon, 7 Aug 2023 14:03:57 +0300 Subject: [PATCH] intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To work around this we use UNDEF instructions to bound the liveness of the register. But this causes other issues, as in this case: undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass is unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch to NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps: Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, losing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, losing 4 One app slightly negatively affected: Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live 
registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin Reviewed-by: Kenneth Graunke Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 166c67c..34df9a8 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4003,7 +4003,7 @@ fs_reg fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_def) { /* Create a build at the location of the resource_intel intrinsic */ - fs_builder ubld1 = bld.exec_all().group(1, 0); + fs_builder ubld8 = bld.exec_all().group(8, 0); struct rebuild_resource resources = {}; resources.idx = 0; @@ -4041,10 +4041,9 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d case nir_instr_type_load_const: { nir_load_const_instr *load_const = nir_instr_as_load_const(instr); - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); nir_resource_insts[def->index] = - ubld1.group(8, 0).MOV(dst, brw_imm_ud(load_const->value[0].i32)); + ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32)); break; } @@ -4067,52 +4066,47 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d switch (alu->op) { case nir_op_iadd: { - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst; fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst; assert(src0.file != BAD_FILE && src1.file != BAD_FILE); assert(src0.type == BRW_REGISTER_TYPE_UD); nir_resource_insts[def->index] = - ubld1.ADD(dst, + ubld8.ADD(dst, src0.file != IMM ? src0 : src1, src0.file != IMM ? 
src1 : src0); break; } case nir_op_iadd3: { - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst; fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst; fs_reg src2 = nir_resource_insts[alu->src[2].src.ssa->index]->dst; assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE); assert(src0.type == BRW_REGISTER_TYPE_UD); nir_resource_insts[def->index] = - ubld1.ADD3(dst, + ubld8.ADD3(dst, src1.file == IMM ? src1 : src0, src1.file == IMM ? src0 : src1, src2); break; } case nir_op_ushr: { - assert(ubld1.dispatch_width() == 1); - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst; fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst; assert(src0.file != BAD_FILE && src1.file != BAD_FILE); assert(src0.type == BRW_REGISTER_TYPE_UD); - nir_resource_insts[def->index] = ubld1.SHR(dst, src0, src1); + nir_resource_insts[def->index] = ubld8.SHR(dst, src0, src1); break; } case nir_op_ishl: { - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD); fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst; fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst; assert(src0.file != BAD_FILE && src1.file != BAD_FILE); assert(src0.type == BRW_REGISTER_TYPE_UD); - nir_resource_insts[def->index] = ubld1.SHL(dst, src0, src1); + nir_resource_insts[def->index] = ubld8.SHL(dst, src0, src1); break; } case nir_op_mov: { @@ -4138,11 +4132,10 @@ fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_d unsigned base_offset = nir_intrinsic_base(intrin); unsigned load_offset = nir_src_as_uint(intrin->src[0]); - fs_reg dst = ubld1.vgrf(BRW_REGISTER_TYPE_UD); - ubld1.UNDEF(dst); + fs_reg dst 
= ubld8.vgrf(BRW_REGISTER_TYPE_UD); fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD); src.offset = load_offset + base_offset % 4; - nir_resource_insts[def->index] = ubld1.MOV(dst, src); + nir_resource_insts[def->index] = ubld8.MOV(dst, src); break; } -- 2.7.4