From: Ian Romanick Date: Sat, 23 Jan 2021 22:28:07 +0000 (-0800) Subject: intel/compiler: Lower 8-bit ops to 16-bit in NIR on all platforms X-Git-Tag: upstream/22.3.5~19060 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5ce3bfcdf3154a65c37386f908b3f053b7fd6a61;p=platform%2Fupstream%2Fmesa.git intel/compiler: Lower 8-bit ops to 16-bit in NIR on all platforms This fixes the Crucible func.shader.shift.int8_t test on Gen8 and Gen9. See https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. With the previous optimizations in place, this change seems to improve the quality of the generated code. Comparing a couple Vulkan CTS tests on Skylake had the following results. dEQP-VK.spirv_assembly.type.vec3.i8.bitwise_xor_frag: SIMD8 shader: 36 instructions. 1 loops. 3822 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 27 instructions. 1 loops. 2742 cycles. 0:0 spills:fills, 5 sends dEQP-VK.spirv_assembly.type.vec3.i8.max_frag: SIMD8 shader: 39 instructions. 1 loops. 3922 cycles. 0:0 spills:fills, 5 sends SIMD8 shader: 37 instructions. 1 loops. 3682 cycles. 0:0 spills:fills, 5 sends Reviewed-by: Francisco Jerez Reviewed-by: Lionel Landwerlin Part-of: --- diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index 629f1cb..cb5655e 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -193,9 +193,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) nir_options->lower_int64_options = int64_options; nir_options->lower_doubles_options = fp64_options; - /* Starting with Gfx11, we lower away 8-bit arithmetic */ - nir_options->support_8bit_alu = devinfo->ver < 11; - nir_options->unify_interfaces = i < MESA_SHADER_FRAGMENT; nir_options->force_indirect_unrolling |= diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 53f347b..10d3f0e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -993,8 +993,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, default: for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - assert((devinfo->ver == 8 || devinfo->ver == 9) || - type_sz(op[i].type) > 1); + assert(type_sz(op[i].type) > 1); } } #endif @@ -1836,6 +1835,35 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_bitfield_insert: unreachable("not reached: should have been lowered"); + /* For all shift operations: + * + * Gen4 - Gen7: After application of source modifiers, the low 5-bits of + * src1 are used an unsigned value for the shift count. + * + * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low + * 6-bit of src1 are used. + * + * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for + * W or UW src0). + * + * The implication is that the following instruction will produce a + * different result on Gen9+ than on previous platforms: + * + * shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW + * + * where Gen9+ will shift by zero, and earlier platforms will shift by 16. + * + * This does not seem to be the case. Experimentally, it has been + * determined that shifts of 16-bit values on Gen8 behave properly. Shifts + * of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit + * values, so those platforms were not tested. No features expose access + * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not + * tested either. See + * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76. + * + * This is part of the reason 8-bit values are lowered to 16-bit on all + * platforms. + */ case nir_op_ishl: bld.SHL(result, op[0], op[1]); break; diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 6b97326..5f64ef1 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -675,15 +675,14 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) assert(!"Should have been lowered by nir_opt_algebraic."); return 0; default: - if (devinfo->ver >= 11) { - if (nir_op_infos[alu->op].num_inputs >= 2 && - alu->dest.dest.ssa.bit_size == 8) - return 16; - - if (nir_alu_instr_is_comparison(alu) && - alu->src[0].src.ssa->bit_size == 8) - return 16; - } + if (nir_op_infos[alu->op].num_inputs >= 2 && + alu->dest.dest.ssa.bit_size == 8) + return 16; + + if (nir_alu_instr_is_comparison(alu) && + alu->src[0].src.ssa->bit_size == 8) + return 16; + return 0; } break; @@ -704,7 +703,7 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_intrinsic_quad_swap_horizontal: case nir_intrinsic_quad_swap_vertical: case nir_intrinsic_quad_swap_diagonal: - if (intrin->src[0].ssa->bit_size == 8 && devinfo->ver >= 11) + if (intrin->src[0].ssa->bit_size == 8) return 16; return 0; @@ -737,7 +736,7 @@ lower_bit_size_callback(const nir_instr *instr, UNUSED void *data) case nir_instr_type_phi: { nir_phi_instr *phi = nir_instr_as_phi(instr); - if (devinfo->ver >= 11 && phi->dest.ssa.bit_size == 8) + if (phi->dest.ssa.bit_size == 8) return 16; return 0; }