From f0a8a9816afce4f30be64a8cdf7560a4282eb048 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 25 Jan 2021 16:31:17 -0800 Subject: [PATCH] nir: intel/compiler: Add and use nir_op_pack_32_4x8_split A lot of CTS tests write a u8vec4 or an i8vec4 to an SSBO. This results in a lot of shifts and MOVs. When that pattern can be recognized, the individual 8-bit components can be packed much more efficiently. v2: Rebase on b4369de27fc ("nir/lower_packing: use shader_instructions_pass") Reviewed-by: Lionel Landwerlin Part-of: --- src/compiler/nir/nir.h | 3 +++ src/compiler/nir/nir_lower_packing.c | 16 ++++++++++++++-- src/compiler/nir/nir_opcodes.py | 4 ++++ src/compiler/nir/nir_opt_algebraic.py | 4 ++++ src/intel/compiler/brw_compiler.c | 1 + src/intel/compiler/brw_fs_nir.cpp | 5 +++++ 6 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 3eea054..d1abbc3 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3666,6 +3666,9 @@ typedef struct nir_shader_compiler_options { * iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */ bool has_isub; + /** Backend supports pack_32_4x8 or pack_32_4x8_split. */ + bool has_pack_32_4x8; + /** Backend supports txs, if not nir_lower_tex(..) uses txs-free variants * for rect texture lowering. */ bool has_txs; diff --git a/src/compiler/nir/nir_lower_packing.c b/src/compiler/nir/nir_lower_packing.c index 57bbeac..457726c 100644 --- a/src/compiler/nir/nir_lower_packing.c +++ b/src/compiler/nir/nir_lower_packing.c @@ -86,6 +86,15 @@ lower_unpack_64_to_16(nir_builder *b, nir_ssa_def *src) nir_unpack_32_2x16_split_y(b, zw)); } +static nir_ssa_def * +lower_pack_32_from_8(nir_builder *b, nir_ssa_def *src) +{ + return nir_pack_32_4x8_split(b, nir_channel(b, src, 0), + nir_channel(b, src, 1), + nir_channel(b, src, 2), + nir_channel(b, src, 3)); +} + static bool lower_pack_instr(nir_builder *b, nir_instr *instr, void *data) { @@ -99,8 +108,8 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data) alu_instr->op != nir_op_pack_64_4x16 && alu_instr->op != nir_op_unpack_64_4x16 && alu_instr->op != nir_op_pack_32_2x16 && - alu_instr->op != nir_op_unpack_32_2x16) - + alu_instr->op != nir_op_unpack_32_2x16 && + alu_instr->op != nir_op_pack_32_4x8) return false; b->cursor = nir_before_instr(&alu_instr->instr); @@ -127,6 +136,9 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data) case nir_op_unpack_32_2x16: dest = lower_unpack_32_to_16(b, src); break; + case nir_op_pack_32_4x8: + dest = lower_pack_32_from_8(b, src); + break; default: unreachable("Impossible opcode"); } diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 4a5241a..6b2fc24 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -897,6 +897,10 @@ binop_convert("pack_64_2x32_split", tuint64, tuint32, "", binop_convert("pack_32_2x16_split", tuint32, tuint16, "", "src0 | ((uint32_t)src1 << 16)") +opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8], + False, "", + "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)") + # bfm implements the behavior of the first operation of the SM5 "bfi" assembly # and that of the "bfi1" i965 instruction. That is, the bits and offset values # are from the low five bits of src0 and src1, respectively. diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 264d49a..449f114 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -1313,6 +1313,10 @@ optimizations.extend([ (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), + # Packing a u8vec4 to write to an SSBO. + (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), + ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), + (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index ec6b591..629f1cb 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -68,6 +68,7 @@ .lower_usub_sat64 = true, \ .lower_hadd64 = true, \ .avoid_ternary_with_two_constants = true, \ + .has_pack_32_4x8 = true, \ .max_unroll_iterations = 32, \ .force_indirect_unrolling = nir_var_function_temp diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 2bc8ea4..53f347b 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -988,6 +988,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, case nir_op_u2u32: case nir_op_iabs: case nir_op_ineg: + case nir_op_pack_32_4x8_split: break; default: @@ -1721,6 +1722,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr, bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); break; + case nir_op_pack_32_4x8_split: + bld.emit(FS_OPCODE_PACK, result, op, 4); + break; + case nir_op_unpack_64_2x32_split_x: case nir_op_unpack_64_2x32_split_y: { if (instr->op == nir_op_unpack_64_2x32_split_x) -- 2.7.4