From f0a8a9816afce4f30be64a8cdf7560a4282eb048 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 25 Jan 2021 16:31:17 -0800
Subject: [PATCH] nir: intel/compiler: Add and use nir_op_pack_32_4x8_split

A lot of CTS tests write a u8vec4 or an i8vec4 to an SSBO.  This results
in a lot of shifts and MOVs.  When that pattern can be recognized, the
individual 8-bit components can be packed much more efficiently.

v2: Rebase on b4369de27fc ("nir/lower_packing: use
shader_instructions_pass")

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9025>
---
 src/compiler/nir/nir.h                |  3 +++
 src/compiler/nir/nir_lower_packing.c  | 16 ++++++++++++++--
 src/compiler/nir/nir_opcodes.py       |  4 ++++
 src/compiler/nir/nir_opt_algebraic.py |  4 ++++
 src/intel/compiler/brw_compiler.c     |  1 +
 src/intel/compiler/brw_fs_nir.cpp     |  5 +++++
 6 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 3eea054..d1abbc3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3666,6 +3666,9 @@ typedef struct nir_shader_compiler_options {
     * iadd(x, ineg(y)). If true, driver should call nir_opt_algebraic_late(). */
    bool has_isub;
 
+   /** Backend supports pack_32_4x8 or pack_32_4x8_split. */
+   bool has_pack_32_4x8;
+
    /** Backend supports txs, if not nir_lower_tex(..) uses txs-free variants
     * for rect texture lowering. */
    bool has_txs;
diff --git a/src/compiler/nir/nir_lower_packing.c b/src/compiler/nir/nir_lower_packing.c
index 57bbeac..457726c 100644
--- a/src/compiler/nir/nir_lower_packing.c
+++ b/src/compiler/nir/nir_lower_packing.c
@@ -86,6 +86,15 @@ lower_unpack_64_to_16(nir_builder *b, nir_ssa_def *src)
                       nir_unpack_32_2x16_split_y(b, zw));
 }
 
+static nir_ssa_def *
+lower_pack_32_from_8(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_pack_32_4x8_split(b, nir_channel(b, src, 0),
+                                   nir_channel(b, src, 1),
+                                   nir_channel(b, src, 2),
+                                   nir_channel(b, src, 3));
+}
+
 static bool
 lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -99,8 +108,8 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
        alu_instr->op != nir_op_pack_64_4x16 &&
        alu_instr->op != nir_op_unpack_64_4x16 &&
        alu_instr->op != nir_op_pack_32_2x16 &&
-       alu_instr->op != nir_op_unpack_32_2x16)
-
+       alu_instr->op != nir_op_unpack_32_2x16 &&
+       alu_instr->op != nir_op_pack_32_4x8)
       return false;
 
    b->cursor = nir_before_instr(&alu_instr->instr);
@@ -127,6 +136,9 @@ lower_pack_instr(nir_builder *b, nir_instr *instr, void *data)
    case nir_op_unpack_32_2x16:
       dest = lower_unpack_32_to_16(b, src);
       break;
+   case nir_op_pack_32_4x8:
+      dest = lower_pack_32_from_8(b, src);
+      break;
    default:
       unreachable("Impossible opcode");
    }
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 4a5241a..6b2fc24 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -897,6 +897,10 @@ binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
               "src0 | ((uint32_t)src1 << 16)")
 
+opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
+       False, "",
+       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")
+
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
 # are from the low five bits of src0 and src1, respectively.
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 264d49a..449f114 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -1313,6 +1313,10 @@ optimizations.extend([
    (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
    (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
 
+   # Packing a u8vec4 to write to an SSBO.
+   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
+    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
+
    (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
    (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
 
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index ec6b591..629f1cb 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -68,6 +68,7 @@
    .lower_usub_sat64 = true,                                                  \
    .lower_hadd64 = true,                                                      \
    .avoid_ternary_with_two_constants = true,                                  \
+   .has_pack_32_4x8 = true,                                                   \
    .max_unroll_iterations = 32,                                               \
    .force_indirect_unrolling = nir_var_function_temp
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 2bc8ea4..53f347b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -988,6 +988,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
    case nir_op_u2u32:
    case nir_op_iabs:
    case nir_op_ineg:
+   case nir_op_pack_32_4x8_split:
       break;
 
    default:
@@ -1721,6 +1722,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
       break;
 
+   case nir_op_pack_32_4x8_split:
+      bld.emit(FS_OPCODE_PACK, result, op, 4);
+      break;
+
    case nir_op_unpack_64_2x32_split_x:
    case nir_op_unpack_64_2x32_split_y: {
       if (instr->op == nir_op_unpack_64_2x32_split_x)
-- 
2.7.4