aco: improve do_pack_2x16() with zero constants
author: Rhys Perry <pendingchaos02@gmail.com>
Mon, 28 Nov 2022 19:18:32 +0000 (19:18 +0000)
committer: Marge Bot <emma+marge@anholt.net>
Thu, 1 Dec 2022 21:43:28 +0000 (21:43 +0000)
When one half of the pack is a constant zero, we can skip the v_or_b32 or
use an instruction smaller than v_alignbyte_b32.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>

src/amd/compiler/aco_lower_to_hw_instr.cpp
src/amd/compiler/tests/test_to_hw_instr.cpp

index 95c8d77..5d4ae43 100644 (file)
@@ -1451,8 +1451,8 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
 
    /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
    if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
-       (!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||
-        ctx->program->gfx_level >= GFX10)) {
+       (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
+                                                    ctx->program->gfx_level >= GFX10)))) {
       if (hi.isConstant())
          bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
                   Operand::c32(2u));
@@ -1470,8 +1470,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
          bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
       else
          bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
-      bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
-               Operand(def.physReg(), v1));
+      if (lo.constantValue())
+         bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
+                  Operand(def.physReg(), v1));
       return;
    }
    if (hi.isConstant()) {
@@ -1482,8 +1483,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
          bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
       else
          bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
-      bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
-               Operand(def.physReg(), v1));
+      if (hi.constantValue())
+         bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
+                  Operand(def.physReg(), v1));
       return;
    }
 
index 91d049e..9d2a272 100644 (file)
@@ -841,26 +841,54 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
    finish_to_hw_instr_test();
 END_TEST
 
-BEGIN_TEST(to_hw_instr.pack2x16_alignbyte_constant)
+BEGIN_TEST(to_hw_instr.pack2x16_constant)
    PhysReg v0_lo{256};
    PhysReg v0_hi{256};
+   PhysReg v1_lo{257};
    PhysReg v1_hi{257};
    v0_hi.reg_b += 2;
    v1_hi.reg_b += 2;
 
-   if (!setup_cs(NULL, GFX10))
-      return;
+   for (amd_gfx_level lvl : {GFX10, GFX11}) {
+      if (!setup_cs(NULL, lvl))
+         continue;
 
-   /* prevent usage of v_pack_b32_f16 */
-   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
+      /* prevent usage of v_pack_b32_f16 */
+      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
 
-   //>> p_unit_test 0
-   //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
-   bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
-   bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
-              Operand(v1_hi, v2b), Operand::c16(0x3800));
+      //>> p_unit_test 0
+      //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
+      bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+                 Operand(v1_hi, v2b), Operand::c16(0x3800));
 
-   //! s_endpgm
+      //! p_unit_test 1
+      //! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+                 Operand(v1_hi, v2b), Operand::zero(2));
 
-   finish_to_hw_instr_test();
+      //! p_unit_test 2
+      //~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
+      //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+                 Operand(v1_lo, v2b), Operand::zero(2));
+
+      //! p_unit_test 3
+      //! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+                 Operand::zero(2), Operand(v1_hi, v2b));
+
+      //! p_unit_test 4
+      //! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
+      bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
+      bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
+                 Operand::zero(2), Operand(v1_lo, v2b));
+
+      //! s_endpgm
+
+      finish_to_hw_instr_test();
+   }
 END_TEST