aco/tests: add tests for v_fma_f32 with 2 fp16 literals
author Rhys Perry <pendingchaos02@gmail.com>
Wed, 1 Mar 2023 17:57:07 +0000 (17:57 +0000)
committer Rhys Perry <pendingchaos02@gmail.com>
Fri, 3 Mar 2023 14:20:55 +0000 (14:20 +0000)
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21633>

src/amd/compiler/tests/test_optimizer.cpp

index 205c1ef..7c109ed 100644 (file)
@@ -1836,3 +1836,114 @@ BEGIN_TEST(optimize.vop3p_constants)
       }
    }
 END_TEST
+
+BEGIN_TEST(optimize.fmamix_two_literals)
+   /* Checks when an fma with two literal operands becomes v_fma_mix_f32 with both packed as fp16
+    * halves of one literal. Literals are sometimes recreated because we don't combine them at all
+    * if there's at least one uncombined use. */
+   for (unsigned i = GFX10; i <= GFX10_3; i++) {
+      //>> v1: %a, v1: %b = p_startpgm
+      if (!setup_cs("v1 v1", (amd_gfx_level)i))
+         continue;
+
+      Temp a = inputs[0];
+      Temp b = inputs[1];
+
+      Temp c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
+      Temp c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
+      Temp c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
+
+      //! v1: %res0 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
+      //! p_unit_test 0, %res0
+      writeout(0, fma(a, c15, c30));
+
+      /* Only one literal operand, so there's no need to use v_fma_mix_f32. */
+      //! v1: %res1 = v_fmaak_f32 %a, %b, 0x40400000
+      //! p_unit_test 1, %res1
+      writeout(1, fma(a, b, c30));
+
+      /* Separate mul/add can become v_fma_mix_f32 if it's not precise. */
+      //! v1: %res2 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
+      //! p_unit_test 2, %res2
+      writeout(2, fadd(fmul(a, c15), c30));
+
+      //~gfx10! v1: %c15 = p_parallelcopy 0x3fc00000
+      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
+      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
+
+      /* v_fma_mix_f32 is a fused mul/add, so it can't be used for precise separate mul/add. */
+      //~gfx10! v1: (precise)%res3 = v_madak_f32 %a, %c15, 0x40400000
+      //~gfx10_3! v1: (precise)%res3_tmp = v_mul_f32 %a, 0x3fc00000
+      //~gfx10_3! v1: %res3 = v_add_f32 %res3_tmp, 0x40400000
+      //! p_unit_test 3, %res3
+      writeout(3, fadd(bld.precise().vop2(aco_opcode::v_mul_f32, bld.def(v1), a, c15), c30));
+
+      //~gfx10! v1: (precise)%res4 = v_madak_f32 %1, %c16, 0x40400000
+      //~gfx10_3! v1: %res4_tmp = v_mul_f32 %a, 0x3fc00000
+      //~gfx10_3! v1: (precise)%res4 = v_add_f32 %res4_tmp, 0x40400000
+      //! p_unit_test 4, %res4
+      writeout(4, bld.precise().vop2(aco_opcode::v_add_f32, bld.def(v1), fmul(a, c15), c30));
+
+      /* A literal that's denormal as fp16 is converted only if fp16 denormals aren't flushed. */
+      //! v1: %res5 = v_fma_mix_f32 %1, lo(0x3ff3e00), hi(0x3ff3e00)
+      //! p_unit_test 5, %res5
+      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
+      writeout(5, fma(a, c15, c_denorm));
+
+      //>> BB1
+      //! /* logical preds: / linear preds: / kind: uniform, */
+      program->next_fp_mode.denorm16_64 = fp_denorm_flush;
+      bld.reset(program->create_and_insert_block());
+
+      //~gfx10; del c15
+      //! v1: %c15 = p_parallelcopy 0x3fc00000
+      //! v1: %res6 = v_fmaak_f32 %a, %c15, 0x387fc000
+      //! p_unit_test 6, %res6
+      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
+      c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
+      writeout(6, fma(a, c15, c_denorm));
+
+      /* Only two fp16 literals can be packed: with 3 unique ones, one stays a separate copy. */
+      //! v1: %c45 = p_parallelcopy 0x40900000
+      //! v1: %res7 = v_fma_mix_f32 lo(0x42003e00), hi(0x42003e00), %c45
+      //! p_unit_test 7, %res7
+      Temp c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
+      writeout(7, fma(c15, c30, c45));
+
+      /* Input modifiers (neg/abs) and clamp must be preserved. */
+      //! v1: %res8 = v_fma_mix_f32 -%a, lo(0x44804200), hi(0x44804200)
+      //! p_unit_test 8, %res8
+      writeout(8, fma(fneg(a), c30, c45));
+
+      //! v1: %res9 = v_fma_mix_f32 lo(0x44804200), |%a|, hi(0x44804200)
+      //! p_unit_test 9, %res9
+      writeout(9, fma(c30, fabs(a), c45));
+
+      //! v1: %res10 = v_fma_mix_f32 %a, lo(0x44804200), hi(0x44804200) clamp
+      //! p_unit_test 10, %res10
+      writeout(10, fsat(fma(a, c30, c45)));
+
+      /* Output modifiers (here *0.5) are not supported by v_fma_mix_f32. */
+      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
+      //; del c45
+      //! v1: %c45 = p_parallelcopy 0x40900000
+      //! v1: %res11 = v_fma_f32 %a, 0x40400000, %c45 *0.5
+      //! p_unit_test 11, %res11
+      c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
+      writeout(11, fmul(fma(a, c30, c45), bld.copy(bld.def(v1), Operand::c32(0x3f000000))));
+
+      /* Has a literal which can't be represented as fp16. */
+      //! v1: %c03 = p_parallelcopy 0x3e99999a
+      //! v1: %res12 = v_fmaak_f32 %a, %c03, 0x40400000
+      //! p_unit_test 12, %res12
+      Temp c03 = bld.copy(bld.def(v1), Operand::c32(fui(0.3f)));
+      writeout(12, fma(a, c03, c30));
+
+      /* We should still use fmaak/fmamk if the two literals are identical. */
+      //! v1: %res13 = v_fmaak_f32 0x40400000, %a, 0x40400000
+      //! p_unit_test 13, %res13
+      writeout(13, fma(a, c30, c30));
+
+      finish_opt_test();
+   }
+END_TEST