rs6000: Do not allow combining of multiple assemble quads [PR103548]
author Peter Bergner <bergner@linux.ibm.com>
Tue, 14 Dec 2021 20:50:41 +0000 (14:50 -0600)
committer Peter Bergner <bergner@linux.ibm.com>
Tue, 14 Dec 2021 21:00:00 +0000 (15:00 -0600)
The compiler will gladly CSE the result of two __builtin_mma_build_acc
calls with the same four vector arguments, leading to illegal MMA
code being generated.  The fix here is to make the mma_assemble_acc
pattern use an unspec_volatile to stop the CSE from happening.

2021-12-14  Peter Bergner  <bergner@linux.ibm.com>

gcc/
PR target/103548
* config/rs6000/mma.md (UNSPEC_MMA_ASSEMBLE): Rename unspec from this...
(UNSPEC_VSX_ASSEMBLE): ...to this.
(UNSPECV_MMA_ASSEMBLE): New unspecv.
(vsx_assemble_pair): Use UNSPEC_VSX_ASSEMBLE.
(*vsx_assemble_pair): Likewise.
(mma_assemble_acc): Use UNSPECV_MMA_ASSEMBLE.
(*mma_assemble_acc): Likewise.
* config/rs6000/rs6000.c (rs6000_split_multireg_move): Handle
UNSPEC_VOLATILE.  Use UNSPEC_VSX_ASSEMBLE and UNSPECV_MMA_ASSEMBLE.

gcc/testsuite/
PR target/103548
* gcc.target/powerpc/mma-builtin-10-pair.c: New test.
* gcc.target/powerpc/mma-builtin-10-quad.c: New test.

gcc/config/rs6000/mma.md
gcc/config/rs6000/rs6000.c
gcc/testsuite/gcc.target/powerpc/mma-builtin-10-pair.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/mma-builtin-10-quad.c [new file with mode: 0644]

index fa08160..8a26205 100644 (file)
@@ -29,7 +29,7 @@
 ;; Constants for creating unspecs
 
 (define_c_enum "unspec"
-  [UNSPEC_MMA_ASSEMBLE
+  [UNSPEC_VSX_ASSEMBLE
    UNSPEC_MMA_EXTRACT
    UNSPEC_MMA_PMXVBF16GER2
    UNSPEC_MMA_PMXVBF16GER2NN
@@ -94,7 +94,8 @@
   ])
 
 (define_c_enum "unspecv"
-  [UNSPECV_MMA_XXSETACCZ
+  [UNSPECV_MMA_ASSEMBLE
+   UNSPECV_MMA_XXSETACCZ
   ])
 
 ;; MMA instructions with 1 accumulator argument
 {
   rtx src = gen_rtx_UNSPEC (OOmode,
                            gen_rtvec (2, operands[1], operands[2]),
-                           UNSPEC_MMA_ASSEMBLE);
+                           UNSPEC_VSX_ASSEMBLE);
   emit_move_insn (operands[0], src);
   DONE;
 })
   [(set (match_operand:OO 0 "vsx_register_operand" "=&wa")
        (unspec:OO [(match_operand:V16QI 1 "mma_assemble_input_operand" "mwa")
                    (match_operand:V16QI 2 "mma_assemble_input_operand" "mwa")]
-                   UNSPEC_MMA_ASSEMBLE))]
+                  UNSPEC_VSX_ASSEMBLE))]
   "TARGET_MMA"
   "#"
   "&& reload_completed"
 {
   rtx src = gen_rtx_UNSPEC (OOmode,
                            gen_rtvec (2, operands[1], operands[2]),
-                           UNSPEC_MMA_ASSEMBLE);
+                           UNSPEC_VSX_ASSEMBLE);
   rs6000_split_multireg_move (operands[0], src);
   DONE;
 })
    (match_operand:V16QI 4 "mma_assemble_input_operand")]
   "TARGET_MMA"
 {
-  rtx src = gen_rtx_UNSPEC (XOmode,
-                           gen_rtvec (4, operands[1], operands[2],
-                                      operands[3], operands[4]),
-                           UNSPEC_MMA_ASSEMBLE);
+  rtx src = gen_rtx_UNSPEC_VOLATILE (XOmode,
+                                    gen_rtvec (4, operands[1], operands[2],
+                                               operands[3], operands[4]),
+                                    UNSPECV_MMA_ASSEMBLE);
   emit_move_insn (operands[0], src);
   DONE;
 })
 
 (define_insn_and_split "*mma_assemble_acc"
   [(set (match_operand:XO 0 "fpr_reg_operand" "=&d")
-       (unspec:XO [(match_operand:V16QI 1 "mma_assemble_input_operand" "mwa")
-                   (match_operand:V16QI 2 "mma_assemble_input_operand" "mwa")
-                   (match_operand:V16QI 3 "mma_assemble_input_operand" "mwa")
-                   (match_operand:V16QI 4 "mma_assemble_input_operand" "mwa")]
-                   UNSPEC_MMA_ASSEMBLE))]
+       (unspec_volatile:XO
+         [(match_operand:V16QI 1 "mma_assemble_input_operand" "mwa")
+          (match_operand:V16QI 2 "mma_assemble_input_operand" "mwa")
+          (match_operand:V16QI 3 "mma_assemble_input_operand" "mwa")
+          (match_operand:V16QI 4 "mma_assemble_input_operand" "mwa")]
+         UNSPECV_MMA_ASSEMBLE))]
   "TARGET_MMA
    && fpr_reg_operand (operands[0], XOmode)"
   "#"
   "&& reload_completed"
   [(const_int 0)]
 {
-  rtx src = gen_rtx_UNSPEC (XOmode,
-                           gen_rtvec (4, operands[1], operands[2],
-                                      operands[3], operands[4]),
-                           UNSPEC_MMA_ASSEMBLE);
+  rtx src = gen_rtx_UNSPEC_VOLATILE (XOmode,
+                                    gen_rtvec (4, operands[1], operands[2],
+                                               operands[3], operands[4]),
+                                    UNSPECV_MMA_ASSEMBLE);
   rs6000_split_multireg_move (operands[0], src);
   DONE;
 })
index 70df511..9fc1577 100644 (file)
@@ -27071,9 +27071,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
          return;
        }
 
-      if (GET_CODE (src) == UNSPEC)
+      if (GET_CODE (src) == UNSPEC
+         || GET_CODE (src) == UNSPEC_VOLATILE)
        {
-         gcc_assert (XINT (src, 1) == UNSPEC_MMA_ASSEMBLE);
+         gcc_assert (XINT (src, 1) == UNSPEC_VSX_ASSEMBLE
+                     || XINT (src, 1) == UNSPECV_MMA_ASSEMBLE);
          gcc_assert (REG_P (dst));
          if (GET_MODE (src) == XOmode)
            gcc_assert (FP_REGNO_P (REGNO (dst)));
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-10-pair.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-10-pair.c
new file mode 100644 (file)
index 0000000..d8748d8
--- /dev/null
@@ -0,0 +1,21 @@
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+typedef unsigned char  vec_t __attribute__((vector_size(16)));
+
+void
+foo (__vector_pair *dst, vec_t *src)
+{
+  __vector_pair pair0, pair1;
+  /* Adjacent loads should be combined into one lxvp instruction
+     and identical build pairs should be combined.  */
+  __builtin_vsx_build_pair (&pair0, src[0], src[1]);
+  __builtin_vsx_build_pair (&pair1, src[0], src[1]);
+  dst[0] = pair0;
+  dst[2] = pair1;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-10-quad.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-10-quad.c
new file mode 100644 (file)
index 0000000..02342c7
--- /dev/null
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+typedef unsigned char  vec_t __attribute__((vector_size(16)));
+
+void
+foo (__vector_quad *dst, vec_t *src)
+{
+  __vector_quad quad0, quad1;
+  /* Adjacent loads should be combined into two lxvp instructions
+     and identical build accs should not be combined.  */
+  __builtin_mma_build_acc (&quad0, src[0], src[1], src[2], src[3]);
+  __builtin_mma_build_acc (&quad1, src[0], src[1], src[2], src[3]);
+  dst[0] = quad0;
+  dst[2] = quad1;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mxxmtacc\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxmfacc\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 4 } } */