aarch64: Use dup and zip1 for interleaving elements in vector initializer.
author Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Tue, 6 Dec 2022 01:21:14 +0000 (06:51 +0530)
committer Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Tue, 6 Dec 2022 01:21:14 +0000 (06:51 +0530)
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_expand_vector_init): Use dup
and zip1 for interleaving elements in vector initializer.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/interleave-init-1.c: New test.

gcc/config/aarch64/aarch64.cc
gcc/testsuite/gcc.target/aarch64/interleave-init-1.c [new file with mode: 0644]

index e97f3b3..dedda68 100644 (file)
@@ -22058,6 +22058,38 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  /* Check for the interleaving case.
+     E.g. if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
+     generate the following code:
+     dup v0.8h, x
+     dup v1.8h, y
+     zip1 v0.8h, v0.8h, v1.8h
+     for a "large enough" initializer.  */
+
+  if (n_elts >= 8)
+    {
+      int i;
+      for (i = 2; i < n_elts; i++)
+       if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
+         break;
+
+      if (i == n_elts)
+       {
+         machine_mode mode = GET_MODE (target);
+         rtx dest[2];
+
+         for (int i = 0; i < 2; i++)
+           {
+             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
+             dest[i] = force_reg (mode, x);
+           }
+
+         rtvec v = gen_rtvec (2, dest[0], dest[1]);
+         emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+         return;
+       }
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
new file mode 100644 (file)
index 0000000..ee77504
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**     ...
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**     ...
+**     ret
+*/
+
+int16x8_t foo(int16_t x, int y) /* NOTE(review): y is int while x is int16_t; confirm this is intended.  */
+{
+  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; /* Even-indexed lanes hold x, odd-indexed lanes hold y.  */
+  return v;
+}
+
+/*
+** foo2:
+**     ...
+**     dup     v[0-9]+\.8h, w[0-9]+
+**     movi    v[0-9]+\.8h, 0x1
+**     zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**     ...
+**     ret
+*/
+
+int16x8_t foo2(int16_t x) /* Interleaves the run-time value x with the constant 1.  */
+{
+  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; /* Even-indexed lanes hold x, odd-indexed lanes hold 1.  */
+  return v;
+}