From cd91a084877dabcc53aec57ab70ca4fc32f3d985 Mon Sep 17 00:00:00 2001
From: Przemyslaw Wirkus <przemyslaw.wirkus@arm.com>
Date: Wed, 22 Jul 2020 10:14:39 +0100
Subject: [PATCH] Generation of adjusted ldp/stp for vector types

Introduce simple peephole2 optimization which substitutes a sequence of
four consecutive load or store (LDR, STR) instructions with two load or
store pair (LDP, STP) instructions for 2 element supported vector modes
(V2SI, V2SF, V2DI, and V2DF).
Generated load / store pair instruction offset is adjusted accordingly.

Bootstrapped and tested on aarch64-none-linux-gnu.

Example:
$ cat stp_vec_v2sf.c
typedef float __attribute__((vector_size(8))) vec;

void
store_adjusted(vec *out, vec x, vec y)
{
  out[400] = x;
  out[401] = y;
  out[402] = y;
  out[403] = x;
}

Example compiled with:
$ ./aarch64-none-linux-gnu-gcc -S -O2 stp_vec_v2sf.c -dp

Before the patch:

store_adjusted:
    str     d0, [x0, 3200]    // 9    [c=4 l=4]  *aarch64_simd_movv2si/2
    str     d1, [x0, 3208]    // 11   [c=4 l=4]  *aarch64_simd_movv2si/2
    str     d1, [x0, 3216]    // 13   [c=4 l=4]  *aarch64_simd_movv2si/2
    str     d0, [x0, 3224]    // 15   [c=4 l=4]  *aarch64_simd_movv2si/2
    ret       // 26       [c=0 l=4]  *do_return

After the patch:

store_adjusted:
    add     x1, x0, 3200    // 27   [c=4 l=4]  *adddi3_aarch64/0
    stp     d0, d1, [x1]    // 28   [c=0 l=4]  vec_store_pairv2siv2si
    stp     d1, d0, [x1, 16]        // 29   [c=0 l=4]  vec_store_pairv2siv2si
    ret             // 22   [c=0 l=4]  *do_return

gcc/ChangeLog:

	* config/aarch64/aarch64-ldpstp.md: Add two peepholes for adjusted vector
	V2SI, V2SF, V2DI, V2DF load pair and store pair modes.
	* config/aarch64/aarch64-protos.h (aarch64_gen_adjusted_ldpstp):
	Change mode parameter to machine_mode.
	(aarch64_operands_adjust_ok_for_ldpstp): Change mode parameter to
	machine_mode.
	* config/aarch64/aarch64.c (aarch64_operands_adjust_ok_for_ldpstp):
	Change mode parameter to machine_mode.
	(aarch64_gen_adjusted_ldpstp): Change mode parameter to machine_mode.
	* config/aarch64/iterators.md (VP_2E): New iterator for 2 element vectors.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/ldp_vec_v2sf.c: New test.
	* gcc.target/aarch64/ldp_vec_v2si.c: New test.
	* gcc.target/aarch64/stp_vec_v2df.c: New test.
	* gcc.target/aarch64/stp_vec_v2di.c: New test.
	* gcc.target/aarch64/stp_vec_v2sf.c: New test.
	* gcc.target/aarch64/stp_vec_v2si.c: New test.
---
 gcc/config/aarch64/aarch64-ldpstp.md            | 42 +++++++++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h             |  4 +--
 gcc/config/aarch64/aarch64.c                    |  8 ++---
 gcc/config/aarch64/iterators.md                 |  3 ++
 gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c | 14 +++++++++
 gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c | 14 +++++++++
 gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c | 18 +++++++++++
 gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c | 18 +++++++++++
 gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c | 18 +++++++++++
 gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c | 18 +++++++++++
 10 files changed, 151 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c

diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
index dd6f396..02d7a5b 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -294,3 +294,45 @@
   else
     FAIL;
 })
+
+(define_peephole2
+  [(match_scratch:DI 8 "r")
+   (set (match_operand:VP_2E 0 "memory_operand" "")
+        (match_operand:VP_2E 1 "aarch64_reg_or_zero" ""))
+   (set (match_operand:VP_2E 2 "memory_operand" "")
+        (match_operand:VP_2E 3 "aarch64_reg_or_zero" ""))
+   (set (match_operand:VP_2E 4 "memory_operand" "")
+        (match_operand:VP_2E 5 "aarch64_reg_or_zero" ""))
+   (set (match_operand:VP_2E 6 "memory_operand" "")
+        (match_operand:VP_2E 7 "aarch64_reg_or_zero" ""))
+   (match_dup 8)]
+  "TARGET_SIMD
+   && aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)"
+  [(const_int 0)]
+{
+  if (aarch64_gen_adjusted_ldpstp (operands, false, <MODE>mode, UNKNOWN))
+    DONE;
+  else
+    FAIL;
+})
+
+(define_peephole2
+  [(match_scratch:DI 8 "r")
+   (set (match_operand:VP_2E 0 "register_operand" "")
+        (match_operand:VP_2E 1 "memory_operand" ""))
+   (set (match_operand:VP_2E 2 "register_operand" "")
+        (match_operand:VP_2E 3 "memory_operand" ""))
+   (set (match_operand:VP_2E 4 "register_operand" "")
+        (match_operand:VP_2E 5 "memory_operand" ""))
+   (set (match_operand:VP_2E 6 "register_operand" "")
+        (match_operand:VP_2E 7 "memory_operand" ""))
+   (match_dup 8)]
+  "TARGET_SIMD
+   && aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)"
+  [(const_int 0)]
+{
+  if (aarch64_gen_adjusted_ldpstp (operands, true, <MODE>mode, UNKNOWN))
+    DONE;
+  else
+    FAIL;
+})
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 839f801..c7e828d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -682,7 +682,7 @@ void aarch64_split_compare_and_swap (rtx op[]);
 
 void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
 
-bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
+bool aarch64_gen_adjusted_ldpstp (rtx *, bool, machine_mode, RTX_CODE);
 
 void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
 bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
@@ -733,7 +733,7 @@ int aarch64_ccmp_mode_to_code (machine_mode mode);
 
 bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset);
 bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode);
-bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, scalar_mode);
+bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode);
 void aarch64_swap_ldrstr_operands (rtx *, bool);
 
 extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2a0fd71..3fe1fea 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -22158,7 +22158,7 @@ aarch64_ldrstr_offset_compare (const void *x, const void *y)
 
 bool
 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
-				       scalar_mode mode)
+				       machine_mode mode)
 {
   const int num_insns = 4;
   enum reg_class rclass;
@@ -22235,7 +22235,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
   for (int i = 0; i < num_insns; i++)
     offvals[i] = INTVAL (offset[i]);
 
-  msize = GET_MODE_SIZE (mode);
+  msize = GET_MODE_SIZE (mode).to_constant ();
 
   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
@@ -22275,7 +22275,7 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
 
 bool
 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
-			     scalar_mode mode, RTX_CODE code)
+			     machine_mode mode, RTX_CODE code)
 {
   rtx base, offset_1, offset_3, t1, t2;
   rtx mem_1, mem_2, mem_3, mem_4;
@@ -22314,7 +22314,7 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 	      && offset_3 != NULL_RTX);
 
   /* Adjust offset so it can fit in LDP/STP instruction.  */
-  msize = GET_MODE_SIZE (mode);
+  msize = GET_MODE_SIZE (mode).to_constant();
   stp_off_upper_limit = msize * (0x40 - 1);
   stp_off_lower_limit = - msize * 0x40;
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9a51916..054fd85 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -98,6 +98,9 @@
 ;; Copy of the above.
 (define_mode_iterator DREG2 [V8QI V4HI V4HF V2SI V2SF DF])
 
+;; All modes suitable to store/load pair (2 elements) using STP/LDP.
+(define_mode_iterator VP_2E [V2SI V2SF V2DI V2DF])
+
 ;; Advanced SIMD, 64-bit container, all integer modes.
 (define_mode_iterator VD_BHSI [V8QI V4HI V2SI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
new file mode 100644
index 0000000..fbdae1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2sf.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef float __attribute__((vector_size(8))) vec;
+
+vec
+load_long(vec *v) {
+  return v[110] + v[111] + v[112] + v[113];
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 880} } } */
+/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
+/* { dg-final { scan-assembler-not "ldr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
new file mode 100644
index 0000000..7714cd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_vec_v2si.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef int __attribute__((vector_size(8))) vec;
+
+vec
+load_long(vec *v) {
+  return v[110] + v[111] + v[112] + v[113];
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 880} } } */
+/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {ldp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
+/* { dg-final { scan-assembler-not "ldr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c
new file mode 100644
index 0000000..3ee039d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2df.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef double __attribute__((vector_size(16))) vec;
+
+void
+store_adjusted(vec *out, vec x, vec y)
+{
+  out[100] = x;
+  out[101] = y;
+  out[102] = y;
+  out[103] = x;
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 1600} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+, 32\]} } } */
+/* { dg-final { scan-assembler-not "str\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c
new file mode 100644
index 0000000..85d7b5b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2di.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long __attribute__((vector_size(16))) vec;
+
+void
+store_adjusted(vec *out, vec x, vec y)
+{
+  out[100] = x;
+  out[101] = y;
+  out[102] = y;
+  out[103] = x;
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 1600} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {stp\tq[0-9]+, q[0-9]+, \[x[0-9]+, 32\]} } } */
+/* { dg-final { scan-assembler-not "str\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c
new file mode 100644
index 0000000..e7c2e6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2sf.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef float __attribute__((vector_size(8))) vec;
+
+void
+store_adjusted(vec *out, vec x, vec y)
+{
+  out[400] = x;
+  out[401] = y;
+  out[402] = y;
+  out[403] = x;
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 3200} } } */
+/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
+/* { dg-final { scan-assembler-not "str\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c
new file mode 100644
index 0000000..17b44a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stp_vec_v2si.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef int __attribute__((vector_size(8))) vec;
+
+void
+store_adjusted(vec *out, vec x, vec y)
+{
+  out[400] = x;
+  out[401] = y;
+  out[402] = y;
+  out[403] = x;
+}
+
+/* { dg-final { scan-assembler {add\tx[0-9]+, x[0-9]+, 3200} } } */
+/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+\]} } } */
+/* { dg-final { scan-assembler {stp\td[0-9]+, d[0-9]+, \[x[0-9]+, 16\]} } } */
+/* { dg-final { scan-assembler-not "str\t" } } */
-- 
2.7.4