From 40757a25d45d47ddc50819bfd32dd6aac595abc2 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Wed, 8 Nov 2017 18:27:57 +0000
Subject: [PATCH] vec_merge + vec_duplicate + vec_concat simplification

Another vec_merge simplification that's missing is transforming:
(vec_merge (vec_duplicate x) (vec_concat (y) (z)) (const_int N))
into
(vec_concat x z) if N == 1 (0b01) or
(vec_concat y x) if N == 2 (0b10)

For the testcase in this patch, on aarch64 this allows combine to try matching the pattern:
(set (reg:V2DI 78 [ x ])
    (vec_concat:V2DI
        (mem:DI (reg/v/f:DI 76 [ y ]) [1 *y_4(D)+0 S8 A64])
        (mem:DI (plus:DI (reg/v/f:DI 76 [ y ])
                (const_int 8 [0x8])) [1 MEM[(long long int *)y_4(D) + 8B]+0 S8 A64])))

rather than the more complex:
(set (reg:V2DI 78 [ x ])
    (vec_merge:V2DI (vec_duplicate:V2DI (mem:DI (plus:DI (reg/v/f:DI 76 [ y ])
                    (const_int 8 [0x8])) [1 MEM[(long long int *)y_4(D) + 8B]+0 S8 A64]))
        (vec_duplicate:V2DI (mem:DI (reg/v/f:DI 76 [ y ]) [1 *y_4(D)+0 S8 A64]))
        (const_int 2 [0x2])))

We don't actually have an aarch64 pattern for the simplified form above, but it is simple enough to add.
This patch therefore adds a pattern that performs a concatenated load of two 64-bit vectors in adjacent
memory locations as a single Q-register LDR. The new aarch64 pattern is needed to demonstrate the
effectiveness of the simplify-rtx change, so I've kept them together as one patch.

Now for the testcase in the patch we can generate:
construct_lanedi:
        ldr     q0, [x0]
        ret

construct_lanedf:
        ldr     q0, [x0]
        ret

instead of:
construct_lanedi:
        ld1r    {v0.2d}, [x0]
        ldr     x0, [x0, 8]
        ins     v0.d[1], x0
        ret

construct_lanedf:
        ld1r    {v0.2d}, [x0]
        ldr     d1, [x0, 8]
        ins     v0.d[1], v1.d[0]
        ret

The new memory constraint Utq is needed because only the Q-register addressing modes may be allowed here,
but the MEM expressions in the RTL pattern have 64-bit vector modes. If the operands are left unconstrained,
the D-register addressing modes will be accepted during register allocation/address mode selection, which
will produce invalid assembly.

Bootstrapped and tested on aarch64-none-linux-gnu.

	* simplify-rtx.c (simplify_ternary_operation, VEC_MERGE):
	Simplify vec_merge of vec_duplicate and vec_concat.
	* config/aarch64/constraints.md (Utq): New constraint.
	* config/aarch64/aarch64-simd.md (load_pair_lanes<mode>): New
	define_insn.

	* gcc.target/aarch64/load_v2vec_lanes_1.c: New test.

From-SVN: r254549
---
 gcc/ChangeLog                                      |  8 +++++++
 gcc/config/aarch64/aarch64-simd.md                 | 14 ++++++++++++
 gcc/config/aarch64/constraints.md                  |  7 ++++++
 gcc/simplify-rtx.c                                 | 19 ++++++++++++++++
 gcc/testsuite/ChangeLog                            |  4 ++++
 .../gcc.target/aarch64/load_v2vec_lanes_1.c        | 26 ++++++++++++++++++++++
 6 files changed, 78 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/load_v2vec_lanes_1.c
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1225c3..9b50bca 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,6 +1,14 @@
 2017-11-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
 	* simplify-rtx.c (simplify_ternary_operation, VEC_MERGE):
+	Simplify vec_merge of vec_duplicate and vec_concat.
+	* config/aarch64/constraints.md (Utq): New constraint.
+	* config/aarch64/aarch64-simd.md (load_pair_lanes<mode>): New
+	define_insn.
+
+2017-11-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	* simplify-rtx.c (simplify_ternary_operation, VEC_MERGE):
 	Simplify vec_merge of vec_duplicate and const_vector.
 	* config/aarch64/predicates.md (aarch64_simd_or_scalar_imm_zero):
 	New predicate.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 34233f6..1f5c911 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2935,6 +2935,20 @@
   [(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")]
 )
 
+(define_insn "load_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+	(vec_concat:<VDBL>
+	   (match_operand:VDC 1 "memory_operand" "Utq")
+	   (match_operand:VDC 2 "memory_operand" "m")))]
+  "TARGET_SIMD && !STRICT_ALIGNMENT
+   && rtx_equal_p (XEXP (operands[2], 0),
+		   plus_constant (Pmode,
+				  XEXP (operands[1], 0),
+				  GET_MODE_SIZE (<MODE>mode)))"
+  "ldr\\t%q0, %1"
+  [(set_attr "type" "neon_load1_1reg_q")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 77ca85d..4ef7a50 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -178,6 +178,13 @@
   (and (match_code "mem")
        (match_test "aarch64_simd_mem_operand_p (op)")))
 
+(define_memory_constraint "Utq"
+  "@internal
+   An address valid for loading or storing a 128-bit AdvSIMD register"
+  (and (match_code "mem")
+       (match_test "aarch64_legitimate_address_p (V2DImode, XEXP (op, 0),
+						  MEM, 1)")))
+
 (define_constraint "Ufc"
   "A floating point constant which can be used with an\
    FMOV immediate operation."
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 94302f6..92c783a 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -5765,6 +5765,25 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
 		std::swap (newop0, newop1);
 	      return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
 	    }
+	  /* Replace (vec_merge (vec_duplicate x) (vec_concat (y) (z)) (const_int N))
+	     with (vec_concat x z) if N == 1, or (vec_concat y x) if N == 2.
+	     Only applies for vectors of two elements.  */
+	  if (GET_CODE (op0) == VEC_DUPLICATE
+	      && GET_CODE (op1) == VEC_CONCAT
+	      && GET_MODE_NUNITS (GET_MODE (op0)) == 2
+	      && GET_MODE_NUNITS (GET_MODE (op1)) == 2
+	      && IN_RANGE (sel, 1, 2))
+	    {
+	      rtx newop0 = XEXP (op0, 0);
+	      rtx newop1 = XEXP (op1, 2 - sel);
+	      rtx otherop = XEXP (op1, sel - 1);
+	      if (sel == 2)
+		std::swap (newop0, newop1);
+	      /* Don't want to throw away the other part of the vec_concat if
+		 it has side-effects.  */
+	      if (!side_effects_p (otherop))
+		return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
+	    }
 	}
 
       if (rtx_equal_p (op0, op1)
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 28894ee..3d9c337 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,9 @@
 2017-11-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
 
+	* gcc.target/aarch64/load_v2vec_lanes_1.c: New test.
+
+2017-11-08  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
 	* gcc.target/aarch64/construct_lane_zero_1.c: New test.
 
 2017-11-08  Ed Schonberg  <schonberg@adacore.com>
diff --git a/gcc/testsuite/gcc.target/aarch64/load_v2vec_lanes_1.c b/gcc/testsuite/gcc.target/aarch64/load_v2vec_lanes_1.c
new file mode 100644
index 0000000..3c31b34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/load_v2vec_lanes_1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+v2di
+construct_lanedi (long long *y)
+{
+  v2di x = { y[0], y[1] };
+  return x;
+}
+
+v2df
+construct_lanedf (double *y)
+{
+  v2df x = { y[0], y[1] };
+  return x;
+}
+
+/* We can use the load_pair_lanes<mode> pattern to vec_concat two DI/DF
+   values from consecutive memory into a 2-element vector by using
+   a Q-reg LDR.  */
+
+/* { dg-final { scan-assembler-times "ldr\tq\[0-9\]+" 2 } } */
+/* { dg-final { scan-assembler-not "ins\t" } } */
-- 
2.7.4