[AArch64] Add gather loads for partial SVE modes
author	Richard Sandiford <richard.sandiford@arm.com>
Sat, 16 Nov 2019 11:20:30 +0000 (11:20 +0000)
committer	Richard Sandiford <rsandifo@gcc.gnu.org>
Sat, 16 Nov 2019 11:20:30 +0000 (11:20 +0000)
This patch adds support for gather loads of partial vectors,
where the vector base or offset elements can be wider than the
elements being loaded.

2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
* config/aarch64/iterators.md (SVE_24, SVE_2, SVE_4): New mode
iterators.
* config/aarch64/aarch64-sve.md
(gather_load<SVE_FULL_SD:mode><v_int_equiv>): Extend to...
(gather_load<SVE_24:mode><v_int_container>): ...this.
(mask_gather_load<SVE_FULL_S:mode><v_int_equiv>): Extend to...
(mask_gather_load<SVE_4:mode><v_int_container>): ...this.
(mask_gather_load<SVE_FULL_D:mode><v_int_equiv>): Extend to...
(mask_gather_load<SVE_2:mode><v_int_container>): ...this.
(*mask_gather_load<SVE_2:mode><v_int_container>_<su>xtw_unpacked):
New pattern.
(*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_sxtw): Extend to...
(*mask_gather_load<SVE_2:mode><v_int_container>_sxtw): ...this.
Allow the nominal extension predicate to be different from the
load predicate.
(*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_uxtw): Extend to...
(*mask_gather_load<SVE_2:mode><v_int_container>_uxtw): ...this.

gcc/testsuite/
* gcc.target/aarch64/sve/gather_load_1.c (TEST_LOOP): Start at 0.
(TEST_ALL): Add tests for 8-bit and 16-bit elements.
* gcc.target/aarch64/sve/gather_load_2.c: Update accordingly.
* gcc.target/aarch64/sve/gather_load_3.c (TEST_LOOP): Start at 0.
(TEST_ALL): Add tests for 8-bit and 16-bit elements.
* gcc.target/aarch64/sve/gather_load_4.c: Update accordingly.
* gcc.target/aarch64/sve/gather_load_5.c (TEST_LOOP): Start at 0.
(TEST_ALL): Add tests for 8-bit, 16-bit and 32-bit elements.
* gcc.target/aarch64/sve/gather_load_6.c: Add
--param aarch64-sve-compare-costs=0.
(TEST_LOOP): Start at 0.
* gcc.target/aarch64/sve/gather_load_7.c: Add
--param aarch64-sve-compare-costs=0.
* gcc.target/aarch64/sve/gather_load_8.c: New test.
* gcc.target/aarch64/sve/gather_load_9.c: Likewise.
* gcc.target/aarch64/sve/mask_gather_load_6.c: Add
--param aarch64-sve-compare-costs=0.

From-SVN: r278345

14 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/gather_load_1.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_2.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_3.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_4.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_5.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_6.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_7.c
gcc/testsuite/gcc.target/aarch64/sve/gather_load_8.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/gather_load_9.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_6.c

index 7fe9a11..1253306 100644 (file)
@@ -1,5 +1,25 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * config/aarch64/iterators.md (SVE_24, SVE_2, SVE_4): New mode
+       iterators.
+       * config/aarch64/aarch64-sve.md
+       (gather_load<SVE_FULL_SD:mode><v_int_equiv>): Extend to...
+       (gather_load<SVE_24:mode><v_int_container>): ...this.
+       (mask_gather_load<SVE_FULL_S:mode><v_int_equiv>): Extend to...
+       (mask_gather_load<SVE_4:mode><v_int_container>): ...this.
+       (mask_gather_load<SVE_FULL_D:mode><v_int_equiv>): Extend to...
+       (mask_gather_load<SVE_2:mode><v_int_container>): ...this.
+       (*mask_gather_load<SVE_2:mode><v_int_container>_<su>xtw_unpacked):
+       New pattern.
+       (*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_sxtw): Extend to...
+       (*mask_gather_load<SVE_2:mode><v_int_container>_sxtw): ...this.
+       Allow the nominal extension predicate to be different from the
+       load predicate.
+       (*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_uxtw): Extend to...
+       (*mask_gather_load<SVE_2:mode><v_int_container>_uxtw): ...this.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * config/aarch64/aarch64-sve.md
        (trunc<SVE_HSDI:mode><SVE_PARTIAL_I:mode>2): New pattern.
        * config/aarch64/aarch64.c (aarch64_integer_truncation_p): New
index 158a178..e26ac45 100644 (file)
 ;; -------------------------------------------------------------------------
 
 ;; Unpredicated gather loads.
-(define_expand "gather_load<mode><v_int_equiv>"
-  [(set (match_operand:SVE_FULL_SD 0 "register_operand")
-       (unspec:SVE_FULL_SD
+(define_expand "gather_load<mode><v_int_container>"
+  [(set (match_operand:SVE_24 0 "register_operand")
+       (unspec:SVE_24
          [(match_dup 5)
           (match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>")
-          (match_operand:<V_INT_EQUIV> 2 "register_operand")
+          (match_operand:<V_INT_CONTAINER> 2 "register_operand")
           (match_operand:DI 3 "const_int_operand")
           (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
           (mem:BLK (scratch))]
 
 ;; Predicated gather loads for 32-bit elements.  Operand 3 is true for
 ;; unsigned extension and false for signed extension.
-(define_insn "mask_gather_load<mode><v_int_equiv>"
-  [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w")
-       (unspec:SVE_FULL_S
+(define_insn "mask_gather_load<mode><v_int_container>"
+  [(set (match_operand:SVE_4 0 "register_operand" "=w, w, w, w, w, w")
+       (unspec:SVE_4
          [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
-          (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk")
+          (match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>" "Z, vgw, rk, rk, rk, rk")
           (match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w")
           (match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1")
-          (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i")
+          (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE"
   "@
-   ld1w\t%0.s, %5/z, [%2.s]
-   ld1w\t%0.s, %5/z, [%2.s, #%1]
-   ld1w\t%0.s, %5/z, [%1, %2.s, sxtw]
-   ld1w\t%0.s, %5/z, [%1, %2.s, uxtw]
-   ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
-   ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
+   ld1<Vesize>\t%0.s, %5/z, [%2.s]
+   ld1<Vesize>\t%0.s, %5/z, [%2.s, #%1]
+   ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw]
+   ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw]
+   ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
+   ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
 )
 
 ;; Predicated gather loads for 64-bit elements.  The value of operand 3
 ;; doesn't matter in this case.
-(define_insn "mask_gather_load<mode><v_int_equiv>"
-  [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w")
-       (unspec:SVE_FULL_D
+(define_insn "mask_gather_load<mode><v_int_container>"
+  [(set (match_operand:SVE_2 0 "register_operand" "=w, w, w, w")
+       (unspec:SVE_2
          [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl")
-          (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk")
+          (match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>" "Z, vgd, rk, rk")
           (match_operand:VNx2DI 2 "register_operand" "w, w, w, w")
           (match_operand:DI 3 "const_int_operand")
-          (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i")
+          (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, i")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE"
   "@
-   ld1d\t%0.d, %5/z, [%2.d]
-   ld1d\t%0.d, %5/z, [%2.d, #%1]
-   ld1d\t%0.d, %5/z, [%1, %2.d]
-   ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
+   ld1<Vesize>\t%0.d, %5/z, [%2.d]
+   ld1<Vesize>\t%0.d, %5/z, [%2.d, #%1]
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d]
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
 )
 
-;; Likewise, but with the offset being sign-extended from 32 bits.
-(define_insn "*mask_gather_load<mode><v_int_equiv>_sxtw"
-  [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
-       (unspec:SVE_FULL_D
+;; Likewise, but with the offset being extended from 32 bits.
+(define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_<su>xtw_unpacked"
+  [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+       (unspec:SVE_2
+         [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
+          (match_operand:DI 1 "register_operand" "rk, rk")
+          (unspec:VNx2DI
+            [(match_operand 6)
+             (ANY_EXTEND:VNx2DI
+               (match_operand:VNx2SI 2 "register_operand" "w, w"))]
+            UNSPEC_PRED_X)
+          (match_operand:DI 3 "const_int_operand")
+          (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
+          (mem:BLK (scratch))]
+         UNSPEC_LD1_GATHER))]
+  "TARGET_SVE"
+  "@
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, <su>xtw]
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, <su>xtw %p4]"
+  "&& !CONSTANT_P (operands[6])"
+  {
+    operands[6] = CONSTM1_RTX (VNx2BImode);
+  }
+)
+
+;; Likewise, but with the offset being truncated to 32 bits and then
+;; sign-extended.
+(define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_sxtw"
+  [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+       (unspec:SVE_2
          [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
           (match_operand:DI 1 "register_operand" "rk, rk")
           (unspec:VNx2DI
-            [(match_dup 5)
+            [(match_operand 6)
              (sign_extend:VNx2DI
                (truncate:VNx2SI
                  (match_operand:VNx2DI 2 "register_operand" "w, w")))]
             UNSPEC_PRED_X)
           (match_operand:DI 3 "const_int_operand")
-          (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
+          (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE"
   "@
-   ld1d\t%0.d, %5/z, [%1, %2.d, sxtw]
-   ld1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, sxtw]
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
+  "&& !CONSTANT_P (operands[6])"
+  {
+    operands[6] = CONSTM1_RTX (VNx2BImode);
+  }
 )
 
-;; Likewise, but with the offset being zero-extended from 32 bits.
-(define_insn "*mask_gather_load<mode><v_int_equiv>_uxtw"
-  [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
-       (unspec:SVE_FULL_D
+;; Likewise, but with the offset being truncated to 32 bits and then
+;; zero-extended.
+(define_insn "*mask_gather_load<mode><v_int_container>_uxtw"
+  [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+       (unspec:SVE_2
          [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
           (match_operand:DI 1 "register_operand" "rk, rk")
           (and:VNx2DI
             (match_operand:VNx2DI 2 "register_operand" "w, w")
             (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
           (match_operand:DI 3 "const_int_operand")
-          (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
+          (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
           (mem:BLK (scratch))]
          UNSPEC_LD1_GATHER))]
   "TARGET_SVE"
   "@
-   ld1d\t%0.d, %5/z, [%1, %2.d, uxtw]
-   ld1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, uxtw]
+   ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
 )
 
 ;; -------------------------------------------------------------------------
index 06e91eb..c5b0fa7 100644 (file)
                                VNx4SI VNx2SI
                                VNx2DI])
 
+;; SVE modes with 2 or 4 elements.
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
+                             VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+
+;; SVE modes with 2 elements.
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+
+;; SVE modes with 4 elements.
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+
 ;; Modes involved in extending or truncating SVE data, for 8 elements per
 ;; 128-bit block.
 (define_mode_iterator VNx8_NARROW [VNx8QI])
                              (VNx4SI "vnx4sf") (VNx4SF "vnx4sf")
                              (VNx2DI "vnx2df") (VNx2DF "vnx2df")])
 
+;; Maps full and partial vector modes of any element type to a full-vector
+;; integer mode with the same number of units.
+(define_mode_attr V_INT_CONTAINER [(VNx16QI "VNx16QI") (VNx8QI "VNx8HI")
+                                  (VNx4QI "VNx4SI") (VNx2QI "VNx2DI")
+                                  (VNx8HI "VNx8HI") (VNx4HI "VNx4SI")
+                                  (VNx2HI "VNx2DI")
+                                  (VNx4SI "VNx4SI") (VNx2SI "VNx2DI")
+                                  (VNx2DI "VNx2DI")
+                                  (VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
+                                  (VNx2HF "VNx2DI")
+                                  (VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
+                                  (VNx2DF "VNx2DI")])
+
+;; Lower-case version of V_INT_CONTAINER.
+(define_mode_attr v_int_container [(VNx16QI "vnx16qi") (VNx8QI "vnx8hi")
+                                  (VNx4QI "vnx4si") (VNx2QI "vnx2di")
+                                  (VNx8HI "vnx8hi") (VNx4HI "vnx4si")
+                                  (VNx2HI "vnx2di")
+                                  (VNx4SI "vnx4si") (VNx2SI "vnx2di")
+                                  (VNx2DI "vnx2di")
+                                  (VNx8HF "vnx8hi") (VNx4HF "vnx4si")
+                                  (VNx2HF "vnx2di")
+                                  (VNx4SF "vnx4si") (VNx2SF "vnx2di")
+                                  (VNx2DF "vnx2di")])
+
 ;; Mode for vector conditional operations where the comparison has
 ;; different type from the lhs.
 (define_mode_attr V_cmp_mixed [(V2SI "V2SF") (V4SI "V4SF")
index 28f99e0..e15be63 100644 (file)
@@ -1,5 +1,25 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * gcc.target/aarch64/sve/gather_load_1.c (TEST_LOOP): Start at 0.
+       (TEST_ALL): Add tests for 8-bit and 16-bit elements.
+       * gcc.target/aarch64/sve/gather_load_2.c: Update accordingly.
+       * gcc.target/aarch64/sve/gather_load_3.c (TEST_LOOP): Start at 0.
+       (TEST_ALL): Add tests for 8-bit and 16-bit elements.
+       * gcc.target/aarch64/sve/gather_load_4.c: Update accordingly.
+       * gcc.target/aarch64/sve/gather_load_5.c (TEST_LOOP): Start at 0.
+       (TEST_ALL): Add tests for 8-bit, 16-bit and 32-bit elements.
+       * gcc.target/aarch64/sve/gather_load_6.c: Add
+       --param aarch64-sve-compare-costs=0.
+       (TEST_LOOP): Start at 0.
+       * gcc.target/aarch64/sve/gather_load_7.c: Add
+       --param aarch64-sve-compare-costs=0.
+       * gcc.target/aarch64/sve/gather_load_8.c: New test.
+       * gcc.target/aarch64/sve/gather_load_9.c: Likewise.
+       * gcc.target/aarch64/sve/mask_gather_load_6.c: Add
+       --param aarch64-sve-compare-costs=0.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * gcc.target/aarch64/sve/mask_struct_load_1.c: Add
        --param aarch64-sve-compare-costs=0.
        * gcc.target/aarch64/sve/mask_struct_load_2.c: Likewise.
index 33f1629..941ca65 100644 (file)
@@ -8,17 +8,20 @@
 #define INDEX64 int64_t
 #endif
 
-/* Invoked 18 times for each data size.  */
 #define TEST_LOOP(DATA_TYPE, BITS)                                     \
   void __attribute__ ((noinline, noclone))                             \
   f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,    \
                 INDEX##BITS *indices, int n)                           \
   {                                                                    \
-    for (int i = 9; i < n; ++i)                                                \
+    for (int i = 0; i < n; ++i)                                                \
       dest[i] += src[indices[i]];                                      \
   }
 
 #define TEST_ALL(T)                            \
+  T (int8_t, 32)                               \
+  T (uint8_t, 32)                              \
+  T (int16_t, 32)                              \
+  T (uint16_t, 32)                             \
   T (int32_t, 32)                              \
   T (uint32_t, 32)                             \
   T (float, 32)                                        \
 
 TEST_ALL (TEST_LOOP)
 
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index e3fb2a9..4a73d1f 100644 (file)
@@ -6,5 +6,12 @@
 
 #include "gather_load_1.c"
 
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index 54af507..bd4b208 100644 (file)
@@ -8,17 +8,20 @@
 #define INDEX64 int64_t
 #endif
 
-/* Invoked 18 times for each data size.  */
 #define TEST_LOOP(DATA_TYPE, BITS)                                     \
   void __attribute__ ((noinline, noclone))                             \
   f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,    \
                 INDEX##BITS *indices, int n)                           \
   {                                                                    \
-    for (int i = 9; i < n; ++i)                                                \
+    for (int i = 0; i < n; ++i)                                                \
       dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]);           \
   }
 
 #define TEST_ALL(T)                            \
+  T (int8_t, 32)                               \
+  T (uint8_t, 32)                              \
+  T (int16_t, 32)                              \
+  T (uint16_t, 32)                             \
   T (int32_t, 32)                              \
   T (uint32_t, 32)                             \
   T (float, 32)                                        \
 
 TEST_ALL (TEST_LOOP)
 
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index 3e2c831..2cfded6 100644 (file)
@@ -6,5 +6,12 @@
 
 #include "gather_load_3.c"
 
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index b22a80a..3737e04 100644 (file)
@@ -3,21 +3,34 @@
 
 #include <stdint.h>
 
-/* Invoked 18 times for each data size.  */
 #define TEST_LOOP(DATA_TYPE)                                           \
   void __attribute__ ((noinline, noclone))                             \
   f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict *src,   \
                 int n)                                                 \
   {                                                                    \
-    for (int i = 9; i < n; ++i)                                                \
+    for (int i = 0; i < n; ++i)                                                \
       dest[i] += *src[i];                                              \
   }
 
 #define TEST_ALL(T)                            \
+  T (int8_t)                                   \
+  T (uint8_t)                                  \
+  T (int16_t)                                  \
+  T (uint16_t)                                 \
+  T (int32_t)                                  \
+  T (uint32_t)                                 \
   T (int64_t)                                  \
   T (uint64_t)                                 \
   T (double)
 
 TEST_ALL (TEST_LOOP)
 
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index 8445be4..6fdd16b 100644 (file)
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
@@ -8,13 +8,12 @@
 #define INDEX32 int32_t
 #endif
 
-/* Invoked 18 times for each data size.  */
 #define TEST_LOOP(DATA_TYPE, BITS)                                     \
   void __attribute__ ((noinline, noclone))                             \
   f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,    \
                 INDEX##BITS *indices, INDEX##BITS mask, int n)         \
   {                                                                    \
-    for (int i = 9; i < n; ++i)                                                \
+    for (int i = 0; i < n; ++i)                                                \
       dest[i] = src[(INDEX##BITS) (indices[i] | mask)];                        \
   }
 
index f5ae930..5a3f3e7 100644 (file)
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps --param aarch64-sve-compare-costs=0" } */
 
 #define INDEX16 uint16_t
 #define INDEX32 uint32_t
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_8.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_8.c
new file mode 100644 (file)
index 0000000..0ea6f72
--- /dev/null
@@ -0,0 +1,46 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS)                                     \
+  void __attribute__ ((noinline, noclone))                             \
+  f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src,    \
+                INDEX##BITS *indices, INDEX##BITS mask, int n)         \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      dest[i] = src[(INDEX##BITS) (indices[i] + mask)];                        \
+  }
+
+#define TEST_ALL(T)                            \
+  T (int8_t, 16)                               \
+  T (uint8_t, 16)                              \
+  T (int16_t, 16)                              \
+  T (uint16_t, 16)                             \
+  T (_Float16, 16)                             \
+  T (int32_t, 16)                              \
+  T (uint32_t, 16)                             \
+  T (float, 16)                                        \
+  T (int64_t, 32)                              \
+  T (uint64_t, 32)                             \
+  T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, sxtw 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tsxt.\tz} 8 } } */
+/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.s,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/gather_load_9.c b/gcc/testsuite/gcc.target/aarch64/sve/gather_load_9.c
new file mode 100644 (file)
index 0000000..04b71f1
--- /dev/null
@@ -0,0 +1,20 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "gather_load_8.c"
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, uxtw 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tuxt.\tz} 8 } } */
+/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.s,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
index ff01431..a13516a 100644 (file)
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>