[AArch64] Add autovec support for partial SVE vectors
Author:    Richard Sandiford <richard.sandiford@arm.com>
Date:      Sat, 16 Nov 2019 11:02:09 +0000
Committer: Richard Sandiford <rsandifo@gcc.gnu.org>
Date:      Sat, 16 Nov 2019 11:02:09 +0000
This patch adds the bare minimum needed to support autovectorisation of
partial SVE vectors, namely moves and integer addition.  Later patches
add more interesting cases.
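
For example, the loop below is a single instantiation (uint32_t with
uint16_t) of the TEST_LOOP macro from the new mixed_size_1.c test.  With
this patch the vectoriser can keep both statements in one loop, using
full .s vectors for the uint32_t accesses and unpacked vectors for the
uint16_t accesses, whose 16-bit elements occupy 32-bit containers and
are loaded and stored with ld1h and st1h on .s registers.  This is a
sketch of the intent checked by the test's scan-assembler patterns,
not a guaranteed code-generation outcome; the exact output depends on
the target options and vector costs:

    /* Compiled with the test's options, e.g.
       -O2 -ftree-vectorize -fno-tree-loop-distribute-patterns,
       for an SVE target.  */
    #include <stdint.h>

    void
    f_uint32_t_uint16_t (uint32_t *restrict dst1, uint32_t *restrict src1,
                         uint16_t *restrict dst2, uint16_t *restrict src2,
                         int n)
    {
      for (int i = 0; i < n; ++i)
        {
          dst1[i] += src1[i];  /* full vectors: ld1w/st1w on .s registers  */
          dst2[i] = src2[i];   /* unpacked vectors: ld1h/st1h on .s registers  */
        }
    }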

2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
* config/aarch64/aarch64-modes.def: Define partial SVE vector
float modes.
* config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
function.
* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
new vector float modes.
(aarch64_sve_container_bits): New function.
(aarch64_sve_pred_mode): Likewise.
(aarch64_get_mask_mode): Use it.
(aarch64_sve_element_int_mode): Handle structure modes and partial
modes.
(aarch64_sve_container_int_mode): New function.
(aarch64_vectorize_related_mode): Return SVE modes when given
SVE modes.  Handle partial modes, taking the preferred number
of units from the size of the given mode.
(aarch64_hard_regno_mode_ok): Allow partial modes to be stored
in registers.
(aarch64_expand_sve_ld1rq): Use the mode form of aarch64_sve_pred_mode.
(aarch64_expand_sve_const_vector): Handle partial SVE vectors.
(aarch64_split_sve_subreg_move): Use the mode form of
aarch64_sve_pred_mode.
(aarch64_secondary_reload): Handle partial modes in the same way
as full big-endian vectors.
(aarch64_vector_mode_supported_p): Allow partial SVE vectors.
(aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
merging with the Advanced SIMD modes.  If two modes have the
same size, try the Advanced SIMD mode first.
(aarch64_simd_valid_immediate): Use the container rather than
the element mode for INDEX constants.
(aarch64_simd_vector_alignment): Make the alignment of partial
SVE vector modes the same as their minimum size.
(aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
* config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
(mov<SVE_ALL:mode>): ...this.
(movmisalign<SVE_FULL:mode>): Extend to...
(movmisalign<SVE_ALL:mode>): ...this.
(*aarch64_sve_mov<mode>_le): Rename to...
(*aarch64_sve_mov<mode>_ldr_str): ...this.
(*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
(*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
partial modes regardless of endianness.
(aarch64_sve_reload_be): Rename to...
(aarch64_sve_reload_mem): ...this and enable for little-endian.
Use aarch64_sve_pred_mode to get the appropriate predicate mode.
(@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
(@aarch64_pred_mov<SVE_ALL:mode>): ...this.
(*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
(*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
(@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
(@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
(*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
(*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
(maskload<SVE_FULL:mode><vpred>): Extend to...
(maskload<SVE_ALL:mode><vpred>): ...this.
(maskstore<SVE_FULL:mode><vpred>): Extend to...
(maskstore<SVE_ALL:mode><vpred>): ...this.
(vec_duplicate<SVE_FULL:mode>): Extend to...
(vec_duplicate<SVE_ALL:mode>): ...this.
(*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
(*vec_duplicate<SVE_ALL:mode>_reg): ...this.
(sve_ld1r<SVE_FULL:mode>): Extend to...
(sve_ld1r<SVE_ALL:mode>): ...this.
(vec_series<SVE_FULL_I:mode>): Extend to...
(vec_series<SVE_I:mode>): ...this.
(*vec_series<SVE_FULL_I:mode>_plus): Extend to...
(*vec_series<SVE_I:mode>_plus): ...this.
(@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
new VPRED ambiguity.
(@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
(add<SVE_FULL_I:mode>3): Extend to...
(add<SVE_I:mode>3): ...this.
* config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators.
(Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes.
(VPRED, vpred): Likewise.
(Vctype): New iterator.
(vw): Remove SVE modes.

gcc/testsuite/
* gcc.target/aarch64/sve/mixed_size_1.c: New test.
* gcc.target/aarch64/sve/mixed_size_2.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_3.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_4.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_5.c: Likewise.

From-SVN: r278341

12 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-modes.def
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c [new file with mode: 0644]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b7e46cf..afb995f 100644
@@ -1,5 +1,84 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * config/aarch64/aarch64-modes.def: Define partial SVE vector
+       float modes.
+       * config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
+       function.
+       * config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
+       new vector float modes.
+       (aarch64_sve_container_bits): New function.
+       (aarch64_sve_pred_mode): Likewise.
+       (aarch64_get_mask_mode): Use it.
+       (aarch64_sve_element_int_mode): Handle structure modes and partial
+       modes.
+       (aarch64_sve_container_int_mode): New function.
+       (aarch64_vectorize_related_mode): Return SVE modes when given
+       SVE modes.  Handle partial modes, taking the preferred number
+       of units from the size of the given mode.
+       (aarch64_hard_regno_mode_ok): Allow partial modes to be stored
+       in registers.
+       (aarch64_expand_sve_ld1rq): Use the mode form of aarch64_sve_pred_mode.
+       (aarch64_expand_sve_const_vector): Handle partial SVE vectors.
+       (aarch64_split_sve_subreg_move): Use the mode form of
+       aarch64_sve_pred_mode.
+       (aarch64_secondary_reload): Handle partial modes in the same way
+       as full big-endian vectors.
+       (aarch64_vector_mode_supported_p): Allow partial SVE vectors.
+       (aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
+       merging with the Advanced SIMD modes.  If two modes have the
+       same size, try the Advanced SIMD mode first.
+       (aarch64_simd_valid_immediate): Use the container rather than
+       the element mode for INDEX constants.
+       (aarch64_simd_vector_alignment): Make the alignment of partial
+       SVE vector modes the same as their minimum size.
+       (aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
+       * config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
+       (mov<SVE_ALL:mode>): ...this.
+       (movmisalign<SVE_FULL:mode>): Extend to...
+       (movmisalign<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_mov<mode>_le): Rename to...
+       (*aarch64_sve_mov<mode>_ldr_str): ...this.
+       (*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
+       (*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
+       partial modes regardless of endianness.
+       (aarch64_sve_reload_be): Rename to...
+       (aarch64_sve_reload_mem): ...this and enable for little-endian.
+       Use aarch64_sve_pred_mode to get the appropriate predicate mode.
+       (@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
+       (@aarch64_pred_mov<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
+       (*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
+       (@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
+       (@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
+       (*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
+       (maskload<SVE_FULL:mode><vpred>): Extend to...
+       (maskload<SVE_ALL:mode><vpred>): ...this.
+       (maskstore<SVE_FULL:mode><vpred>): Extend to...
+       (maskstore<SVE_ALL:mode><vpred>): ...this.
+       (vec_duplicate<SVE_FULL:mode>): Extend to...
+       (vec_duplicate<SVE_ALL:mode>): ...this.
+       (*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
+       (*vec_duplicate<SVE_ALL:mode>_reg): ...this.
+       (sve_ld1r<SVE_FULL:mode>): Extend to...
+       (sve_ld1r<SVE_ALL:mode>): ...this.
+       (vec_series<SVE_FULL_I:mode>): Extend to...
+       (vec_series<SVE_I:mode>): ...this.
+       (*vec_series<SVE_FULL_I:mode>_plus): Extend to...
+       (*vec_series<SVE_I:mode>_plus): ...this.
+       (@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
+       new VPRED ambiguity.
+       (@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
+       (add<SVE_FULL_I:mode>3): Extend to...
+       (add<SVE_I:mode>3): ...this.
+       * config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators.
+       (Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes.
+       (VPRED, vpred): Likewise.
+       (Vctype): New iterator.
+       (vw): Remove SVE modes.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * config/aarch64/iterators.md (SVE_PARTIAL): Rename to...
        (SVE_PARTIAL_I): ...this.
        * config/aarch64/aarch64-sve.md: Apply the above renaming throughout.
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index a9b1bce..3c698b6 100644
@@ -123,13 +123,18 @@ SVE_MODES (4, VNx64, VNx32, VNx16, VNx8)
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 4, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 8, 1);
 
 ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
 
 ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
 
 ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
 
@@ -139,8 +144,11 @@ ADJUST_ALIGNMENT (VNx8QI, 1);
 
 ADJUST_ALIGNMENT (VNx2HI, 2);
 ADJUST_ALIGNMENT (VNx4HI, 2);
+ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx4HF, 2);
 
 ADJUST_ALIGNMENT (VNx2SI, 4);
+ADJUST_ALIGNMENT (VNx2SF, 4);
 
 /* Quad float: 128-bit floating mode for long doubles.  */
 FLOAT_MODE (TF, 16, ieee_quad_format);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1d4f4fd..bcb3fd4 100644
@@ -512,6 +512,7 @@ bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
+machine_mode aarch64_sve_pred_mode (machine_mode);
 opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64);
 bool aarch64_sve_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5b71ab0..b43d4fb 100644
 ;; -------------------------------------------------------------------------
 
 (define_expand "mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-       (match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+       (match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
     /* Use the predicated load and store patterns where possible.
 )
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-       (match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+       (match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
    /* Equivalent to a normal move for our purposes.  */
   }
 )
 
-;; Unpredicated moves (bytes or little-endian).  Only allow memory operations
-;; during and after RA; before RA we want the predicated load and store
-;; patterns to be used instead.
-(define_insn "*aarch64_sve_mov<mode>_le"
+;; Unpredicated moves that can use LDR and STR, i.e. full vectors for which
+;; little-endian ordering is acceptable.  Only allow memory operations during
+;; and after RA; before RA we want the predicated load and store patterns to
+;; be used instead.
+(define_insn "*aarch64_sve_mov<mode>_ldr_str"
   [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
        (match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
   "TARGET_SVE
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Unpredicated moves (non-byte big-endian).  Memory accesses require secondary
-;; reloads.
-(define_insn "*aarch64_sve_mov<mode>_be"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w")
-       (match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))]
-  "TARGET_SVE && BYTES_BIG_ENDIAN && <MODE>mode != VNx16QImode"
+;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors
+;; or vectors for which little-endian ordering isn't acceptable.  Memory
+;; accesses require secondary reloads.
+(define_insn "*aarch64_sve_mov<mode>_no_ldr_str"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
+       (match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))]
+  "TARGET_SVE
+   && <MODE>mode != VNx16QImode
+   && (BYTES_BIG_ENDIAN
+       || maybe_ne (BYTES_PER_SVE_VECTOR, GET_MODE_SIZE (<MODE>mode)))"
   "@
    mov\t%0.d, %1.d
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Handle big-endian memory reloads.  We use byte PTRUE for all modes
-;; to try to encourage reuse.
-;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook.
-(define_expand "aarch64_sve_reload_be"
+;; Handle memory reloads for modes that can't use LDR and STR.  We use
+;; byte PTRUE for all modes to try to encourage reuse.  This pattern
+;; needs constraints because it is returned by TARGET_SECONDARY_RELOAD.
+(define_expand "aarch64_sve_reload_mem"
   [(parallel
      [(set (match_operand 0)
           (match_operand 1))
       (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])]
-  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "TARGET_SVE"
   {
     /* Create a PTRUE.  */
     emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
 
     /* Refer to the PTRUE in the appropriate mode for this move.  */
     machine_mode mode = GET_MODE (operands[0]);
-    machine_mode pred_mode
-      = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)).require ();
-    rtx pred = gen_lowpart (pred_mode, operands[2]);
+    rtx pred = gen_lowpart (aarch64_sve_pred_mode (mode), operands[2]);
 
     /* Emit a predicated load or store.  */
     aarch64_emit_sve_pred_move (operands[0], pred, operands[1]);
 ;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move,
 ;; so changes to this pattern will need changes there as well.
 (define_insn_and_split "@aarch64_pred_mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
-          (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")]
+          (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
          UNSPEC_PRED_X))]
   "TARGET_SVE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[2], <MODE>mode))"
   "@
    #
-   ld1<Vesize>\t%0.<Vetype>, %1/z, %2
-   st1<Vesize>\t%2.<Vetype>, %1, %0"
+   ld1<Vesize>\t%0.<Vctype>, %1/z, %2
+   st1<Vesize>\t%2.<Vctype>, %1, %0"
   "&& register_operand (operands[0], <MODE>mode)
    && register_operand (operands[2], <MODE>mode)"
   [(set (match_dup 0) (match_dup 2))]
 ;; for details.  We use a special predicate for operand 2 to reduce
 ;; the number of patterns.
 (define_insn_and_split "*aarch64_sve_mov<mode>_subreg_be"
-  [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:VNx16BI 1 "register_operand" "Upl")
           (match_operand 2 "aarch64_any_register_operand" "w")]
          UNSPEC_REV_SUBREG))]
 ;; This is equivalent to a subreg on little-endian targets but not for
 ;; big-endian; see the comment at the head of the file for details.
 (define_expand "@aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+       (unspec:SVE_ALL
          [(match_operand 1 "aarch64_any_register_operand")]
          UNSPEC_REINTERPRET))]
   "TARGET_SVE"
 ;; A pattern for handling type punning on big-endian targets.  We use a
 ;; special predicate for operand 1 to reduce the number of patterns.
 (define_insn_and_split "*aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand 1 "aarch64_any_register_operand" "w")]
          UNSPEC_REINTERPRET))]
   "TARGET_SVE"
 
 ;; Predicated LD1.
 (define_insn "maskload<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "memory_operand" "m")]
+          (match_operand:SVE_ALL 1 "memory_operand" "m")]
          UNSPEC_LD1_SVE))]
   "TARGET_SVE"
-  "ld1<Vesize>\t%0.<Vetype>, %2/z, %1"
+  "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
 )
 
 ;; Unpredicated LD[234].
 
 ;; Predicated ST1.
 (define_insn "maskstore<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "memory_operand" "+m")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "memory_operand" "+m")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "register_operand" "w")
+          (match_operand:SVE_ALL 1 "register_operand" "w")
           (match_dup 0)]
          UNSPEC_ST1_SVE))]
   "TARGET_SVE"
-  "st1<Vesize>\t%1.<Vetype>, %2, %0"
+  "st1<Vesize>\t%1.<Vctype>, %2, %0"
 )
 
 ;; Unpredicated ST[234].  This is always a full update, so the dependence
 
 (define_expand "vec_duplicate<mode>"
   [(parallel
-    [(set (match_operand:SVE_FULL 0 "register_operand")
-         (vec_duplicate:SVE_FULL
+    [(set (match_operand:SVE_ALL 0 "register_operand")
+         (vec_duplicate:SVE_ALL
            (match_operand:<VEL> 1 "aarch64_sve_dup_operand")))
      (clobber (scratch:VNx16BI))])]
   "TARGET_SVE"
 ;; the load at the first opportunity in order to allow the PTRUE to be
 ;; optimized with surrounding code.
 (define_insn_and_split "*vec_duplicate<mode>_reg"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w")
-       (vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
+       (vec_duplicate:SVE_ALL
          (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
    (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))]
   "TARGET_SVE"
 ;; be used by combine to optimize selects of a vec_duplicate<mode>
 ;; with zero.
 (define_insn "sve_ld1r<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 1 "register_operand" "Upl")
-          (vec_duplicate:SVE_FULL
+          (vec_duplicate:SVE_ALL
             (match_operand:<VEL> 2 "aarch64_sve_ld1r_operand" "Uty"))
-          (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")]
+          (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")]
          UNSPEC_SEL))]
   "TARGET_SVE"
   "ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
 ;; -------------------------------------------------------------------------
 
 (define_insn "vec_series<mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w")
-       (vec_series:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w")
+       (vec_series:SVE_I
          (match_operand:<VEL> 1 "aarch64_sve_index_operand" "Usi, r, r")
          (match_operand:<VEL> 2 "aarch64_sve_index_operand" "r, Usi, r")))]
   "TARGET_SVE"
   "@
-   index\t%0.<Vetype>, #%1, %<vw>2
-   index\t%0.<Vetype>, %<vw>1, #%2
-   index\t%0.<Vetype>, %<vw>1, %<vw>2"
+   index\t%0.<Vctype>, #%1, %<vwcore>2
+   index\t%0.<Vctype>, %<vwcore>1, #%2
+   index\t%0.<Vctype>, %<vwcore>1, %<vwcore>2"
 )
 
 ;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range
 ;; of an INDEX instruction.
 (define_insn "*vec_series<mode>_plus"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
-       (plus:SVE_FULL_I
-         (vec_duplicate:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w")
+       (plus:SVE_I
+         (vec_duplicate:SVE_I
            (match_operand:<VEL> 1 "register_operand" "r"))
-         (match_operand:SVE_FULL_I 2 "immediate_operand")))]
+         (match_operand:SVE_I 2 "immediate_operand")))]
   "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])"
   {
     operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]);
-    return "index\t%0.<Vetype>, %<vw>1, #%2";
+    return "index\t%0.<Vctype>, %<vwcore>1, #%2";
   }
 )
 
 (define_insn "@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
        (unspec:SVE_FULL_HSDI
-         [(match_operand:<VPRED> 1 "register_operand" "Upl")
+         [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl")
           (sign_extend:SVE_FULL_HSDI
             (truncate:SVE_PARTIAL_I
               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))]
 (define_insn "@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w")
        (unspec:SVE_FULL_HSDI
-         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
+         [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl")
           (sign_extend:SVE_FULL_HSDI
             (truncate:SVE_PARTIAL_I
               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")))
 ;; -------------------------------------------------------------------------
 
 (define_insn "add<mode>3"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
-       (plus:SVE_FULL_I
-         (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w")
-         (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
+       (plus:SVE_I
+         (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w, w, w")
+         (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
   "TARGET_SVE"
   "@
    add\t%0.<Vetype>, %0.<Vetype>, #%D2
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9ffe213..d175e1f 100644
@@ -1625,6 +1625,11 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_VNx4HImode:
     /* Partial SVE SI vector.  */
     case E_VNx2SImode:
+    /* Partial SVE HF vectors.  */
+    case E_VNx2HFmode:
+    case E_VNx4HFmode:
+    /* Partial SVE SF vector.  */
+    case E_VNx2SFmode:
       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
 
     case E_VNx16QImode:
@@ -1753,6 +1758,22 @@ aarch64_array_mode_supported_p (machine_mode mode,
   return false;
 }
 
+/* MODE is some form of SVE vector mode.  For data modes, return the number
+   of vector register bits that each element of MODE occupies, such as 64
+   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
+   in a 64-bit container).  For predicate modes, return the number of
+   data bits controlled by each significant predicate bit.  */
+
+static unsigned int
+aarch64_sve_container_bits (machine_mode mode)
+{
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
+                            ? BITS_PER_SVE_VECTOR
+                            : GET_MODE_BITSIZE (mode));
+  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
+}
+
 /* Return the SVE predicate mode to use for elements that have
    ELEM_NBYTES bytes, if such a mode exists.  */
 
@@ -1773,6 +1794,16 @@ aarch64_sve_pred_mode (unsigned int elem_nbytes)
   return opt_machine_mode ();
 }
 
+/* Return the SVE predicate mode that should be used to control
+   SVE mode MODE.  */
+
+machine_mode
+aarch64_sve_pred_mode (machine_mode mode)
+{
+  unsigned int bits = aarch64_sve_container_bits (mode);
+  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
+}
+
 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
 
 static opt_machine_mode
@@ -1780,7 +1811,7 @@ aarch64_get_mask_mode (machine_mode mode)
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
   if (vec_flags & VEC_SVE_DATA)
-    return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode));
+    return aarch64_sve_pred_mode (mode);
 
   return default_get_mask_mode (mode);
 }
@@ -1806,11 +1837,25 @@ aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
 static scalar_int_mode
 aarch64_sve_element_int_mode (machine_mode mode)
 {
-  unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
+  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+                            ? BITS_PER_SVE_VECTOR
+                            : GET_MODE_BITSIZE (mode));
+  unsigned int elt_bits = vector_element_size (vector_bits,
                                               GET_MODE_NUNITS (mode));
   return int_mode_for_size (elt_bits, 0).require ();
 }
 
+/* Return an integer element mode that contains exactly
+   aarch64_sve_container_bits (MODE) bits.  This is wider than
+   aarch64_sve_element_int_mode if MODE is a partial vector,
+   otherwise it's the same.  */
+
+static scalar_int_mode
+aarch64_sve_container_int_mode (machine_mode mode)
+{
+  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
+}
+
 /* Return the integer vector mode associated with SVE mode MODE.
    Unlike related_int_vector_mode, this can handle the case in which
    MODE is a predicate (and thus has a different total size).  */
@@ -1831,6 +1876,37 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
 
+  /* If we're operating on SVE vectors, try to return an SVE mode.  */
+  poly_uint64 sve_nunits;
+  if ((vec_flags & VEC_SVE_DATA)
+      && multiple_p (BYTES_PER_SVE_VECTOR,
+                    GET_MODE_SIZE (element_mode), &sve_nunits))
+    {
+      machine_mode sve_mode;
+      if (maybe_ne (nunits, 0U))
+       {
+         /* Try to find a full or partial SVE mode with exactly
+            NUNITS units.  */
+         if (multiple_p (sve_nunits, nunits)
+             && aarch64_sve_data_mode (element_mode,
+                                       nunits).exists (&sve_mode))
+           return sve_mode;
+       }
+      else
+       {
+         /* Take the preferred number of units from the number of bytes
+            that fit in VECTOR_MODE.  We always start by "autodetecting"
+            a full vector mode with preferred_simd_mode, so vectors
+            chosen here will also be full vector modes.  Then
+            autovectorize_vector_modes tries smaller starting modes
+            and thus smaller preferred numbers of units.  */
+         sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
+         if (aarch64_sve_data_mode (element_mode,
+                                    sve_nunits).exists (&sve_mode))
+           return sve_mode;
+       }
+    }
+
   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
   if ((vec_flags & VEC_ADVSIMD)
       && known_eq (nunits, 0U)
@@ -1907,11 +1983,6 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
     return mode == DImode;
 
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  /* At the moment, partial vector modes are only useful for memory
-     references, but that could change in future.  */
-  if (vec_flags & VEC_PARTIAL)
-    return false;
-
   if (vec_flags & VEC_SVE_PRED)
     return pr_or_ffr_regnum_p (regno);
 
@@ -4015,8 +4086,7 @@ aarch64_expand_sve_ld1rq (rtx dest, rtx src)
     }
 
   machine_mode mode = GET_MODE (dest);
-  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
-  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
   rtx ptrue = aarch64_ptrue_reg (pred_mode);
   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
   return true;
@@ -4037,7 +4107,26 @@ aarch64_expand_sve_const_vector (rtx target, rtx src)
   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
   scalar_mode elt_mode = GET_MODE_INNER (mode);
   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
-  unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+  unsigned int container_bits = aarch64_sve_container_bits (mode);
+  unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
+
+  if (nelts_per_pattern == 1
+      && encoded_bits <= 128
+      && container_bits != elt_bits)
+    {
+      /* We have a partial vector mode and a constant whose full-vector
+        equivalent would occupy a repeating 128-bit sequence.  Build that
+        full-vector equivalent instead, so that we have the option of
+        using LD1RQ and Advanced SIMD operations.  */
+      unsigned int repeat = container_bits / elt_bits;
+      machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
+      rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
+      for (unsigned int i = 0; i < npatterns; ++i)
+       for (unsigned int j = 0; j < repeat; ++j)
+         builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
+      target = aarch64_target_reg (target, full_mode);
+      return aarch64_expand_sve_const_vector (target, builder.build ());
+    }
 
   if (nelts_per_pattern == 1 && encoded_bits == 128)
     {
@@ -4730,8 +4819,7 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
 
   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
-  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
-  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
 
   /* Get the operands in the appropriate modes and emit the instruction.  */
   ptrue = gen_lowpart (pred_mode, ptrue);
@@ -9971,19 +10059,21 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
                          machine_mode mode,
                          secondary_reload_info *sri)
 {
-  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
-     directly by the *aarch64_sve_mov<mode>_[lb]e move patterns.  See the
-     comment at the head of aarch64-sve.md for more details about the
-     big-endian handling.  */
-  if (BYTES_BIG_ENDIAN
-      && reg_class_subset_p (rclass, FP_REGS)
+  /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
+     LDR and STR.  See the comment at the head of aarch64-sve.md for
+     more details about the big-endian handling.  */
+  if (reg_class_subset_p (rclass, FP_REGS)
       && !((REG_P (x) && HARD_REGISTER_P (x))
           || aarch64_simd_valid_immediate (x, NULL))
-      && mode != VNx16QImode
-      && aarch64_sve_data_mode_p (mode))
+      && mode != VNx16QImode)
     {
-      sri->icode = CODE_FOR_aarch64_sve_reload_be;
-      return NO_REGS;
+      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+      if ((vec_flags & VEC_SVE_DATA)
+         && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+       {
+         sri->icode = CODE_FOR_aarch64_sve_reload_mem;
+         return NO_REGS;
+       }
     }
 
   /* If we have to disable direct literal pool loads and stores because the
@@ -15837,7 +15927,7 @@ static bool
 aarch64_vector_mode_supported_p (machine_mode mode)
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
+  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
 }
 
 /* Return the full-width SVE vector mode for element mode MODE, if one
@@ -15938,29 +16028,72 @@ aarch64_preferred_simd_mode (scalar_mode mode)
 static unsigned int
 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
 {
-  if (TARGET_SVE)
-    modes->safe_push (VNx16QImode);
+  static const machine_mode sve_modes[] = {
+    /* Try using full vectors for all element types.  */
+    VNx16QImode,
+
+    /* Try using 16-bit containers for 8-bit elements and full vectors
+       for wider elements.  */
+    VNx8QImode,
+
+    /* Try using 32-bit containers for 8-bit and 16-bit elements and
+       full vectors for wider elements.  */
+    VNx4QImode,
 
-  /* Try using 128-bit vectors for all element types.  */
-  modes->safe_push (V16QImode);
+    /* Try using 64-bit containers for all element types.  */
+    VNx2QImode
+  };
+
+  static const machine_mode advsimd_modes[] = {
+    /* Try using 128-bit vectors for all element types.  */
+    V16QImode,
+
+    /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
+       for wider elements.  */
+    V8QImode,
+
+    /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
+       for wider elements.
+
+       TODO: We could support a limited form of V4QImode too, so that
+       we use 32-bit vectors for 8-bit elements.  */
+    V4HImode,
+
+    /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
+       for 64-bit elements.
 
-  /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
-     for wider elements.  */
-  modes->safe_push (V8QImode);
+       TODO: We could similarly support limited forms of V2QImode and V2HImode
+       for this case.  */
+    V2SImode
+  };
 
-  /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
-     for wider elements.
+  /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
+     This is because:
 
-     TODO: We could support a limited form of V4QImode too, so that
-     we use 32-bit vectors for 8-bit elements.  */
-  modes->safe_push (V4HImode);
+     - If we can't use N-byte Advanced SIMD vectors then the placement
+       doesn't matter; we'll just continue as though the Advanced SIMD
+       entry didn't exist.
 
-  /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
-     for 64-bit elements.
+     - If an SVE main loop with N bytes ends up being cheaper than an
+       Advanced SIMD main loop with N bytes then by default we'll replace
+       the Advanced SIMD version with the SVE one.
 
-     TODO: We could similarly support limited forms of V2QImode and V2HImode
-     for this case.  */
-  modes->safe_push (V2SImode);
+     - If an Advanced SIMD main loop with N bytes ends up being cheaper
+       than an SVE main loop with N bytes then by default we'll try to
+       use the SVE loop to vectorize the epilogue instead.  */
+  unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
+  unsigned int advsimd_i = 0;
+  while (advsimd_i < ARRAY_SIZE (advsimd_modes))
+    {
+      if (sve_i < ARRAY_SIZE (sve_modes)
+         && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
+                      GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
+       modes->safe_push (sve_modes[sve_i++]);
+      else
+       modes->safe_push (advsimd_modes[advsimd_i++]);
+    }
+  while (sve_i < ARRAY_SIZE (sve_modes))
+    modes->safe_push (sve_modes[sve_i++]);
 
   unsigned int flags = 0;
   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
@@ -16507,7 +16640,14 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
        return false;
 
       if (info)
-       *info = simd_immediate_info (elt_mode, base, step);
+       {
+         /* Get the corresponding container mode.  E.g. an INDEX on V2SI
+            should yield two integer values per 128-bit block, meaning
+            that we need to treat it in the same way as V2DI and then
+            ignore the upper 32 bits of each element.  */
+         elt_mode = aarch64_sve_container_int_mode (mode);
+         *info = simd_immediate_info (elt_mode, base, step);
+       }
       return true;
     }
   else if (GET_CODE (op) == CONST_VECTOR
@@ -16976,9 +17116,9 @@ aarch64_simd_vector_alignment (const_tree type)
      direct way we have of identifying real SVE predicate types.  */
   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
     return 16;
-  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
-    return 128;
-  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
+  widest_int min_size
+    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
+  return wi::umin (min_size, 128).to_uhwi ();
 }
 
 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
@@ -19154,7 +19294,7 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
 
   rtx_vector_builder builder (pred_mode, n_patterns, 2);
   for (int i = 0; i < n_patterns * 2; i++)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fc27179..4c9035f 100644
                                     VNx4HI VNx2HI
                                     VNx2SI])
 
+;; All SVE vector modes.
+(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
+                              VNx8HI VNx4HI VNx2HI
+                              VNx8HF VNx4HF VNx2HF
+                              VNx4SI VNx2SI
+                              VNx4SF VNx2SF
+                              VNx2DI
+                              VNx2DF])
+
+;; All SVE integer vector modes.
+(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
+                            VNx8HI VNx4HI VNx2HI
+                            VNx4SI VNx2SI
+                            VNx2DI])
+
 ;; Modes involved in extending or truncating SVE data, for 8 elements per
 ;; 128-bit block.
 (define_mode_iterator VNx8_NARROW [VNx8QI])
                           (HI   "")])
 
 ;; Mode-to-individual element type mapping.
-(define_mode_attr Vetype [(V8QI "b") (V16QI "b") (VNx16QI "b") (VNx16BI "b")
-                         (V4HI "h") (V8HI  "h") (VNx8HI  "h") (VNx8BI  "h")
-                         (V2SI "s") (V4SI  "s") (VNx4SI  "s") (VNx4BI  "s")
-                         (V2DI "d")             (VNx2DI  "d") (VNx2BI  "d")
-                         (V4HF "h") (V8HF  "h") (VNx8HF  "h")
-                         (V2SF "s") (V4SF  "s") (VNx4SF  "s")
-                         (V2DF "d")             (VNx2DF  "d")
-                         (HF   "h")
-                         (SF   "s") (DF  "d")
-                         (QI "b")   (HI "h")
-                         (SI "s")   (DI "d")])
+(define_mode_attr Vetype [(V8QI "b") (V16QI "b")
+                         (V4HI "h") (V8HI  "h")
+                         (V2SI "s") (V4SI  "s")
+                         (V2DI "d")
+                         (V4HF "h") (V8HF  "h")
+                         (V2SF "s") (V4SF  "s")
+                         (V2DF "d")
+                         (VNx16BI "b") (VNx8BI "h") (VNx4BI "s") (VNx2BI "d")
+                         (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+                         (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+                         (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+                         (VNx4SI "s") (VNx2SI "s")
+                         (VNx4SF "s") (VNx2SF "s")
+                         (VNx2DI "d")
+                         (VNx2DF "d")
+                         (HF "h")
+                         (SF "s") (DF "d")
+                         (QI "b") (HI "h")
+                         (SI "s") (DI "d")])
 
 ;; Like Vetype, but map to types that are a quarter of the element size.
 (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
 
 ;; Equivalent of "size" for a vector element.
-(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI  "b")
-                         (VNx4QI  "b") (VNx2QI  "b")
-                         (VNx8HI  "h") (VNx4HI  "h")
-                         (VNx2HI  "h") (VNx8HF  "h")
-                         (VNx4SI  "w") (VNx2SI  "w") (VNx4SF  "w")
-                         (VNx2DI  "d") (VNx2DF  "d")
+(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+                         (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+                         (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+                         (VNx4SI "w") (VNx2SI "w")
+                         (VNx4SF "w") (VNx2SF "w")
+                         (VNx2DI "d")
+                         (VNx2DF "d")
                          (VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
                          (VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
                          (VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
                          (VNx4DI  "d") (VNx6DI  "d") (VNx8DI  "d")
                          (VNx4DF  "d") (VNx6DF  "d") (VNx8DF  "d")])
 
+;; The Z register suffix for an SVE mode's element container, i.e. the
+;; Vetype of full SVE modes that have the same number of elements.
+(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
+                         (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
+                         (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
+                         (VNx4SI "s") (VNx2SI "d")
+                         (VNx4SF "s") (VNx2SF "d")
+                         (VNx2DI "d")
+                         (VNx2DF "d")])
+
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
 ;; instructions.  stype is defined to represent scheduling type
                          (SI   "8b")  (SF    "8b")])
 
 ;; Define element mode for each vector mode.
-(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI") (VNx16QI "QI")
-                       (V4HI "HI") (V8HI  "HI") (VNx8HI  "HI")
-                       (V2SI "SI") (V4SI  "SI") (VNx4SI  "SI")
-                       (DI   "DI") (V2DI  "DI") (VNx2DI  "DI")
-                       (V4HF "HF") (V8HF  "HF") (VNx8HF  "HF")
-                       (V2SF "SF") (V4SF  "SF") (VNx4SF  "SF")
-                       (DF   "DF") (V2DF  "DF") (VNx2DF  "DF")
-                       (SI   "SI") (HI    "HI")
-                       (QI   "QI")])
+(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
+                      (V4HI "HI") (V8HI  "HI")
+                      (V2SI "SI") (V4SI  "SI")
+                      (DI   "DI") (V2DI  "DI")
+                      (V4HF "HF") (V8HF  "HF")
+                      (V2SF "SF") (V4SF  "SF")
+                      (DF   "DF") (V2DF  "DF")
+                      (SI   "SI") (HI    "HI")
+                      (QI   "QI")
+                      (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
+                      (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
+                      (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
+                      (VNx4SI "SI") (VNx2SI "SI")
+                      (VNx4SF "SF") (VNx2SF "SF")
+                      (VNx2DI "DI")
+                      (VNx2DF "DF")])
 
 ;; Define element mode for each vector mode (lower case).
-(define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi")
-                       (V4HI "hi") (V8HI "hi") (VNx8HI  "hi")
-                       (V2SI "si") (V4SI "si") (VNx4SI  "si")
-                       (DI "di")   (V2DI "di") (VNx2DI  "di")
-                       (V4HF "hf") (V8HF "hf") (VNx8HF  "hf")
-                       (V2SF "sf") (V4SF "sf") (VNx4SF  "sf")
-                       (V2DF "df") (DF "df")   (VNx2DF  "df")
-                       (SI   "si") (HI   "hi")
-                       (QI   "qi")])
+(define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
+                      (V4HI "hi") (V8HI "hi")
+                      (V2SI "si") (V4SI "si")
+                      (DI   "di") (V2DI "di")
+                      (V4HF "hf") (V8HF "hf")
+                      (V2SF "sf") (V4SF "sf")
+                      (V2DF "df") (DF   "df")
+                      (SI   "si") (HI   "hi")
+                      (QI   "qi")
+                      (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
+                      (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
+                      (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
+                      (VNx4SI "si") (VNx2SI "si")
+                      (VNx4SF "sf") (VNx2SF "sf")
+                      (VNx2DI "di")
+                      (VNx2DF "df")])
 
 ;; Element mode with floating-point values replaced by like-sized integers.
 (define_mode_attr VEL_INT [(VNx16QI "QI")
                             (V4SF "2s")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
-(define_mode_attr vw [(V8QI "w") (V16QI "w") (VNx16QI "w")
-                     (V4HI "w") (V8HI "w") (VNx8HI "w")
-                     (V2SI "w") (V4SI "w") (VNx4SI "w")
-                     (DI   "x") (V2DI "x") (VNx2DI "x")
-                     (VNx8HF "h")
-                     (V2SF "s") (V4SF "s") (VNx4SF "s")
-                     (V2DF "d") (VNx2DF "d")])
+(define_mode_attr vw [(V8QI "w") (V16QI "w")
+                     (V4HI "w") (V8HI "w")
+                     (V2SI "w") (V4SI "w")
+                     (DI   "x") (V2DI "x")
+                     (V2SF "s") (V4SF "s")
+                     (V2DF "d")])
 
 ;; Corresponding core element mode for each vector mode.  This is a
 ;; variation on <vw> mapping FP modes to GP regs.
-(define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w")
-                         (V4HI "w") (V8HI "w") (VNx8HI "w")
-                         (V2SI "w") (V4SI "w") (VNx4SI "w")
-                         (DI   "x") (V2DI "x") (VNx2DI "x")
-                         (V4HF "w") (V8HF "w") (VNx8HF "w")
-                         (V2SF "w") (V4SF "w") (VNx4SF "w")
-                         (V2DF "x") (VNx2DF "x")])
+(define_mode_attr vwcore [(V8QI "w") (V16QI "w")
+                         (V4HI "w") (V8HI "w")
+                         (V2SI "w") (V4SI "w")
+                         (DI   "x") (V2DI "x")
+                         (V4HF "w") (V8HF "w")
+                         (V2SF "w") (V4SF "w")
+                         (V2DF "x")
+                         (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
+                         (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
+                         (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
+                         (VNx4SI "w") (VNx2SI "w")
+                         (VNx4SF "w") (VNx2SF "w")
+                         (VNx2DI "x")
+                         (VNx2DF "x")])
 
 ;; Double vector types for ALLX.
 (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")])
 
 ;; The predicate mode associated with an SVE data mode.  For structure modes
 ;; this is equivalent to the <VPRED> of the subvector mode.
-(define_mode_attr VPRED [(VNx16QI "VNx16BI")
-                        (VNx8HI "VNx8BI") (VNx8HF "VNx8BI")
-                        (VNx4SI "VNx4BI") (VNx4SF "VNx4BI")
-                        (VNx2DI "VNx2BI") (VNx2DF "VNx2BI")
+(define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
+                        (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
+                        (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
+                        (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
+                        (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
+                        (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
+                        (VNx2DI "VNx2BI")
+                        (VNx2DF "VNx2BI")
                         (VNx32QI "VNx16BI")
                         (VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
                         (VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
                         (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
 
 ;; ...and again in lower case.
-(define_mode_attr vpred [(VNx16QI "vnx16bi")
-                        (VNx8HI "vnx8bi") (VNx8HF "vnx8bi")
-                        (VNx4SI "vnx4bi") (VNx4SF "vnx4bi")
-                        (VNx2DI "vnx2bi") (VNx2DF "vnx2bi")
+(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
+                        (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
+                        (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
+                        (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
+                        (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
+                        (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
+                        (VNx2DI "vnx2bi")
+                        (VNx2DF "vnx2bi")
                         (VNx32QI "vnx16bi")
                         (VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
                         (VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8b48698..57505e9 100644
@@ -1,5 +1,13 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * gcc.target/aarch64/sve/mixed_size_1.c: New test.
+       * gcc.target/aarch64/sve/mixed_size_2.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_3.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_4.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_5.c: Likewise.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * gcc.target/aarch64/sve/clastb_8.c: Use assembly tests to
        check for fully-masked loops.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c
new file mode 100644
index 0000000..a5659b6
--- /dev/null
@@ -0,0 +1,39 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 *restrict src2,      \
+                      int n)                                           \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = src2[i];                                              \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c
new file mode 100644
index 0000000..34b58e3
--- /dev/null
@@ -0,0 +1,41 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, int n)                     \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = 1;                                                    \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #1\.0} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #1\.0} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c
new file mode 100644
index 0000000..9ae3e7b
--- /dev/null
@@ -0,0 +1,41 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 src2, int n)         \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = src2;                                                 \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, h0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, s0\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c
new file mode 100644
index 0000000..4c475fb
--- /dev/null
@@ -0,0 +1,43 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 n)                   \
+  {                                                                    \
+    for (TYPE2 i = 0; i < n; ++i)                                      \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = i;                                                    \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint64_t, uint32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-not {\tindex\tz[0-9]+\.b,} } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.h, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, #0, #1\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tcntb\t} } } */
+/* { dg-final { scan-assembler-times {\tcnth\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntw\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntd\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c
new file mode 100644
index 0000000..83be00f
--- /dev/null
@@ -0,0 +1,42 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 *restrict src2,      \
+                      int n)                                           \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i * 2] = src1[i * 2] + 1;                                  \
+       dst1[i * 2 + 1] = src1[i * 2 + 1] + 1;                          \
+       dst2[i * 2] = 2;                                                \
+       dst2[i * 2 + 1] = 3;                                            \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */