[AArch64] Add autovec support for partial SVE vectors
Author:    Richard Sandiford <richard.sandiford@arm.com>
Date:      Sat, 16 Nov 2019 11:02:09 +0000
Committer: Richard Sandiford <rsandifo@gcc.gnu.org>
Date:      Sat, 16 Nov 2019 11:02:09 +0000
This patch adds the bare minimum needed to support autovectorisation of
partial SVE vectors, namely moves and integer addition.  Later patches
add more interesting cases.
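
For example, the loop below is a single instantiation (uint32_t with
uint16_t) of the TEST_LOOP macro from the new mixed_size_1.c test.  With
this patch the vectoriser can keep both statements in one loop, using
full .s vectors for the uint32_t accesses and unpacked vectors for the
uint16_t accesses, whose 16-bit elements occupy 32-bit containers and
are loaded and stored with ld1h and st1h on .s registers.  This is a
sketch of the intent checked by the test's scan-assembler patterns,
not a guaranteed code-generation outcome; the exact output depends on
the target options and vector costs:

    /* Compiled with the test's options, e.g.
       -O2 -ftree-vectorize -fno-tree-loop-distribute-patterns,
       for an SVE target.  */
    #include <stdint.h>

    void
    f_uint32_t_uint16_t (uint32_t *restrict dst1, uint32_t *restrict src1,
                         uint16_t *restrict dst2, uint16_t *restrict src2,
                         int n)
    {
      for (int i = 0; i < n; ++i)
        {
          dst1[i] += src1[i];  /* full vectors: ld1w/st1w on .s registers  */
          dst2[i] = src2[i];   /* unpacked vectors: ld1h/st1h on .s registers  */
        }
    }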

2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
* config/aarch64/aarch64-modes.def: Define partial SVE vector
float modes.
* config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
function.
* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
new vector float modes.
(aarch64_sve_container_bits): New function.
(aarch64_sve_pred_mode): Likewise.
(aarch64_get_mask_mode): Use it.
(aarch64_sve_element_int_mode): Handle structure modes and partial
modes.
(aarch64_sve_container_int_mode): New function.
(aarch64_vectorize_related_mode): Return SVE modes when given
SVE modes.  Handle partial modes, taking the preferred number
of units from the size of the given mode.
(aarch64_hard_regno_mode_ok): Allow partial modes to be stored
in registers.
(aarch64_expand_sve_ld1rq): Use the mode form of aarch64_sve_pred_mode.
(aarch64_expand_sve_const_vector): Handle partial SVE vectors.
(aarch64_split_sve_subreg_move): Use the mode form of
aarch64_sve_pred_mode.
(aarch64_secondary_reload): Handle partial modes in the same way
as full big-endian vectors.
(aarch64_vector_mode_supported_p): Allow partial SVE vectors.
(aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
merging with the Advanced SIMD modes.  If two modes have the
same size, try the Advanced SIMD mode first.
(aarch64_simd_valid_immediate): Use the container rather than
the element mode for INDEX constants.
(aarch64_simd_vector_alignment): Make the alignment of partial
SVE vector modes the same as their minimum size.
(aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
* config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
(mov<SVE_ALL:mode>): ...this.
(movmisalign<SVE_FULL:mode>): Extend to...
(movmisalign<SVE_ALL:mode>): ...this.
(*aarch64_sve_mov<mode>_le): Rename to...
(*aarch64_sve_mov<mode>_ldr_str): ...this.
(*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
(*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
partial modes regardless of endianness.
(aarch64_sve_reload_be): Rename to...
(aarch64_sve_reload_mem): ...this and enable for little-endian.
Use aarch64_sve_pred_mode to get the appropriate predicate mode.
(@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
(@aarch64_pred_mov<SVE_ALL:mode>): ...this.
(*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
(*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
(@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
(@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
(*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
(*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
(maskload<SVE_FULL:mode><vpred>): Extend to...
(maskload<SVE_ALL:mode><vpred>): ...this.
(maskstore<SVE_FULL:mode><vpred>): Extend to...
(maskstore<SVE_ALL:mode><vpred>): ...this.
(vec_duplicate<SVE_FULL:mode>): Extend to...
(vec_duplicate<SVE_ALL:mode>): ...this.
(*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
(*vec_duplicate<SVE_ALL:mode>_reg): ...this.
(sve_ld1r<SVE_FULL:mode>): Extend to...
(sve_ld1r<SVE_ALL:mode>): ...this.
(vec_series<SVE_FULL_I:mode>): Extend to...
(vec_series<SVE_I:mode>): ...this.
(*vec_series<SVE_FULL_I:mode>_plus): Extend to...
(*vec_series<SVE_I:mode>_plus): ...this.
(@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
new VPRED ambiguity.
(@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
(add<SVE_FULL_I:mode>3): Extend to...
(add<SVE_I:mode>3): ...this.
* config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators.
(Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes.
(VPRED, vpred): Likewise.
(Vctype): New iterator.
(vw): Remove SVE modes.

gcc/testsuite/
* gcc.target/aarch64/sve/mixed_size_1.c: New test.
* gcc.target/aarch64/sve/mixed_size_2.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_3.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_4.c: Likewise.
* gcc.target/aarch64/sve/mixed_size_5.c: Likewise.

From-SVN: r278341

12 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-modes.def
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c [new file with mode: 0644]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b7e46cf..afb995f 100644
@@ -1,5 +1,84 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * config/aarch64/aarch64-modes.def: Define partial SVE vector
+       float modes.
+       * config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
+       function.
+       * config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
+       new vector float modes.
+       (aarch64_sve_container_bits): New function.
+       (aarch64_sve_pred_mode): Likewise.
+       (aarch64_get_mask_mode): Use it.
+       (aarch64_sve_element_int_mode): Handle structure modes and partial
+       modes.
+       (aarch64_sve_container_int_mode): New function.
+       (aarch64_vectorize_related_mode): Return SVE modes when given
+       SVE modes.  Handle partial modes, taking the preferred number
+       of units from the size of the given mode.
+       (aarch64_hard_regno_mode_ok): Allow partial modes to be stored
+       in registers.
+       (aarch64_expand_sve_ld1rq): Use the mode form of aarch64_sve_pred_mode.
+       (aarch64_expand_sve_const_vector): Handle partial SVE vectors.
+       (aarch64_split_sve_subreg_move): Use the mode form of
+       aarch64_sve_pred_mode.
+       (aarch64_secondary_reload): Handle partial modes in the same way
+       as full big-endian vectors.
+       (aarch64_vector_mode_supported_p): Allow partial SVE vectors.
+       (aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
+       merging with the Advanced SIMD modes.  If two modes have the
+       same size, try the Advanced SIMD mode first.
+       (aarch64_simd_valid_immediate): Use the container rather than
+       the element mode for INDEX constants.
+       (aarch64_simd_vector_alignment): Make the alignment of partial
+       SVE vector modes the same as their minimum size.
+       (aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
+       * config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
+       (mov<SVE_ALL:mode>): ...this.
+       (movmisalign<SVE_FULL:mode>): Extend to...
+       (movmisalign<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_mov<mode>_le): Rename to...
+       (*aarch64_sve_mov<mode>_ldr_str): ...this.
+       (*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
+       (*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
+       partial modes regardless of endianness.
+       (aarch64_sve_reload_be): Rename to...
+       (aarch64_sve_reload_mem): ...this and enable for little-endian.
+       Use aarch64_sve_pred_mode to get the appropriate predicate mode.
+       (@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
+       (@aarch64_pred_mov<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
+       (*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
+       (@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
+       (@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
+       (*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
+       (*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
+       (maskload<SVE_FULL:mode><vpred>): Extend to...
+       (maskload<SVE_ALL:mode><vpred>): ...this.
+       (maskstore<SVE_FULL:mode><vpred>): Extend to...
+       (maskstore<SVE_ALL:mode><vpred>): ...this.
+       (vec_duplicate<SVE_FULL:mode>): Extend to...
+       (vec_duplicate<SVE_ALL:mode>): ...this.
+       (*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
+       (*vec_duplicate<SVE_ALL:mode>_reg): ...this.
+       (sve_ld1r<SVE_FULL:mode>): Extend to...
+       (sve_ld1r<SVE_ALL:mode>): ...this.
+       (vec_series<SVE_FULL_I:mode>): Extend to...
+       (vec_series<SVE_I:mode>): ...this.
+       (*vec_series<SVE_FULL_I:mode>_plus): Extend to...
+       (*vec_series<SVE_I:mode>_plus): ...this.
+       (@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
+       new VPRED ambiguity.
+       (@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
+       (add<SVE_FULL_I:mode>3): Extend to...
+       (add<SVE_I:mode>3): ...this.
+       * config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators.
+       (Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes.
+       (VPRED, vpred): Likewise.
+       (Vctype): New iterator.
+       (vw): Remove SVE modes.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * config/aarch64/iterators.md (SVE_PARTIAL): Rename to...
        (SVE_PARTIAL_I): ...this.
        * config/aarch64/aarch64-sve.md: Apply the above renaming throughout.
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index a9b1bce..3c698b6 100644
@@ -123,13 +123,18 @@ SVE_MODES (4, VNx64, VNx32, VNx16, VNx8)
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 4, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 8, 1);
 
 ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
 
 ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
 
 ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
 
@@ -139,8 +144,11 @@ ADJUST_ALIGNMENT (VNx8QI, 1);
 
 ADJUST_ALIGNMENT (VNx2HI, 2);
 ADJUST_ALIGNMENT (VNx4HI, 2);
+ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx4HF, 2);
 
 ADJUST_ALIGNMENT (VNx2SI, 4);
+ADJUST_ALIGNMENT (VNx2SF, 4);
 
 /* Quad float: 128-bit floating mode for long doubles.  */
 FLOAT_MODE (TF, 16, ieee_quad_format);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1d4f4fd..bcb3fd4 100644
@@ -512,6 +512,7 @@ bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
+machine_mode aarch64_sve_pred_mode (machine_mode);
 opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64);
 bool aarch64_sve_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5b71ab0..b43d4fb 100644
 ;; -------------------------------------------------------------------------
 
 (define_expand "mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-       (match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+       (match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
     /* Use the predicated load and store patterns where possible.
 )
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-       (match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+       (match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
    /* Equivalent to a normal move for our purposes.  */
   }
 )
 
-;; Unpredicated moves (bytes or little-endian).  Only allow memory operations
-;; during and after RA; before RA we want the predicated load and store
-;; patterns to be used instead.
-(define_insn "*aarch64_sve_mov<mode>_le"
+;; Unpredicated moves that can use LDR and STR, i.e. full vectors for which
+;; little-endian ordering is acceptable.  Only allow memory operations during
+;; and after RA; before RA we want the predicated load and store patterns to
+;; be used instead.
+(define_insn "*aarch64_sve_mov<mode>_ldr_str"
   [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
        (match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
   "TARGET_SVE
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Unpredicated moves (non-byte big-endian).  Memory accesses require secondary
-;; reloads.
-(define_insn "*aarch64_sve_mov<mode>_be"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w")
-       (match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))]
-  "TARGET_SVE && BYTES_BIG_ENDIAN && <MODE>mode != VNx16QImode"
+;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors
+;; or vectors for which little-endian ordering isn't acceptable.  Memory
+;; accesses require secondary reloads.
+(define_insn "*aarch64_sve_mov<mode>_no_ldr_str"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
+       (match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))]
+  "TARGET_SVE
+   && <MODE>mode != VNx16QImode
+   && (BYTES_BIG_ENDIAN
+       || maybe_ne (BYTES_PER_SVE_VECTOR, GET_MODE_SIZE (<MODE>mode)))"
   "@
    mov\t%0.d, %1.d
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Handle big-endian memory reloads.  We use byte PTRUE for all modes
-;; to try to encourage reuse.
-;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook.
-(define_expand "aarch64_sve_reload_be"
+;; Handle memory reloads for modes that can't use LDR and STR.  We use
+;; byte PTRUE for all modes to try to encourage reuse.  This pattern
+;; needs constraints because it is returned by TARGET_SECONDARY_RELOAD.
+(define_expand "aarch64_sve_reload_mem"
   [(parallel
      [(set (match_operand 0)
           (match_operand 1))
       (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])]
-  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "TARGET_SVE"
   {
     /* Create a PTRUE.  */
     emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
 
     /* Refer to the PTRUE in the appropriate mode for this move.  */
     machine_mode mode = GET_MODE (operands[0]);
-    machine_mode pred_mode
-      = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)).require ();
-    rtx pred = gen_lowpart (pred_mode, operands[2]);
+    rtx pred = gen_lowpart (aarch64_sve_pred_mode (mode), operands[2]);
 
     /* Emit a predicated load or store.  */
     aarch64_emit_sve_pred_move (operands[0], pred, operands[1]);
 ;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move,
 ;; so changes to this pattern will need changes there as well.
 (define_insn_and_split "@aarch64_pred_mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
-          (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")]
+          (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
          UNSPEC_PRED_X))]
   "TARGET_SVE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[2], <MODE>mode))"
   "@
    #
-   ld1<Vesize>\t%0.<Vetype>, %1/z, %2
-   st1<Vesize>\t%2.<Vetype>, %1, %0"
+   ld1<Vesize>\t%0.<Vctype>, %1/z, %2
+   st1<Vesize>\t%2.<Vctype>, %1, %0"
   "&& register_operand (operands[0], <MODE>mode)
    && register_operand (operands[2], <MODE>mode)"
   [(set (match_dup 0) (match_dup 2))]
 ;; for details.  We use a special predicate for operand 2 to reduce
 ;; the number of patterns.
 (define_insn_and_split "*aarch64_sve_mov<mode>_subreg_be"
-  [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:VNx16BI 1 "register_operand" "Upl")
           (match_operand 2 "aarch64_any_register_operand" "w")]
          UNSPEC_REV_SUBREG))]
 ;; This is equivalent to a subreg on little-endian targets but not for
 ;; big-endian; see the comment at the head of the file for details.
 (define_expand "@aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+       (unspec:SVE_ALL
          [(match_operand 1 "aarch64_any_register_operand")]
          UNSPEC_REINTERPRET))]
   "TARGET_SVE"
 ;; A pattern for handling type punning on big-endian targets.  We use a
 ;; special predicate for operand 1 to reduce the number of patterns.
 (define_insn_and_split "*aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand 1 "aarch64_any_register_operand" "w")]
          UNSPEC_REINTERPRET))]
   "TARGET_SVE"
 
 ;; Predicated LD1.
 (define_insn "maskload<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "memory_operand" "m")]
+          (match_operand:SVE_ALL 1 "memory_operand" "m")]
          UNSPEC_LD1_SVE))]
   "TARGET_SVE"
-  "ld1<Vesize>\t%0.<Vetype>, %2/z, %1"
+  "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
 )
 
 ;; Unpredicated LD[234].
 
 ;; Predicated ST1.
 (define_insn "maskstore<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "memory_operand" "+m")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "memory_operand" "+m")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 2 "register_operand" "Upl")
-          (match_operand:SVE_FULL 1 "register_operand" "w")
+          (match_operand:SVE_ALL 1 "register_operand" "w")
           (match_dup 0)]
          UNSPEC_ST1_SVE))]
   "TARGET_SVE"
-  "st1<Vesize>\t%1.<Vetype>, %2, %0"
+  "st1<Vesize>\t%1.<Vctype>, %2, %0"
 )
 
 ;; Unpredicated ST[234].  This is always a full update, so the dependence
 
 (define_expand "vec_duplicate<mode>"
   [(parallel
-    [(set (match_operand:SVE_FULL 0 "register_operand")
-         (vec_duplicate:SVE_FULL
+    [(set (match_operand:SVE_ALL 0 "register_operand")
+         (vec_duplicate:SVE_ALL
            (match_operand:<VEL> 1 "aarch64_sve_dup_operand")))
      (clobber (scratch:VNx16BI))])]
   "TARGET_SVE"
 ;; the load at the first opportunity in order to allow the PTRUE to be
 ;; optimized with surrounding code.
 (define_insn_and_split "*vec_duplicate<mode>_reg"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w")
-       (vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
+       (vec_duplicate:SVE_ALL
          (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
    (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))]
   "TARGET_SVE"
 ;; be used by combine to optimize selects of a vec_duplicate<mode>
 ;; with zero.
 (define_insn "sve_ld1r<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
          [(match_operand:<VPRED> 1 "register_operand" "Upl")
-          (vec_duplicate:SVE_FULL
+          (vec_duplicate:SVE_ALL
             (match_operand:<VEL> 2 "aarch64_sve_ld1r_operand" "Uty"))
-          (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")]
+          (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")]
          UNSPEC_SEL))]
   "TARGET_SVE"
   "ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
 ;; -------------------------------------------------------------------------
 
 (define_insn "vec_series<mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w")
-       (vec_series:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w")
+       (vec_series:SVE_I
          (match_operand:<VEL> 1 "aarch64_sve_index_operand" "Usi, r, r")
          (match_operand:<VEL> 2 "aarch64_sve_index_operand" "r, Usi, r")))]
   "TARGET_SVE"
   "@
-   index\t%0.<Vetype>, #%1, %<vw>2
-   index\t%0.<Vetype>, %<vw>1, #%2
-   index\t%0.<Vetype>, %<vw>1, %<vw>2"
+   index\t%0.<Vctype>, #%1, %<vwcore>2
+   index\t%0.<Vctype>, %<vwcore>1, #%2
+   index\t%0.<Vctype>, %<vwcore>1, %<vwcore>2"
 )
 
 ;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range
 ;; of an INDEX instruction.
 (define_insn "*vec_series<mode>_plus"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
-       (plus:SVE_FULL_I
-         (vec_duplicate:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w")
+       (plus:SVE_I
+         (vec_duplicate:SVE_I
            (match_operand:<VEL> 1 "register_operand" "r"))
-         (match_operand:SVE_FULL_I 2 "immediate_operand")))]
+         (match_operand:SVE_I 2 "immediate_operand")))]
   "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])"
   {
     operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]);
-    return "index\t%0.<Vetype>, %<vw>1, #%2";
+    return "index\t%0.<Vctype>, %<vwcore>1, #%2";
   }
 )
 
 (define_insn "@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
        (unspec:SVE_FULL_HSDI
-         [(match_operand:<VPRED> 1 "register_operand" "Upl")
+         [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl")
           (sign_extend:SVE_FULL_HSDI
             (truncate:SVE_PARTIAL_I
               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))]
 (define_insn "@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w")
        (unspec:SVE_FULL_HSDI
-         [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
+         [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl")
           (sign_extend:SVE_FULL_HSDI
             (truncate:SVE_PARTIAL_I
               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")))
 ;; -------------------------------------------------------------------------
 
 (define_insn "add<mode>3"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
-       (plus:SVE_FULL_I
-         (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w")
-         (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
+       (plus:SVE_I
+         (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w, w, w")
+         (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
   "TARGET_SVE"
   "@
    add\t%0.<Vetype>, %0.<Vetype>, #%D2
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9ffe213..d175e1f 100644
@@ -1625,6 +1625,11 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_VNx4HImode:
     /* Partial SVE SI vector.  */
     case E_VNx2SImode:
+    /* Partial SVE HF vectors.  */
+    case E_VNx2HFmode:
+    case E_VNx4HFmode:
+    /* Partial SVE SF vector.  */
+    case E_VNx2SFmode:
       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
 
     case E_VNx16QImode:
@@ -1753,6 +1758,22 @@ aarch64_array_mode_supported_p (machine_mode mode,
   return false;
 }
 
+/* MODE is some form of SVE vector mode.  For data modes, return the number
+   of vector register bits that each element of MODE occupies, such as 64
+   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
+   in a 64-bit container).  For predicate modes, return the number of
+   data bits controlled by each significant predicate bit.  */
+
+static unsigned int
+aarch64_sve_container_bits (machine_mode mode)
+{
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
+                            ? BITS_PER_SVE_VECTOR
+                            : GET_MODE_BITSIZE (mode));
+  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
+}
+
 /* Return the SVE predicate mode to use for elements that have
    ELEM_NBYTES bytes, if such a mode exists.  */
 
@@ -1773,6 +1794,16 @@ aarch64_sve_pred_mode (unsigned int elem_nbytes)
   return opt_machine_mode ();
 }
 
+/* Return the SVE predicate mode that should be used to control
+   SVE mode MODE.  */
+
+machine_mode
+aarch64_sve_pred_mode (machine_mode mode)
+{
+  unsigned int bits = aarch64_sve_container_bits (mode);
+  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
+}
+
 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
 
 static opt_machine_mode
@@ -1780,7 +1811,7 @@ aarch64_get_mask_mode (machine_mode mode)
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
   if (vec_flags & VEC_SVE_DATA)
-    return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode));
+    return aarch64_sve_pred_mode (mode);
 
   return default_get_mask_mode (mode);
 }
@@ -1806,11 +1837,25 @@ aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
 static scalar_int_mode
 aarch64_sve_element_int_mode (machine_mode mode)
 {
-  unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
+  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+                            ? BITS_PER_SVE_VECTOR
+                            : GET_MODE_BITSIZE (mode));
+  unsigned int elt_bits = vector_element_size (vector_bits,
                                               GET_MODE_NUNITS (mode));
   return int_mode_for_size (elt_bits, 0).require ();
 }
 
+/* Return an integer element mode that contains exactly
+   aarch64_sve_container_bits (MODE) bits.  This is wider than
+   aarch64_sve_element_int_mode if MODE is a partial vector,
+   otherwise it's the same.  */
+
+static scalar_int_mode
+aarch64_sve_container_int_mode (machine_mode mode)
+{
+  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
+}
+
 /* Return the integer vector mode associated with SVE mode MODE.
    Unlike related_int_vector_mode, this can handle the case in which
    MODE is a predicate (and thus has a different total size).  */
@@ -1831,6 +1876,37 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
 
+  /* If we're operating on SVE vectors, try to return an SVE mode.  */
+  poly_uint64 sve_nunits;
+  if ((vec_flags & VEC_SVE_DATA)
+      && multiple_p (BYTES_PER_SVE_VECTOR,
+                    GET_MODE_SIZE (element_mode), &sve_nunits))
+    {
+      machine_mode sve_mode;
+      if (maybe_ne (nunits, 0U))
+       {
+         /* Try to find a full or partial SVE mode with exactly
+            NUNITS units.  */
+         if (multiple_p (sve_nunits, nunits)
+             && aarch64_sve_data_mode (element_mode,
+                                       nunits).exists (&sve_mode))
+           return sve_mode;
+       }
+      else
+       {
+         /* Take the preferred number of units from the number of bytes
+            that fit in VECTOR_MODE.  We always start by "autodetecting"
+            a full vector mode with preferred_simd_mode, so vectors
+            chosen here will also be full vector modes.  Then
+            autovectorize_vector_modes tries smaller starting modes
+            and thus smaller preferred numbers of units.  */
+         sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
+         if (aarch64_sve_data_mode (element_mode,
+                                    sve_nunits).exists (&sve_mode))
+           return sve_mode;
+       }
+    }
+
   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
   if ((vec_flags & VEC_ADVSIMD)
       && known_eq (nunits, 0U)
@@ -1907,11 +1983,6 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
     return mode == DImode;
 
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  /* At the moment, partial vector modes are only useful for memory
-     references, but that could change in future.  */
-  if (vec_flags & VEC_PARTIAL)
-    return false;
-
   if (vec_flags & VEC_SVE_PRED)
     return pr_or_ffr_regnum_p (regno);
 
@@ -4015,8 +4086,7 @@ aarch64_expand_sve_ld1rq (rtx dest, rtx src)
     }
 
   machine_mode mode = GET_MODE (dest);
-  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
-  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
   rtx ptrue = aarch64_ptrue_reg (pred_mode);
   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
   return true;
@@ -4037,7 +4107,26 @@ aarch64_expand_sve_const_vector (rtx target, rtx src)
   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
   scalar_mode elt_mode = GET_MODE_INNER (mode);
   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
-  unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+  unsigned int container_bits = aarch64_sve_container_bits (mode);
+  unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
+
+  if (nelts_per_pattern == 1
+      && encoded_bits <= 128
+      && container_bits != elt_bits)
+    {
+      /* We have a partial vector mode and a constant whose full-vector
+        equivalent would occupy a repeating 128-bit sequence.  Build that
+        full-vector equivalent instead, so that we have the option of
+        using LD1RQ and Advanced SIMD operations.  */
+      unsigned int repeat = container_bits / elt_bits;
+      machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
+      rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
+      for (unsigned int i = 0; i < npatterns; ++i)
+       for (unsigned int j = 0; j < repeat; ++j)
+         builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
+      target = aarch64_target_reg (target, full_mode);
+      return aarch64_expand_sve_const_vector (target, builder.build ());
+    }
 
   if (nelts_per_pattern == 1 && encoded_bits == 128)
     {
@@ -4730,8 +4819,7 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
 
   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
-  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
-  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
 
   /* Get the operands in the appropriate modes and emit the instruction.  */
   ptrue = gen_lowpart (pred_mode, ptrue);
@@ -9971,19 +10059,21 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
                          machine_mode mode,
                          secondary_reload_info *sri)
 {
-  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
-     directly by the *aarch64_sve_mov<mode>_[lb]e move patterns.  See the
-     comment at the head of aarch64-sve.md for more details about the
-     big-endian handling.  */
-  if (BYTES_BIG_ENDIAN
-      && reg_class_subset_p (rclass, FP_REGS)
+  /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
+     LDR and STR.  See the comment at the head of aarch64-sve.md for
+     more details about the big-endian handling.  */
+  if (reg_class_subset_p (rclass, FP_REGS)
       && !((REG_P (x) && HARD_REGISTER_P (x))
           || aarch64_simd_valid_immediate (x, NULL))
-      && mode != VNx16QImode
-      && aarch64_sve_data_mode_p (mode))
+      && mode != VNx16QImode)
     {
-      sri->icode = CODE_FOR_aarch64_sve_reload_be;
-      return NO_REGS;
+      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+      if ((vec_flags & VEC_SVE_DATA)
+         && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+       {
+         sri->icode = CODE_FOR_aarch64_sve_reload_mem;
+         return NO_REGS;
+       }
     }
 
   /* If we have to disable direct literal pool loads and stores because the
@@ -15837,7 +15927,7 @@ static bool
 aarch64_vector_mode_supported_p (machine_mode mode)
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
+  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
 }
 
 /* Return the full-width SVE vector mode for element mode MODE, if one
@@ -15938,29 +16028,72 @@ aarch64_preferred_simd_mode (scalar_mode mode)
 static unsigned int
 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
 {
-  if (TARGET_SVE)
-    modes->safe_push (VNx16QImode);
+  static const machine_mode sve_modes[] = {
+    /* Try using full vectors for all element types.  */
+    VNx16QImode,
+
+    /* Try using 16-bit containers for 8-bit elements and full vectors
+       for wider elements.  */
+    VNx8QImode,
+
+    /* Try using 32-bit containers for 8-bit and 16-bit elements and
+       full vectors for wider elements.  */
+    VNx4QImode,
 
-  /* Try using 128-bit vectors for all element types.  */
-  modes->safe_push (V16QImode);
+    /* Try using 64-bit containers for all element types.  */
+    VNx2QImode
+  };
+
+  static const machine_mode advsimd_modes[] = {
+    /* Try using 128-bit vectors for all element types.  */
+    V16QImode,
+
+    /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
+       for wider elements.  */
+    V8QImode,
+
+    /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
+       for wider elements.
+
+       TODO: We could support a limited form of V4QImode too, so that
+       we use 32-bit vectors for 8-bit elements.  */
+    V4HImode,
+
+    /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
+       for 64-bit elements.
 
-  /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
-     for wider elements.  */
-  modes->safe_push (V8QImode);
+       TODO: We could similarly support limited forms of V2QImode and V2HImode
+       for this case.  */
+    V2SImode
+  };
 
-  /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
-     for wider elements.
+  /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
+     This is because:
 
-     TODO: We could support a limited form of V4QImode too, so that
-     we use 32-bit vectors for 8-bit elements.  */
-  modes->safe_push (V4HImode);
+     - If we can't use N-byte Advanced SIMD vectors then the placement
+       doesn't matter; we'll just continue as though the Advanced SIMD
+       entry didn't exist.
 
-  /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
-     for 64-bit elements.
+     - If an SVE main loop with N bytes ends up being cheaper than an
+       Advanced SIMD main loop with N bytes then by default we'll replace
+       the Advanced SIMD version with the SVE one.
 
-     TODO: We could similarly support limited forms of V2QImode and V2HImode
-     for this case.  */
-  modes->safe_push (V2SImode);
+     - If an Advanced SIMD main loop with N bytes ends up being cheaper
+       than an SVE main loop with N bytes then by default we'll try to
+       use the SVE loop to vectorize the epilogue instead.  */
+  unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
+  unsigned int advsimd_i = 0;
+  while (advsimd_i < ARRAY_SIZE (advsimd_modes))
+    {
+      if (sve_i < ARRAY_SIZE (sve_modes)
+         && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
+                      GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
+       modes->safe_push (sve_modes[sve_i++]);
+      else
+       modes->safe_push (advsimd_modes[advsimd_i++]);
+    }
+  while (sve_i < ARRAY_SIZE (sve_modes))
+    modes->safe_push (sve_modes[sve_i++]);
 
   unsigned int flags = 0;
   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
@@ -16507,7 +16640,14 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
        return false;
 
       if (info)
-       *info = simd_immediate_info (elt_mode, base, step);
+       {
+         /* Get the corresponding container mode.  E.g. an INDEX on V2SI
+            should yield two integer values per 128-bit block, meaning
+            that we need to treat it in the same way as V2DI and then
+            ignore the upper 32 bits of each element.  */
+         elt_mode = aarch64_sve_container_int_mode (mode);
+         *info = simd_immediate_info (elt_mode, base, step);
+       }
       return true;
     }
   else if (GET_CODE (op) == CONST_VECTOR
@@ -16976,9 +17116,9 @@ aarch64_simd_vector_alignment (const_tree type)
      direct way we have of identifying real SVE predicate types.  */
   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
     return 16;
-  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
-    return 128;
-  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
+  widest_int min_size
+    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
+  return wi::umin (min_size, 128).to_uhwi ();
 }
 
 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
@@ -19154,7 +19294,7 @@ aarch64_evpc_sel (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
 
   rtx_vector_builder builder (pred_mode, n_patterns, 2);
   for (int i = 0; i < n_patterns * 2; i++)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fc27179..4c9035f 100644
                                     VNx4HI VNx2HI
                                     VNx2SI])
 
+;; All SVE vector modes.
+(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
+                              VNx8HI VNx4HI VNx2HI
+                              VNx8HF VNx4HF VNx2HF
+                              VNx4SI VNx2SI
+                              VNx4SF VNx2SF
+                              VNx2DI
+                              VNx2DF])
+
+;; All SVE integer vector modes.
+(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
+                            VNx8HI VNx4HI VNx2HI
+                            VNx4SI VNx2SI
+                            VNx2DI])
+
 ;; Modes involved in extending or truncating SVE data, for 8 elements per
 ;; 128-bit block.
 (define_mode_iterator VNx8_NARROW [VNx8QI])
                           (HI   "")])
 
 ;; Mode-to-individual element type mapping.
-(define_mode_attr Vetype [(V8QI "b") (V16QI "b") (VNx16QI "b") (VNx16BI "b")
-                         (V4HI "h") (V8HI  "h") (VNx8HI  "h") (VNx8BI  "h")
-                         (V2SI "s") (V4SI  "s") (VNx4SI  "s") (VNx4BI  "s")
-                         (V2DI "d")             (VNx2DI  "d") (VNx2BI  "d")
-                         (V4HF "h") (V8HF  "h") (VNx8HF  "h")
-                         (V2SF "s") (V4SF  "s") (VNx4SF  "s")
-                         (V2DF "d")             (VNx2DF  "d")
-                         (HF   "h")
-                         (SF   "s") (DF  "d")
-                         (QI "b")   (HI "h")
-                         (SI "s")   (DI "d")])
+(define_mode_attr Vetype [(V8QI "b") (V16QI "b")
+                         (V4HI "h") (V8HI  "h")
+                         (V2SI "s") (V4SI  "s")
+                         (V2DI "d")
+                         (V4HF "h") (V8HF  "h")
+                         (V2SF "s") (V4SF  "s")
+                         (V2DF "d")
+                         (VNx16BI "b") (VNx8BI "h") (VNx4BI "s") (VNx2BI "d")
+                         (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+                         (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+                         (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+                         (VNx4SI "s") (VNx2SI "s")
+                         (VNx4SF "s") (VNx2SF "s")
+                         (VNx2DI "d")
+                         (VNx2DF "d")
+                         (HF "h")
+                         (SF "s") (DF "d")
+                         (QI "b") (HI "h")
+                         (SI "s") (DI "d")])
 
 ;; Like Vetype, but map to types that are a quarter of the element size.
 (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
 
 ;; Equivalent of "size" for a vector element.
-(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI  "b")
-                         (VNx4QI  "b") (VNx2QI  "b")
-                         (VNx8HI  "h") (VNx4HI  "h")
-                         (VNx2HI  "h") (VNx8HF  "h")
-                         (VNx4SI  "w") (VNx2SI  "w") (VNx4SF  "w")
-                         (VNx2DI  "d") (VNx2DF  "d")
+(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+                         (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+                         (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+                         (VNx4SI "w") (VNx2SI "w")
+                         (VNx4SF "w") (VNx2SF "w")
+                         (VNx2DI "d")
+                         (VNx2DF "d")
                          (VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
                          (VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
                          (VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
                          (VNx4DI  "d") (VNx6DI  "d") (VNx8DI  "d")
                          (VNx4DF  "d") (VNx6DF  "d") (VNx8DF  "d")])
 
+;; The Z register suffix for an SVE mode's element container, i.e. the
+;; Vetype of full SVE modes that have the same number of elements.
+(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
+                         (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
+                         (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
+                         (VNx4SI "s") (VNx2SI "d")
+                         (VNx4SF "s") (VNx2SF "d")
+                         (VNx2DI "d")
+                         (VNx2DF "d")])
+
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
 ;; instructions.  stype is defined to represent scheduling type
                          (SI   "8b")  (SF    "8b")])
 
 ;; Define element mode for each vector mode.
-(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI") (VNx16QI "QI")
-                       (V4HI "HI") (V8HI  "HI") (VNx8HI  "HI")
-                       (V2SI "SI") (V4SI  "SI") (VNx4SI  "SI")
-                       (DI   "DI") (V2DI  "DI") (VNx2DI  "DI")
-                       (V4HF "HF") (V8HF  "HF") (VNx8HF  "HF")
-                       (V2SF "SF") (V4SF  "SF") (VNx4SF  "SF")
-                       (DF   "DF") (V2DF  "DF") (VNx2DF  "DF")
-                       (SI   "SI") (HI    "HI")
-                       (QI   "QI")])
+(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
+                      (V4HI "HI") (V8HI  "HI")
+                      (V2SI "SI") (V4SI  "SI")
+                      (DI   "DI") (V2DI  "DI")
+                      (V4HF "HF") (V8HF  "HF")
+                      (V2SF "SF") (V4SF  "SF")
+                      (DF   "DF") (V2DF  "DF")
+                      (SI   "SI") (HI    "HI")
+                      (QI   "QI")
+                      (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
+                      (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
+                      (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
+                      (VNx4SI "SI") (VNx2SI "SI")
+                      (VNx4SF "SF") (VNx2SF "SF")
+                      (VNx2DI "DI")
+                      (VNx2DF "DF")])
 
 ;; Define element mode for each vector mode (lower case).
-(define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi")
-                       (V4HI "hi") (V8HI "hi") (VNx8HI  "hi")
-                       (V2SI "si") (V4SI "si") (VNx4SI  "si")
-                       (DI "di")   (V2DI "di") (VNx2DI  "di")
-                       (V4HF "hf") (V8HF "hf") (VNx8HF  "hf")
-                       (V2SF "sf") (V4SF "sf") (VNx4SF  "sf")
-                       (V2DF "df") (DF "df")   (VNx2DF  "df")
-                       (SI   "si") (HI   "hi")
-                       (QI   "qi")])
+(define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
+                      (V4HI "hi") (V8HI "hi")
+                      (V2SI "si") (V4SI "si")
+                      (DI   "di") (V2DI "di")
+                      (V4HF "hf") (V8HF "hf")
+                      (V2SF "sf") (V4SF "sf")
+                      (V2DF "df") (DF   "df")
+                      (SI   "si") (HI   "hi")
+                      (QI   "qi")
+                      (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
+                      (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
+                      (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
+                      (VNx4SI "si") (VNx2SI "si")
+                      (VNx4SF "sf") (VNx2SF "sf")
+                      (VNx2DI "di")
+                      (VNx2DF "df")])
 
 ;; Element mode with floating-point values replaced by like-sized integers.
 (define_mode_attr VEL_INT [(VNx16QI "QI")
                             (V4SF "2s")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
-(define_mode_attr vw [(V8QI "w") (V16QI "w") (VNx16QI "w")
-                     (V4HI "w") (V8HI "w") (VNx8HI "w")
-                     (V2SI "w") (V4SI "w") (VNx4SI "w")
-                     (DI   "x") (V2DI "x") (VNx2DI "x")
-                     (VNx8HF "h")
-                     (V2SF "s") (V4SF "s") (VNx4SF "s")
-                     (V2DF "d") (VNx2DF "d")])
+(define_mode_attr vw [(V8QI "w") (V16QI "w")
+                     (V4HI "w") (V8HI "w")
+                     (V2SI "w") (V4SI "w")
+                     (DI   "x") (V2DI "x")
+                     (V2SF "s") (V4SF "s")
+                     (V2DF "d")])
 
 ;; Corresponding core element mode for each vector mode.  This is a
 ;; variation on <vw> mapping FP modes to GP regs.
-(define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w")
-                         (V4HI "w") (V8HI "w") (VNx8HI "w")
-                         (V2SI "w") (V4SI "w") (VNx4SI "w")
-                         (DI   "x") (V2DI "x") (VNx2DI "x")
-                         (V4HF "w") (V8HF "w") (VNx8HF "w")
-                         (V2SF "w") (V4SF "w") (VNx4SF "w")
-                         (V2DF "x") (VNx2DF "x")])
+(define_mode_attr vwcore [(V8QI "w") (V16QI "w")
+                         (V4HI "w") (V8HI "w")
+                         (V2SI "w") (V4SI "w")
+                         (DI   "x") (V2DI "x")
+                         (V4HF "w") (V8HF "w")
+                         (V2SF "w") (V4SF "w")
+                         (V2DF "x")
+                         (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
+                         (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
+                         (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
+                         (VNx4SI "w") (VNx2SI "w")
+                         (VNx4SF "w") (VNx2SF "w")
+                         (VNx2DI "x")
+                         (VNx2DF "x")])
 
 ;; Double vector types for ALLX.
 (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")])
 
 ;; The predicate mode associated with an SVE data mode.  For structure modes
 ;; this is equivalent to the <VPRED> of the subvector mode.
-(define_mode_attr VPRED [(VNx16QI "VNx16BI")
-                        (VNx8HI "VNx8BI") (VNx8HF "VNx8BI")
-                        (VNx4SI "VNx4BI") (VNx4SF "VNx4BI")
-                        (VNx2DI "VNx2BI") (VNx2DF "VNx2BI")
+(define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
+                        (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
+                        (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
+                        (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
+                        (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
+                        (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
+                        (VNx2DI "VNx2BI")
+                        (VNx2DF "VNx2BI")
                         (VNx32QI "VNx16BI")
                         (VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
                         (VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
                         (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
 
 ;; ...and again in lower case.
-(define_mode_attr vpred [(VNx16QI "vnx16bi")
-                        (VNx8HI "vnx8bi") (VNx8HF "vnx8bi")
-                        (VNx4SI "vnx4bi") (VNx4SF "vnx4bi")
-                        (VNx2DI "vnx2bi") (VNx2DF "vnx2bi")
+(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
+                        (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
+                        (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
+                        (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
+                        (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
+                        (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
+                        (VNx2DI "vnx2bi")
+                        (VNx2DF "vnx2bi")
                         (VNx32QI "vnx16bi")
                         (VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
                         (VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8b48698..57505e9 100644
@@ -1,5 +1,13 @@
 2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * gcc.target/aarch64/sve/mixed_size_1.c: New test.
+       * gcc.target/aarch64/sve/mixed_size_2.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_3.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_4.c: Likewise.
+       * gcc.target/aarch64/sve/mixed_size_5.c: Likewise.
+
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
        * gcc.target/aarch64/sve/clastb_8.c: Use assembly tests to
        check for fully-masked loops.
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c
new file mode 100644
index 0000000..a5659b6
--- /dev/null
@@ -0,0 +1,39 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 *restrict src2,      \
+                      int n)                                           \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = src2[i];                                              \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c
new file mode 100644
index 0000000..34b58e3
--- /dev/null
@@ -0,0 +1,41 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, int n)                     \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = 1;                                                    \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #1\.0} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #1\.0} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c
new file mode 100644
index 0000000..9ae3e7b
--- /dev/null
@@ -0,0 +1,41 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 src2, int n)         \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = src2;                                                 \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, h0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, s0\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c
new file mode 100644
index 0000000..4c475fb
--- /dev/null
@@ -0,0 +1,43 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 n)                   \
+  {                                                                    \
+    for (TYPE2 i = 0; i < n; ++i)                                      \
+      {                                                                        \
+       dst1[i] += src1[i];                                             \
+       dst2[i] = i;                                                    \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint64_t, uint32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-not {\tindex\tz[0-9]+\.b,} } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.h, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, #0, #1\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tcntb\t} } } */
+/* { dg-final { scan-assembler-times {\tcnth\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntw\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntd\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c b/gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c
new file mode 100644
index 0000000..83be00f
--- /dev/null
@@ -0,0 +1,42 @@
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)                                                \
+  void                                                                 \
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,     \
+                      TYPE2 *restrict dst2, TYPE2 *restrict src2,      \
+                      int n)                                           \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      {                                                                        \
+       dst1[i * 2] = src1[i * 2] + 1;                                  \
+       dst1[i * 2 + 1] = src1[i * 2 + 1] + 1;                          \
+       dst2[i * 2] = 2;                                                \
+       dst2[i * 2 + 1] = 3;                                            \
+      }                                                                        \
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */