Add optabs for accelerating RAW and WAR alias checks
authorRichard Sandiford <richard.sandiford@arm.com>
Mon, 18 Nov 2019 15:36:10 +0000 (15:36 +0000)
committerRichard Sandiford <rsandifo@gcc.gnu.org>
Mon, 18 Nov 2019 15:36:10 +0000 (15:36 +0000)
This patch adds optabs that check whether a read followed by a write
or a write followed by a read can be divided into interleaved byte
accesses without changing the dependencies between the bytes.
This is one of the uses of the SVE2 WHILERW and WHILEWR instructions.
(The instructions can also be used to limit the VF at runtime,
but that's future work.)

2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
* doc/sourcebuild.texi (vect_check_ptrs): Document.
* optabs.def (check_raw_ptrs_optab, check_war_ptrs_optab): New optabs.
* doc/md.texi: Document them.
* internal-fn.def (IFN_CHECK_RAW_PTRS, IFN_CHECK_WAR_PTRS): New
internal functions.
* internal-fn.h (internal_check_ptrs_fn_supported_p): Declare.
* internal-fn.c (check_ptrs_direct): New macro.
(expand_check_ptrs_optab_fn): Likewise.
(direct_check_ptrs_optab_supported_p): Likewise.
(internal_check_ptrs_fn_supported_p): New function.
* tree-data-ref.c: Include internal-fn.h.
(create_ifn_alias_checks): New function.
(create_intersect_range_checks): Use it.
* config/aarch64/iterators.md (SVE2_WHILE_PTR): New int iterator.
(optab, cmp_op): Handle it.
(raw_war, unspec): New int attributes.
* config/aarch64/aarch64.md (UNSPEC_WHILERW, UNSPEC_WHILEWR): New
constants.
* config/aarch64/predicates.md (aarch64_bytes_per_sve_vector_operand):
New predicate.
* config/aarch64/aarch64-sve2.md (check_<raw_war>_ptrs<mode>): New
expander.
(@aarch64_sve2_while<cmp_op><GPI:mode><PRED_ALL:mode>_ptest): New
pattern.

gcc/testsuite/
* lib/target-supports.exp (check_effective_target_vect_check_ptrs):
New procedure.
* gcc.dg/vect/vect-alias-check-14.c: Expect IFN_CHECK_WAR_PTRS to be
used, if available.
* gcc.dg/vect/vect-alias-check-15.c: Likewise.
* gcc.dg/vect/vect-alias-check-16.c: Likewise IFN_CHECK_RAW_PTRS.
* gcc.target/aarch64/sve2/whilerw_1.c: New test.
* gcc.target/aarch64/sve2/whilewr_1.c: Likewise.
* gcc.target/aarch64/sve2/whilewr_2.c: Likewise.

From-SVN: r278414

20 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-sve2.md
gcc/config/aarch64/aarch64.md
gcc/config/aarch64/iterators.md
gcc/config/aarch64/predicates.md
gcc/doc/md.texi
gcc/doc/sourcebuild.texi
gcc/internal-fn.c
gcc/internal-fn.def
gcc/internal-fn.h
gcc/optabs.def
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c
gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c
gcc/testsuite/gcc.dg/vect/vect-alias-check-16.c
gcc/testsuite/gcc.target/aarch64/sve2/whilerw_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve2/whilewr_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve2/whilewr_2.c [new file with mode: 0644]
gcc/testsuite/lib/target-supports.exp
gcc/tree-data-ref.c

index be2fac7..c57e8c4 100644 (file)
@@ -1,5 +1,32 @@
 2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * doc/sourcebuild.texi (vect_check_ptrs): Document.
+       * optabs.def (check_raw_ptrs_optab, check_war_ptrs_optab): New optabs.
+       * doc/md.texi: Document them.
+       * internal-fn.def (IFN_CHECK_RAW_PTRS, IFN_CHECK_WAR_PTRS): New
+       internal functions.
+       * internal-fn.h (internal_check_ptrs_fn_supported_p): Declare.
+       * internal-fn.c (check_ptrs_direct): New macro.
+       (expand_check_ptrs_optab_fn): Likewise.
+       (direct_check_ptrs_optab_supported_p): Likewise.
+       (internal_check_ptrs_fn_supported_p): New function.
+       * tree-data-ref.c: Include internal-fn.h.
+       (create_ifn_alias_checks): New function.
+       (create_intersect_range_checks): Use it.
+       * config/aarch64/iterators.md (SVE2_WHILE_PTR): New int iterator.
+       (optab, cmp_op): Handle it.
+       (raw_war, unspec): New int attributes.
+       * config/aarch64/aarch64.md (UNSPEC_WHILERW, UNSPEC_WHILEWR): New
+       constants.
+       * config/aarch64/predicates.md (aarch64_bytes_per_sve_vector_operand):
+       New predicate.
+       * config/aarch64/aarch64-sve2.md (check_<raw_war>_ptrs<mode>): New
+       expander.
+       (@aarch64_sve2_while<cmp_op><GPI:mode><PRED_ALL:mode>_ptest): New
+       pattern.
+
+2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>
+
        * tree.c (build_vector_from_ctor): Directly return a zero vector for
        empty constructors.
 
index 15142d1..106a9a0 100644 (file)
   }
   [(set_attr "movprfx" "*,yes")]
 )
+
+;; Use WHILERW and WHILEWR to accelerate alias checks.  This is only
+;; possible if the accesses we're checking are exactly the same size
+;; as an SVE vector.
+(define_expand "check_<raw_war>_ptrs<mode>"
+  [(match_operand:GPI 0 "register_operand")
+   (unspec:VNx16BI
+     [(match_operand:GPI 1 "register_operand")
+      (match_operand:GPI 2 "register_operand")
+      (match_operand:GPI 3 "aarch64_bytes_per_sve_vector_operand")
+      (match_operand:GPI 4 "const_int_operand")]
+     SVE2_WHILE_PTR)]
+  "TARGET_SVE2"
+{
+  /* Use the widest predicate mode we can.  */
+  unsigned int align = INTVAL (operands[4]);
+  if (align > 8)
+    align = 8;
+  machine_mode pred_mode = aarch64_sve_pred_mode (align).require ();
+
+  /* Emit a WHILERW or WHILEWR, setting the condition codes based on
+     the result.  */
+  emit_insn (gen_aarch64_sve2_while_ptest
+            (<SVE2_WHILE_PTR:unspec>, <MODE>mode, pred_mode,
+             gen_rtx_SCRATCH (pred_mode), operands[1], operands[2],
+             CONSTM1_RTX (VNx16BImode), CONSTM1_RTX (pred_mode)));
+
+  /* Set operand 0 to true if the last bit of the predicate result is set,
+     i.e. if all elements are free of dependencies.  */
+  rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+  rtx cmp = gen_rtx_LTU (<MODE>mode, cc_reg, const0_rtx);
+  emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp, cc_reg));
+  DONE;
+})
+
+;; A WHILERW or WHILEWR in which only the flags result is interesting.
+(define_insn_and_rewrite "@aarch64_sve2_while<cmp_op><GPI:mode><PRED_ALL:mode>_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+       (unspec:CC_NZC
+         [(match_operand 3)
+          (match_operand 4)
+          (const_int SVE_KNOWN_PTRUE)
+          (unspec:PRED_ALL
+            [(match_operand:GPI 1 "register_operand" "r")
+             (match_operand:GPI 2 "register_operand" "r")]
+            SVE2_WHILE_PTR)]
+         UNSPEC_PTEST))
+   (clobber (match_scratch:PRED_ALL 0 "=Upa"))]
+  "TARGET_SVE2"
+  "while<cmp_op>\t%0.<PRED_ALL:Vetype>, %x1, %x2"
+  ;; Force the compiler to drop the unused predicate operand, so that we
+  ;; don't have an unnecessary PTRUE.
+  "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))"
+  {
+    operands[3] = CONSTM1_RTX (VNx16BImode);
+    operands[4] = CONSTM1_RTX (<PRED_ALL:MODE>mode);
+  }
+)
index f19e227..87e9b93 100644 (file)
     UNSPEC_WHILE_LO
     UNSPEC_WHILE_LS
     UNSPEC_WHILE_LT
+    UNSPEC_WHILERW
+    UNSPEC_WHILEWR
     UNSPEC_LDN
     UNSPEC_STN
     UNSPEC_INSR
index bfeebe9..83a0d15 100644 (file)
 (define_int_iterator SVE_WHILE [UNSPEC_WHILE_LE UNSPEC_WHILE_LO
                                UNSPEC_WHILE_LS UNSPEC_WHILE_LT])
 
+(define_int_iterator SVE2_WHILE_PTR [UNSPEC_WHILERW UNSPEC_WHILEWR])
+
 (define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE
                                     UNSPEC_ASHIFTRT_WIDE
                                     UNSPEC_LSHIFTRT_WIDE])
                        (UNSPEC_FEXPA "fexpa")
                        (UNSPEC_FTSMUL "ftsmul")
                        (UNSPEC_FTSSEL "ftssel")
+                       (UNSPEC_WHILERW "vec_check_raw_alias")
+                       (UNSPEC_WHILEWR "vec_check_war_alias")
                        (UNSPEC_COND_FABS "abs")
                        (UNSPEC_COND_FADD "add")
                        (UNSPEC_COND_FCADD90 "cadd90")
                         (UNSPEC_WHILE_LE "le")
                         (UNSPEC_WHILE_LO "lo")
                         (UNSPEC_WHILE_LS "ls")
-                        (UNSPEC_WHILE_LT "lt")])
+                        (UNSPEC_WHILE_LT "lt")
+                        (UNSPEC_WHILERW "rw")
+                        (UNSPEC_WHILEWR "wr")])
 
 (define_int_attr while_optab_cmp [(UNSPEC_WHILE_LE "le")
                                  (UNSPEC_WHILE_LO "ult")
                                  (UNSPEC_WHILE_LS "ule")
                                  (UNSPEC_WHILE_LT "lt")])
 
+(define_int_attr raw_war [(UNSPEC_WHILERW "raw")
+                         (UNSPEC_WHILEWR "war")])
+
 (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
                         (UNSPEC_BRKN "n")
                         (UNSPEC_BRKPA "pa") (UNSPEC_BRKPB "pb")])
                                (UNSPEC_REVB "16")
                                (UNSPEC_REVH "32")
                                (UNSPEC_REVW "64")])
+
+(define_int_attr unspec [(UNSPEC_WHILERW "UNSPEC_WHILERW")
+                        (UNSPEC_WHILEWR "UNSPEC_WHILEWR")])
index 2c5c53c..2323612 100644 (file)
 
 (define_predicate "aarch64_sve_any_binary_operator"
   (match_code "plus,minus,mult,div,udiv,smax,umax,smin,umin,and,ior,xor"))
+
+(define_predicate "aarch64_bytes_per_sve_vector_operand"
+  (and (match_code "const_int,const_poly_int")
+       (match_test "known_eq (wi::to_poly_wide (op, mode),
+                             BYTES_PER_SVE_VECTOR)")))
index 87bbeb4..0ad4a00 100644 (file)
@@ -5076,6 +5076,37 @@ for (i = 1; i < GET_MODE_NUNITS (@var{n}); i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{check_raw_ptrs@var{m}} instruction pattern
+@item @samp{check_raw_ptrs@var{m}}
+Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
+a write of @var{len} bytes at @var{a} followed by a read of @var{len} bytes
+at @var{b} can be split into interleaved byte accesses
+@samp{@var{a}[0], @var{b}[0], @var{a}[1], @var{b}[1], @dots{}}
+without affecting the dependencies between the bytes.  Set operand 0
+to true if the split is possible and false otherwise.
+
+Operands 1, 2 and 3 provide the values of @var{a}, @var{b} and @var{len}
+respectively.  Operand 4 is a constant integer that provides the known
+common alignment of @var{a} and @var{b}.  All inputs have mode @var{m}.
+
+This split is possible if:
+
+@smallexample
+@var{a} == @var{b} || @var{a} + @var{len} <= @var{b} || @var{b} + @var{len} <= @var{a}
+@end smallexample
+
+You should only define this pattern if the target has a way of accelerating
+the test without having to do the individual comparisons.
+
+@cindex @code{check_war_ptrs@var{m}} instruction pattern
+@item @samp{check_war_ptrs@var{m}}
+Like @samp{check_raw_ptrs@var{m}}, but with the read and write swapped round.
+The split is possible in this case if:
+
+@smallexample
+@var{b} <= @var{a} || @var{a} + @var{len} <= @var{b}
+@end smallexample
+
 @cindex @code{vec_cmp@var{m}@var{n}} instruction pattern
 @item @samp{vec_cmp@var{m}@var{n}}
 Output a vector comparison.  Operand 0 of mode @var{n} is the destination for
index f3bf66c..a3432bc 100644 (file)
@@ -1487,6 +1487,10 @@ Target supports hardware vectors of @code{long}.
 @item vect_long_long
 Target supports hardware vectors of @code{long long}.
 
+@item vect_check_ptrs
+Target supports the @code{check_raw_ptrs} and @code{check_war_ptrs}
+optabs on vectors.
+
 @item vect_fully_masked
 Target supports fully-masked (also known as fully-predicated) loops,
 so that vector loops can handle partial as well as full vectors.
index 6a878bd..88d52d2 100644 (file)
@@ -118,6 +118,7 @@ init_internal_fns ()
 #define fold_extract_direct { 2, 2, false }
 #define fold_left_direct { 1, 1, false }
 #define mask_fold_left_direct { 1, 1, false }
+#define check_ptrs_direct { 0, 0, false }
 
 const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
 #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -3006,6 +3007,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 3)
 
+#define expand_check_ptrs_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 4)
+
 /* RETURN_TYPE and ARGS are a return type and argument list that are
    in principle compatible with FN (which satisfies direct_internal_fn_p).
    Return the types that should be used to determine whether the
@@ -3095,6 +3099,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
+#define direct_check_ptrs_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
@@ -3572,6 +3577,24 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
          && insn_operand_matches (icode, 3 + output_ops, GEN_INT (scale)));
 }
 
+/* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
+   for pointers of type TYPE when the accesses have LENGTH bytes and their
+   common byte alignment is ALIGN.  */
+
+bool
+internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
+                                   poly_uint64 length, unsigned int align)
+{
+  machine_mode mode = TYPE_MODE (type);
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+  if (icode == CODE_FOR_nothing)
+    return false;
+  rtx length_rtx = immed_wide_int_const (length, mode);
+  return (insn_operand_matches (icode, 3, length_rtx)
+         && insn_operand_matches (icode, 4, GEN_INT (align)));
+}
+
 /* Expand STMT as though it were a call to internal function FN.  */
 
 void
index a945944..85f45d6 100644 (file)
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3.  If not see
    - cond_ternary: a conditional ternary optab, such as cond_fma_rev<mode>
 
    - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
+   - check_ptrs: used for check_{raw,war}_ptrs
 
    DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
    maps to one of two optabs, depending on the signedness of an input.
@@ -136,6 +137,10 @@ DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
                       vec_mask_store_lanes, mask_store_lanes)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
+                      check_raw_ptrs, check_ptrs)
+DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
+                      check_war_ptrs, check_ptrs)
 
 DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
                       vec_shl_insert, binary)
index 389241a..a1bc081 100644 (file)
@@ -221,6 +221,8 @@ extern int internal_fn_mask_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
                                                    tree, tree, int);
+extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
+                                               poly_uint64, unsigned int);
 
 extern void expand_internal_call (gcall *);
 extern void expand_internal_call (internal_fn, gcall *);
index 90e177a..24d8275 100644 (file)
@@ -429,6 +429,9 @@ OPTAB_D (atomic_xor_optab, "atomic_xor$I$a")
 OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a")
 OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
 
+OPTAB_D (check_raw_ptrs_optab, "check_raw_ptrs$a")
+OPTAB_D (check_war_ptrs_optab, "check_war_ptrs$a")
+
 OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
 OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
index 3a37d94..01e8e2b 100644 (file)
@@ -1,5 +1,17 @@
 2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>
 
+       * lib/target-supports.exp (check_effective_target_vect_check_ptrs):
+       New procedure.
+       * gcc.dg/vect/vect-alias-check-14.c: Expect IFN_CHECK_WAR_PTRS to be
+       used, if available.
+       * gcc.dg/vect/vect-alias-check-15.c: Likewise.
+       * gcc.dg/vect/vect-alias-check-16.c: Likewise IFN_CHECK_RAW_PTRS.
+       * gcc.target/aarch64/sve2/whilerw_1.c: New test.
+       * gcc.target/aarch64/sve2/whilewr_1.c: Likewise.
+       * gcc.target/aarch64/sve2/whilewr_2.c: Likewise.
+
+2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>
+
        * gcc.target/aarch64/sve/acle/asm/ptest_pmore.c: New test.
 
 2019-11-18  Richard Sandiford  <richard.sandiford@arm.com>
index 1d148a0..29bc571 100644 (file)
@@ -60,5 +60,6 @@ main (void)
 
 /* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */
 /* { dg-final { scan-tree-dump-not {flags: [^\n]*ARBITRARY\n} "vect" } } */
-/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_WAR_PTRS test" "vect" { target vect_check_ptrs } } } */
 /* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
index fbe3f84..ad74496 100644 (file)
@@ -57,5 +57,6 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */
-/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_WAR_PTRS test" "vect" { target vect_check_ptrs } } } */
 /* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
index 81c252d..8a9a6ff 100644 (file)
@@ -62,5 +62,6 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */
-/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_RAW_PTRS test" "vect" { target vect_check_ptrs } } } */
 /* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/whilerw_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/whilerw_1.c
new file mode 100644 (file)
index 0000000..63a6d2f
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE)                                \
+  TYPE                                         \
+  test_##TYPE (TYPE *dst, TYPE *src, int n)    \
+  {                                            \
+    TYPE res = 0;                              \
+    for (int i = 0; i < n; ++i)                        \
+      {                                                \
+       dst[i] += 1;                            \
+       res += src[i];                          \
+      }                                                \
+    return res;                                        \
+  }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilerw\t} 4 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.b, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.h, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.s, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.d, x[0-9]+, x1\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilewr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/whilewr_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/whilewr_1.c
new file mode 100644 (file)
index 0000000..e204b37
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE)                                                \
+  void                                                         \
+  test_##TYPE (TYPE *dst, TYPE *src1, TYPE *src2, int n)       \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      dst[i] = src1[i] + src2[i];                              \
+  }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilewr\t} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilerw\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/whilewr_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/whilewr_2.c
new file mode 100644 (file)
index 0000000..0b86991
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE)                                                \
+  void                                                         \
+  test_##TYPE (TYPE *dst1, TYPE *dst2, TYPE *dst3, int n)      \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      {                                                                \
+        dst1[i] = 1;                                           \
+        dst2[i] = 2;                                           \
+        dst3[i] = 3;                                           \
+      }                                                                \
+   }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilewr\t} 12 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilerw\t} } } */
index 54b2fca..08af9f8 100644 (file)
@@ -6459,6 +6459,13 @@ proc check_effective_target_vect_natural_alignment { } {
     return $et_vect_natural_alignment
 }
 
+# Return true if the target supports the check_raw_ptrs and check_war_ptrs
+# optabs on vectors.
+
+proc check_effective_target_vect_check_ptrs { } {
+    return [check_effective_target_aarch64_sve2]
+}
+
 # Return true if fully-masked loops are supported.
 
 proc check_effective_target_vect_fully_masked { } {
index bad80e1..117a14b 100644 (file)
@@ -96,6 +96,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "tree-eh.h"
 #include "ssa.h"
+#include "internal-fn.h"
 
 static struct datadep_stats
 {
@@ -1719,6 +1720,80 @@ prune_runtime_alias_test_list (vec<dr_with_seg_len_pair_t> *alias_pairs,
     }
 }
 
+/* A subroutine of create_intersect_range_checks, with a subset of the
+   same arguments.  Try to use IFN_CHECK_RAW_PTRS and IFN_CHECK_WAR_PTRS
+   to optimize cases in which the references form a simple RAW, WAR or
+   WAW dependence.  */
+
+static bool
+create_ifn_alias_checks (tree *cond_expr,
+                        const dr_with_seg_len_pair_t &alias_pair)
+{
+  const dr_with_seg_len& dr_a = alias_pair.first;
+  const dr_with_seg_len& dr_b = alias_pair.second;
+
+  /* Check for cases in which:
+
+     (a) we have a known RAW, WAR or WAW dependence
+     (b) the accesses are well-ordered in both the original and new code
+        (see the comment above the DR_ALIAS_* flags for details); and
+     (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR.  */
+  if (alias_pair.flags & ~(DR_ALIAS_RAW | DR_ALIAS_WAR | DR_ALIAS_WAW))
+    return false;
+
+  /* Make sure that both DRs access the same pattern of bytes,
+     with a constant length and step.  */
+  poly_uint64 seg_len;
+  if (!operand_equal_p (dr_a.seg_len, dr_b.seg_len, 0)
+      || !poly_int_tree_p (dr_a.seg_len, &seg_len)
+      || maybe_ne (dr_a.access_size, dr_b.access_size)
+      || !operand_equal_p (DR_STEP (dr_a.dr), DR_STEP (dr_b.dr), 0)
+      || !tree_fits_uhwi_p (DR_STEP (dr_a.dr)))
+    return false;
+
+  unsigned HOST_WIDE_INT bytes = tree_to_uhwi (DR_STEP (dr_a.dr));
+  tree addr_a = DR_BASE_ADDRESS (dr_a.dr);
+  tree addr_b = DR_BASE_ADDRESS (dr_b.dr);
+
+  /* See whether the target supports what we want to do.  WAW checks are
+     equivalent to WAR checks here.  */
+  internal_fn ifn = (alias_pair.flags & DR_ALIAS_RAW
+                    ? IFN_CHECK_RAW_PTRS
+                    : IFN_CHECK_WAR_PTRS);
+  unsigned int align = MIN (dr_a.align, dr_b.align);
+  poly_uint64 full_length = seg_len + bytes;
+  if (!internal_check_ptrs_fn_supported_p (ifn, TREE_TYPE (addr_a),
+                                          full_length, align))
+    {
+      full_length = seg_len + dr_a.access_size;
+      if (!internal_check_ptrs_fn_supported_p (ifn, TREE_TYPE (addr_a),
+                                              full_length, align))
+       return false;
+    }
+
+  /* Commit to using this form of test.  */
+  addr_a = fold_build_pointer_plus (addr_a, DR_OFFSET (dr_a.dr));
+  addr_a = fold_build_pointer_plus (addr_a, DR_INIT (dr_a.dr));
+
+  addr_b = fold_build_pointer_plus (addr_b, DR_OFFSET (dr_b.dr));
+  addr_b = fold_build_pointer_plus (addr_b, DR_INIT (dr_b.dr));
+
+  *cond_expr = build_call_expr_internal_loc (UNKNOWN_LOCATION,
+                                            ifn, boolean_type_node,
+                                            4, addr_a, addr_b,
+                                            size_int (full_length),
+                                            size_int (align));
+
+  if (dump_enabled_p ())
+    {
+      if (ifn == IFN_CHECK_RAW_PTRS)
+       dump_printf (MSG_NOTE, "using an IFN_CHECK_RAW_PTRS test\n");
+      else
+       dump_printf (MSG_NOTE, "using an IFN_CHECK_WAR_PTRS test\n");
+    }
+  return true;
+}
+
 /* Try to generate a runtime condition that is true if ALIAS_PAIR is
    free of aliases, using a condition based on index values instead
    of a condition based on addresses.  Return true on success,
@@ -2240,6 +2315,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr,
   if (create_intersect_range_checks_index (loop, cond_expr, alias_pair))
     return;
 
+  if (create_ifn_alias_checks (cond_expr, alias_pair))
+    return;
+
   if (create_waw_or_war_checks (cond_expr, alias_pair))
     return;