From 8489e1f45b50600c01eb8ed8c5d0ca914ded281c Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 18 Nov 2019 15:26:55 +0000 Subject: [PATCH] Optimise WAR and WAW alias checks For: void f1 (int *x, int *y) { for (int i = 0; i < 32; ++i) x[i] += y[i]; } we checked at runtime whether one vector at x would overlap one vector at y. But in cases like this, the vector code would handle x <= y just fine, since any write to address A still happens after any read from address A. The only problem is if x is ahead of y by less than a vector. The same is true for two writes: void f2 (int *x, int *y) { for (int i = 0; i < 32; ++i) { x[i] = i; y[i] = 2; } } if y <= x then a vector write at y after a vector write at x would have the same net effect as the original scalar writes. This patch optimises the alias checks for these two cases. E.g., before the patch, f1 used: add x2, x0, 15 sub x2, x2, x1 cmp x2, 30 bls .L2 whereas after the patch it uses: add x2, x1, 4 sub x2, x0, x2 cmp x2, 8 bls .L2 Read-after-write cases like: int f3 (int *x, int *y) { int res = 0; for (int i = 0; i < 32; ++i) { x[i] = i; res += y[i]; } return res; } can cope with x == y, but otherwise don't allow overlap in either direction. Since checking for x == y at runtime would require extra code, we're probably better off sticking with the current overlap test. An overlap test is also needed if the scalar or vector accesses covered by the alias check are mixed together, rather than all statements for the second access following all statements for the first access. The new code for gcc.target/aarch64/sve/var_strict_[135].c is slightly better than before. 2019-11-18 Richard Sandiford gcc/ * tree-data-ref.c (create_intersect_range_checks_index): If the alias pair describes simple WAW and WAR dependencies, just check whether the first B access overlaps later A accesses. (create_waw_or_war_checks): New function that performs the same optimization on addresses. (create_intersect_range_checks): Call it. gcc/testsuite/ * gcc.dg/vect/vect-alias-check-8.c: Expect WAR/WAW checks to be used. * gcc.dg/vect/vect-alias-check-14.c: Likewise. * gcc.dg/vect/vect-alias-check-15.c: Likewise. * gcc.dg/vect/vect-alias-check-18.c: Likewise. * gcc.dg/vect/vect-alias-check-19.c: Likewise. * gcc.target/aarch64/sve/var_stride_1.c: Update expected sequence. * gcc.target/aarch64/sve/var_stride_2.c: Likewise. * gcc.target/aarch64/sve/var_stride_3.c: Likewise. * gcc.target/aarch64/sve/var_stride_5.c: Likewise. From-SVN: r278409 --- gcc/ChangeLog | 9 + gcc/testsuite/ChangeLog | 12 ++ gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c | 2 +- gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c | 2 +- .../gcc.target/aarch64/sve/var_stride_1.c | 16 +- .../gcc.target/aarch64/sve/var_stride_2.c | 2 +- .../gcc.target/aarch64/sve/var_stride_3.c | 17 +- .../gcc.target/aarch64/sve/var_stride_5.c | 17 +- gcc/tree-data-ref.c | 218 ++++++++++++++++++++- 12 files changed, 260 insertions(+), 41 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index eac0410..6ca0c57 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,14 @@ 2019-11-18 Richard Sandiford + * tree-data-ref.c (create_intersect_range_checks_index): If the + alias pair describes simple WAW and WAR dependencies, just check + whether the first B access overlaps later A accesses. + (create_waw_or_war_checks): New function that performs the same + optimization on addresses. + (create_intersect_range_checks): Call it. + +2019-11-18 Richard Sandiford + * lra-constraints.c (valid_address_p): Take the operand and a constraint as argument. If the operand is a MEM and the constraint is a memory constraint, check whether the eliminated form of the diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 163ea21..12a0d08 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,17 @@ 2019-11-18 Richard Sandiford + * gcc.dg/vect/vect-alias-check-8.c: Expect WAR/WAW checks to be used. + * gcc.dg/vect/vect-alias-check-14.c: Likewise. + * gcc.dg/vect/vect-alias-check-15.c: Likewise. + * gcc.dg/vect/vect-alias-check-18.c: Likewise. + * gcc.dg/vect/vect-alias-check-19.c: Likewise. + * gcc.target/aarch64/sve/var_stride_1.c: Update expected sequence. + * gcc.target/aarch64/sve/var_stride_2.c: Likewise. + * gcc.target/aarch64/sve/var_stride_3.c: Likewise. + * gcc.target/aarch64/sve/var_stride_5.c: Likewise. + +2019-11-18 Richard Sandiford + * gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c: Remove XFAIL. * gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c: Likewise. * gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c: Likewise. diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c index 87045c9..1d148a0 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-14.c @@ -60,5 +60,5 @@ main (void) /* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ /* { dg-final { scan-tree-dump-not {flags: [^\n]*ARBITRARY\n} "vect" } } */ -/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */ /* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c index ad8d69e..fbe3f84 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-15.c @@ -57,5 +57,5 @@ main (void) } /* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ -/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */ /* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c index e9fd31a..9d07391 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-18.c @@ -60,5 +60,5 @@ main (void) } /* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ -/* { dg-final { scan-tree-dump "using an index-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ /* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c index 583e296..7c0ff36 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-19.c @@ -58,5 +58,5 @@ main (void) } /* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */ -/* { dg-final { scan-tree-dump "using an index-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ /* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c index 6cd0219..7e5df13 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-check-8.c @@ -60,5 +60,5 @@ main (void) } /* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */ -/* { dg-final { scan-tree-dump "using an index-based overlap test" "vect" } } */ +/* { dg-final { scan-tree-dump "using an index-based WAR/WAW test" "vect" } } */ /* { dg-final { scan-tree-dump-not "using an address-based" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c index 68baba9..40ff2d5 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_1.c @@ -15,13 +15,9 @@ f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ -/* { dg-final { scan-assembler-not {, 1024} } } */ -/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */ -/* Two range checks and a check for n being zero. */ -/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*4 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c index 30f6d26..b8afea7 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c @@ -15,7 +15,7 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m) /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (257-1)*4 rather than (VF-1)*4. */ +/* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4. */ /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */ /* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */ /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c index 70792ff..5ab6859 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_3.c @@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, int n, long m __attribute__((unused))) /* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ -/* Should multiply by (VF-1)*4 rather than (257-1)*4. */ -/* { dg-final { scan-assembler-not {, 1024} } } */ -/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]10} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */ -/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ -/* Two range checks and a check for n being zero. */ -/* { dg-final { scan-assembler {\tcmp\t} } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*4 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c index 688f3be6..93c1141 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_5.c @@ -15,13 +15,10 @@ f (TYPE *x, TYPE *y, long n, long m __attribute__((unused))) /* { dg-final { scan-assembler {\tst1d\tz[0-9]+} } } */ /* { dg-final { scan-assembler {\tldr\td[0-9]+} } } */ /* { dg-final { scan-assembler {\tstr\td[0-9]+} } } */ -/* Should multiply by (VF-1)*8 rather than (257-1)*8. */ -/* { dg-final { scan-assembler-not {, 2048} } } */ -/* { dg-final { scan-assembler-not {\t.bfiz\t} } } */ -/* { dg-final { scan-assembler-not {lsl[^\n]*[, ]11} } } */ -/* { dg-final { scan-assembler {\tcmp\tx[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */ -/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 2 } } */ -/* Two range checks and a check for n being zero. */ -/* { dg-final { scan-assembler {\tcmp\t} } } */ -/* { dg-final { scan-assembler-times {\tccmp\t} 2 } } */ +/* Should use a WAR check that multiplies by (VF-2)*8 rather than + an overlap check that multiplies by (257-1)*4. */ +/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #16\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ +/* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+[^\n]*xzr} 1 } } */ +/* One range check and a check for n being zero. */ +/* { dg-final { scan-assembler-times {\tcmp\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index 4dfa334..bad80e1 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -1805,6 +1805,8 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, abs_step, &niter_access2)) return false; + bool waw_or_war_p = (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) == 0; + unsigned int i; for (i = 0; i < DR_NUM_DIMENSIONS (dr_a.dr); i++) { @@ -1906,16 +1908,57 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, Combining the tests requires limit to be computable in an unsigned form of the index type; if it isn't, we fall back to the usual - pointer-based checks. */ - poly_offset_int limit = (idx_len1 + idx_access1 - 1 - + idx_len2 + idx_access2 - 1); + pointer-based checks. + + We can do better if DR_B is a write and if DR_A and DR_B are + well-ordered in both the original and the new code (see the + comment above the DR_ALIAS_* flags for details). In this case + we know that for each i in [0, n-1], the write performed by + access i of DR_B occurs after access numbers j<=i of DR_A in + both the original and the new code. Any write or anti + dependencies wrt those DR_A accesses are therefore maintained. + + We just need to make sure that each individual write in DR_B does not + overlap any higher-indexed access in DR_A; such DR_A accesses happen + after the DR_B access in the original code but happen before it in + the new code. + + We know the steps for both accesses are equal, so by induction, we + just need to test whether the first write of DR_B overlaps a later + access of DR_A. In other words, we need to move min1 along by + one iteration: + + min1' = min1 + idx_step + + and use the ranges: + + [min1' + low_offset1', min1' + high_offset1' + idx_access1 - 1] + + and: + + [min2, min2 + idx_access2 - 1] + + where: + + low_offset1' = +ve step ? 0 : -(idx_len1 - |idx_step|) + high_offset1' = +ve_step ? idx_len1 - |idx_step| : 0. */ + if (waw_or_war_p) + idx_len1 -= abs_idx_step; + + poly_offset_int limit = idx_len1 + idx_access1 - 1 + idx_access2 - 1; + if (!waw_or_war_p) + limit += idx_len2; + tree utype = unsigned_type_for (TREE_TYPE (min1)); if (!wi::fits_to_tree_p (limit, utype)) return false; poly_offset_int low_offset1 = neg_step ? -idx_len1 : 0; - poly_offset_int high_offset2 = neg_step ? 0 : idx_len2; + poly_offset_int high_offset2 = neg_step || waw_or_war_p ? 0 : idx_len2; poly_offset_int bias = high_offset2 + idx_access2 - 1 - low_offset1; + /* Equivalent to adding IDX_STEP to MIN1. */ + if (waw_or_war_p) + bias -= wi::to_offset (idx_step); tree subject = fold_build2 (MINUS_EXPR, utype, fold_convert (utype, min2), @@ -1931,7 +1974,169 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, *cond_expr = part_cond_expr; } if (dump_enabled_p ()) - dump_printf (MSG_NOTE, "using an index-based overlap test\n"); + { + if (waw_or_war_p) + dump_printf (MSG_NOTE, "using an index-based WAR/WAW test\n"); + else + dump_printf (MSG_NOTE, "using an index-based overlap test\n"); + } + return true; +} + +/* A subroutine of create_intersect_range_checks, with a subset of the + same arguments. Try to optimize cases in which the second access + is a write and in which some overlap is valid. */ + +static bool +create_waw_or_war_checks (tree *cond_expr, + const dr_with_seg_len_pair_t &alias_pair) +{ + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; + + /* Check for cases in which: + + (a) DR_B is always a write; + (b) the accesses are well-ordered in both the original and new code + (see the comment above the DR_ALIAS_* flags for details); and + (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ + if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) + return false; + + /* Check for equal (but possibly variable) steps. */ + tree step = DR_STEP (dr_a.dr); + if (!operand_equal_p (step, DR_STEP (dr_b.dr))) + return false; + + /* Make sure that we can operate on sizetype without loss of precision. */ + tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); + if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) + return false; + + /* All addresses involved are known to have a common alignment ALIGN. + We can therefore subtract ALIGN from an exclusive endpoint to get + an inclusive endpoint. In the best (and common) case, ALIGN is the + same as the access sizes of both DRs, and so subtracting ALIGN + cancels out the addition of an access size. */ + unsigned int align = MIN (dr_a.align, dr_b.align); + poly_uint64 last_chunk_a = dr_a.access_size - align; + poly_uint64 last_chunk_b = dr_b.access_size - align; + + /* Get a boolean expression that is true when the step is negative. */ + tree indicator = dr_direction_indicator (dr_a.dr); + tree neg_step = fold_build2 (LT_EXPR, boolean_type_node, + fold_convert (ssizetype, indicator), + ssize_int (0)); + + /* Get lengths in sizetype. */ + tree seg_len_a + = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); + step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); + + /* Each access has the following pattern: + + <- |seg_len| -> + <--- A: -ve step ---> + +-----+-------+-----+-------+-----+ + | n-1 | ..... | 0 | ..... | n-1 | + +-----+-------+-----+-------+-----+ + <--- B: +ve step ---> + <- |seg_len| -> + | + base address + + where "n" is the number of scalar iterations covered by the segment. + + A is the range of bytes accessed when the step is negative, + B is the range when the step is positive. + + We know that DR_B is a write. We also know (from checking that + DR_A and DR_B are well-ordered) that for each i in [0, n-1], + the write performed by access i of DR_B occurs after access numbers + j<=i of DR_A in both the original and the new code. Any write or + anti dependencies wrt those DR_A accesses are therefore maintained. + + We just need to make sure that each individual write in DR_B does not + overlap any higher-indexed access in DR_A; such DR_A accesses happen + after the DR_B access in the original code but happen before it in + the new code. + + We know the steps for both accesses are equal, so by induction, we + just need to test whether the first write of DR_B overlaps a later + access of DR_A. In other words, we need to move addr_a along by + one iteration: + + addr_a' = addr_a + step + + and check whether: + + [addr_b, addr_b + last_chunk_b] + + overlaps: + + [addr_a' + low_offset_a, addr_a' + high_offset_a + last_chunk_a] + + where [low_offset_a, high_offset_a] spans accesses [1, n-1]. I.e.: + + low_offset_a = +ve step ? 0 : seg_len_a - step + high_offset_a = +ve step ? seg_len_a - step : 0 + + This is equivalent to testing whether: + + addr_a' + low_offset_a <= addr_b + last_chunk_b + && addr_b <= addr_a' + high_offset_a + last_chunk_a + + Converting this into a single test, there is an overlap if: + + 0 <= addr_b + last_chunk_b - addr_a' - low_offset_a <= limit + + where limit = high_offset_a - low_offset_a + last_chunk_a + last_chunk_b + + If DR_A is performed, limit + |step| - last_chunk_b is known to be + less than the size of the object underlying DR_A. We also know + that last_chunk_b <= |step|; this is checked elsewhere if it isn't + guaranteed at compile time. There can therefore be no overflow if + "limit" is calculated in an unsigned type with pointer precision. */ + tree addr_a = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_a.dr), + DR_OFFSET (dr_a.dr)); + addr_a = fold_build_pointer_plus (addr_a, DR_INIT (dr_a.dr)); + + tree addr_b = fold_build_pointer_plus (DR_BASE_ADDRESS (dr_b.dr), + DR_OFFSET (dr_b.dr)); + addr_b = fold_build_pointer_plus (addr_b, DR_INIT (dr_b.dr)); + + /* Advance ADDR_A by one iteration and adjust the length to compensate. */ + addr_a = fold_build_pointer_plus (addr_a, step); + tree seg_len_a_minus_step = fold_build2 (MINUS_EXPR, sizetype, + seg_len_a, step); + if (!CONSTANT_CLASS_P (seg_len_a_minus_step)) + seg_len_a_minus_step = build1 (SAVE_EXPR, sizetype, seg_len_a_minus_step); + + tree low_offset_a = fold_build3 (COND_EXPR, sizetype, neg_step, + seg_len_a_minus_step, size_zero_node); + if (!CONSTANT_CLASS_P (low_offset_a)) + low_offset_a = build1 (SAVE_EXPR, sizetype, low_offset_a); + + /* We could use COND_EXPR , + but it's usually more efficient to reuse the LOW_OFFSET_A result. */ + tree high_offset_a = fold_build2 (MINUS_EXPR, sizetype, seg_len_a_minus_step, + low_offset_a); + + /* The amount added to addr_b - addr_a'. */ + tree bias = fold_build2 (MINUS_EXPR, sizetype, + size_int (last_chunk_b), low_offset_a); + + tree limit = fold_build2 (MINUS_EXPR, sizetype, high_offset_a, low_offset_a); + limit = fold_build2 (PLUS_EXPR, sizetype, limit, + size_int (last_chunk_a + last_chunk_b)); + + tree subject = fold_build2 (POINTER_DIFF_EXPR, ssizetype, addr_b, addr_a); + subject = fold_build2 (PLUS_EXPR, sizetype, + fold_convert (sizetype, subject), bias); + + *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); return true; } @@ -2035,6 +2240,9 @@ create_intersect_range_checks (class loop *loop, tree *cond_expr, if (create_intersect_range_checks_index (loop, cond_expr, alias_pair)) return; + if (create_waw_or_war_checks (cond_expr, alias_pair)) + return; + unsigned HOST_WIDE_INT min_align; tree_code cmp_code; /* We don't have to check DR_ALIAS_MIXED_STEPS here, since both versions -- 2.7.4