From 89c52e5e2ca6e63b1dc0868893d05434d61a0c33 Mon Sep 17 00:00:00 2001
From: Tamar Christina
Date: Thu, 5 Jul 2018 10:31:04 +0000
Subject: [PATCH] Simplify movmem code by always doing overlapping copies when
 larger than 8 bytes on AArch64.

This changes the movmem code in AArch64 that copies data between 4 and
7 bytes to use the smallest possible mode capable of copying the
remaining bytes in one go, overlapping the reads if needed.  This means
that if we are copying 5 bytes we now issue an SImode and a QImode load
instead of two overlapping SImode loads.

This performs smaller memory accesses, but it also gives the mid-end a
chance to realise that it can CSE the loads in certain circumstances,
e.g. when you have something like

    return foo;

where foo is a struct.  The mid-end transforms this into SSA form as

    D.XXXX = foo;
    return D.XXXX;

This movmem routine handles the first copy, which is usually not
needed: for the 5-byte example the mid-end will perform the SImode and
QImode stores into x0 itself, but unless the first copy is done in
those same modes it cannot tell that it does not need the stores at
all.

From-SVN: r262434
---
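Note for reviewers, illustrative only and not part of the patch: the
overlapping strategy used by the new expansion loop, written as plain C.
The name copy_with_overlap is made up for this sketch, and each
fixed-size memcpy stands in for the single load/store pair the backend
emits for the corresponding mode (QImode/HImode/SImode/DImode/TImode
for 1/2/4/8/16 bytes).

#include <stdint.h>
#include <string.h>

static void
copy_with_overlap (uint8_t *d, const uint8_t *s, size_t n)
{
  while (n > 0)
    {
      /* Largest power-of-two chunk, at most 16 bytes (the TImode case),
         that neither over-reads nor over-writes.  */
      size_t chunk = 16;
      while (chunk > n)
        chunk /= 2;

      memcpy (d, s, chunk);
      d += chunk;
      s += chunk;
      n -= chunk;

      /* Round a residual of 1-7 bytes up to the next power of two and
         back the pointers up so one final chunk covers it, overlapping
         bytes already copied.  The back-up is safe because the tail is
         never larger than the chunk just copied.  */
      size_t tail = 1;
      while (tail < n)
        tail *= 2;
      if (n > 0 && tail > n && tail <= 8)
        {
          d -= tail - n;
          s -= tail - n;
          n = tail;
        }
    }
}

For n = 15 this performs an 8-byte chunk followed by a second 8-byte
chunk shifted back one byte, matching the two overlapping DImode moves
the expansion now emits instead of an 8 + 4 + 2 + 1 decomposition.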
 gcc/ChangeLog                                 |   4 +
 gcc/config/aarch64/aarch64.c                  | 112 ++++++++------------------
 gcc/testsuite/ChangeLog                       |   4 +
 gcc/testsuite/gcc.target/aarch64/struct_cpy.c |  69 ++++++++++++++++
 4 files changed, 112 insertions(+), 77 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/struct_cpy.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1ecd226..f30e83d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina
+
+	* config/aarch64/aarch64.c (aarch64_expand_movmem): Fix mode size.
+
 2018-07-05  Jakub Jelinek
 
 	Revert
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 143f9d0..01f35f8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -16137,26 +16137,29 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
 bool
 aarch64_expand_movmem (rtx *operands)
 {
-  unsigned int n;
+  int n, mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
   rtx base;
+  machine_mode cur_mode = BLKmode, next_mode;
   bool speed_p = !optimize_function_for_size_p (cfun);
 
   /* When optimizing for size, give a better estimate of the length of a
-     memcpy call, but use the default otherwise.  */
-  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
+     memcpy call, but use the default otherwise.  Moves larger than 8 bytes
+     will now always require an even number of instructions, and each
+     operation requires both a load and a store, so divide the maximum by 2.  */
+  int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
 
   /* We can't do anything smart if the amount to copy is not constant.  */
   if (!CONST_INT_P (operands[2]))
     return false;
 
-  n = UINTVAL (operands[2]);
+  n = INTVAL (operands[2]);
 
-  /* Try to keep the number of instructions low.  For cases below 16 bytes we
-     need to make at most two moves.  For cases above 16 bytes it will be one
-     move for each 16 byte chunk, then at most two additional moves.  */
-  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
+  /* Try to keep the number of instructions low.  For all cases we will do
+     at most two moves for the residual amount, since we'll always overlap
+     the remainder.  */
+  if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
     return false;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
@@ -16165,81 +16168,36 @@ aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
-     1-byte chunk.  */
-  if (n < 4)
-    {
-      if (n >= 2)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-	  n -= 2;
-	}
-
-      if (n == 1)
-	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-
-      return true;
-    }
+  /* Convert n to bits to make the rest of the code simpler.  */
+  n = n * BITS_PER_UNIT;
 
-  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
-     4-byte chunk, partially overlapping with the previously copied chunk.  */
-  if (n < 8)
+  while (n > 0)
     {
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-      n -= 4;
-      if (n > 0)
-	{
-	  int move = n - 4;
+      /* Find the largest mode in which to do the copy without over-reading
+	 or over-writing.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
+	  cur_mode = mode_iter.require ();
 
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      return true;
-    }
+      gcc_assert (cur_mode != BLKmode);
 
-  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
-     them, then (if applicable) an 8-byte chunk.  */
-  while (n >= 8)
-    {
-      if (n / 16)
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
-	  n -= 16;
-	}
-      else
-	{
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
-	  n -= 8;
-	}
-    }
+      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
 
-  /* Finish the final bytes of the copy.  We can always do this in one
-     instruction.  We either copy the exact amount we need, or partially
-     overlap with the previous chunk we copied and copy 8-bytes.  */
-  if (n == 0)
-    return true;
-  else if (n == 1)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
-  else if (n == 2)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
-  else if (n == 4)
-    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-  else
-    {
-      if (n == 3)
-	{
-	  src = aarch64_move_pointer (src, -1);
-	  dst = aarch64_move_pointer (dst, -1);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
-	}
-      else
-	{
-	  int move = n - 8;
+      n -= mode_bits;
 
-	  src = aarch64_move_pointer (src, move);
-	  dst = aarch64_move_pointer (dst, move);
-	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
+      /* Do certain trailing copies as overlapping if it's going to be
+	 cheaper, i.e. fewer instructions, to do so.  For instance, for a
+	 15-byte copy it's more efficient to do two overlapping 8-byte
+	 copies than 8 + 4 + 2 + 1.  */
+      next_mode = smallest_mode_for_size (n, MODE_INT);
+      int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
+      if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
+	{
+	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
+	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
+	  n = n_bits;
+	}
     }
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index db1197e..5ff92a4 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-07-05  Tamar Christina
+
+	* gcc.target/aarch64/struct_cpy.c: New.
+
 2018-07-05  Christophe Lyon
 
 	* c-c++-common/unroll-1.c: Remove 'note:' in matching string.
diff --git a/gcc/testsuite/gcc.target/aarch64/struct_cpy.c b/gcc/testsuite/gcc.target/aarch64/struct_cpy.c
new file mode 100644
index 0000000..4feb3ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/struct_cpy.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+
+struct struct1 { char a;};
+struct struct2 { char a, b;};
+struct struct3 { char a, b, c; };
+struct struct4 { char a, b, c, d; };
+struct struct5 { char a, b, c, d, e; };
+struct struct6 { char a, b, c, d, e, f; };
+struct struct7 { char a, b, c, d, e, f, g; };
+struct struct8 { char a, b, c, d, e, f, g, h; };
+struct struct9 { char a, b, c, d, e, f, g, h, i; };
+struct struct10 { char a, b, c, d, e, f, g, h, i, j; };
+struct struct11 { char a, b, c, d, e, f, g, h, i, j, k; };
+struct struct12 { char a, b, c, d, e, f, g, h, i, j, k, l; };
+struct struct13 { char a, b, c, d, e, f, g, h, i, j, k, l, m; };
+struct struct14 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n; };
+struct struct15 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o; };
+struct struct16 { char a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; };
+
+struct struct1 foo1 = {'1'};
+struct struct2 foo2 = { 'a', 'b'};
+struct struct3 foo3 = { 'A', 'B', 'C'};
+struct struct4 foo4 = {'1', '2', '3', '4'};
+struct struct5 foo5 = {'a', 'b', 'c', 'd', 'e'};
+struct struct6 foo6 = {'A', 'B', 'C', 'D', 'E', 'F'};
+struct struct7 foo7 = {'1', '2', '3', '4', '5', '6', '7'};
+struct struct8 foo8 = {'1', '2', '3', '4', '5', '6', '7', '8'};
+struct struct9 foo9 = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'};
+struct struct10 foo10 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'};
+struct struct11 foo11 = {
+  '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B'};
+struct struct12 foo12 = {
+  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'};
+struct struct13 foo13 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m'};
+struct struct14 foo14 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n'};
+struct struct15 foo15 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'};
+struct struct16 foo16 = {
+  'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p'};
+
+#define FUN(x) void fun##x ()\
+{ \
+  volatile struct struct##x var##x = foo##x; \
+}
+
+FUN(1);
+FUN(2);
+FUN(3);
+FUN(4);
+FUN(5);
+FUN(6);
+FUN(7);
+FUN(8);
+FUN(9);
+FUN(10);
+FUN(11);
+FUN(12);
+FUN(13);
+FUN(14);
+FUN(15);
+FUN(16);
+
+/* { dg-final { scan-assembler-times {ldr\s} 18 } } */
+/* { dg-final { scan-assembler-times {ldrb} 4 } } */
+/* { dg-final { scan-assembler-times {ldrh} 4 } } */
+/* { dg-final { scan-assembler-times {ldp} 1 } } */
-- 
2.7.4
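Postscript, illustrative and not part of the patch: a minimal harness
for the copy_with_overlap sketch in the note above.  It assumes that
sketch (with its <stdint.h> and <string.h> includes) is in the same
translation unit, and checks every size the new test exercises against
the source pattern.

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  static const uint8_t pattern[16] =
    { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };

  /* Sizes 1-16 mirror struct1 through struct16 in struct_cpy.c.  */
  for (size_t n = 1; n <= 16; n++)
    {
      uint8_t dst[16] = { 0 };
      copy_with_overlap (dst, pattern, n);
      assert (memcmp (dst, pattern, n) == 0);
    }

  printf ("all sizes copied correctly\n");
  return 0;
}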