From cd676dfa57e643a4f7d8445e6ebad0f21cf3fd84 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 16 Dec 2020 13:08:07 +0100 Subject: [PATCH] bswap: Handle vector CONSTRUCTORs [PR96239] The following patch teaches the bswap pass to handle a CONSTRUCTOR for small (2/4/8 byte long) vectors by determining if the bytes of the constructor come from non-vector sources and are either nop or bswap and changing the CONSTRUCTOR in that case to VIEW_CONVERT_EXPR from scalar integer to the vector type. Unfortunately, as I found after the patch was written, due to pass ordering this doesn't really fix the original testcase, just the one I wrote, because both loop and slp vectorization is done only after the bswap pass. A possible way out of that would be to perform just this particular bswap optimization (i.e. for CONSTRUCTOR assignments with integral vector types call find_bswap_or_nop and bswap_replace if successful) also during the store merging pass; it isn't really a store, but the store merging pass already performs bswapping when handling stores, so it wouldn't be that big a hack. What do you think? 2020-12-16 Jakub Jelinek PR tree-optimization/96239 * gimple-ssa-store-merging.c (find_bswap_or_nop): Handle a vector CONSTRUCTOR. (bswap_replace): Likewise. * gcc.dg/pr96239.c: New test. 
--- gcc/gimple-ssa-store-merging.c | 91 +++++++++++++++++++++++++++++++++++++++--- gcc/testsuite/gcc.dg/pr96239.c | 54 +++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr96239.c diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index 17a4250..2894205f 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -865,7 +865,66 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit); if (!ins_stmt) - return NULL; + { + if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR + || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN) + return NULL; + unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT; + if (sz != 16 && sz != 32 && sz != 64) + return NULL; + tree rhs = gimple_assign_rhs1 (stmt); + tree eltype = TREE_TYPE (TREE_TYPE (rhs)); + unsigned HOST_WIDE_INT eltsz + = int_size_in_bytes (eltype) * BITS_PER_UNIT; + if (TYPE_PRECISION (eltype) != eltsz) + return NULL; + constructor_elt *elt; + unsigned int i; + tree type = build_nonstandard_integer_type (sz, 1); + FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt) + { + if (TREE_CODE (elt->value) != SSA_NAME + || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value))) + return NULL; + struct symbolic_number n1; + gimple *source_stmt + = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1, + limit - 1); + + if (!source_stmt) + return NULL; + + n1.type = type; + if (!n1.base_addr) + n1.range = sz / BITS_PER_UNIT; + + if (i == 0) + { + ins_stmt = source_stmt; + *n = n1; + } + else + { + if (n->vuse != n1.vuse) + return NULL; + + struct symbolic_number n0 = *n; + + if (!BYTES_BIG_ENDIAN) + { + if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz)) + return NULL; + } + else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz)) + return NULL; + ins_stmt + = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n); + + if (!ins_stmt) + return NULL; 
+ } + } + } uint64_t cmpxchg, cmpnop; find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop); @@ -939,11 +998,18 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, { tree src, tmp, tgt = NULL_TREE; gimple *bswap_stmt; + tree_code conv_code = NOP_EXPR; gimple *cur_stmt = gsi_stmt (gsi); src = n->src; if (cur_stmt) - tgt = gimple_assign_lhs (cur_stmt); + { + tgt = gimple_assign_lhs (cur_stmt); + if (gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR + && tgt + && VECTOR_TYPE_P (TREE_TYPE (tgt))) + conv_code = VIEW_CONVERT_EXPR; + } /* Need to load the value from memory first. */ if (n->base_addr) @@ -1031,7 +1097,9 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, load_stmt = gimple_build_assign (val_tmp, val_expr); gimple_set_vuse (load_stmt, n->vuse); gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp); + if (conv_code == VIEW_CONVERT_EXPR) + val_tmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), val_tmp); + gimple_assign_set_rhs_with_ops (&gsi, conv_code, val_tmp); update_stmt (cur_stmt); } else if (cur_stmt) @@ -1073,7 +1141,9 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, { if (!is_gimple_val (src)) return NULL_TREE; - g = gimple_build_assign (tgt, NOP_EXPR, src); + if (conv_code == VIEW_CONVERT_EXPR) + src = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), src); + g = gimple_build_assign (tgt, conv_code, src); } else if (cur_stmt) g = gimple_build_assign (tgt, src); @@ -1153,7 +1223,10 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, gimple *convert_stmt; tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst"); - convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp); + tree atmp = tmp; + if (conv_code == VIEW_CONVERT_EXPR) + atmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), tmp); + convert_stmt = gimple_build_assign (tgt, conv_code, atmp); gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT); } @@ -1258,6 
+1331,14 @@ pass_optimize_bswap::execute (function *fun) /* Fall through. */ case BIT_IOR_EXPR: break; + case CONSTRUCTOR: + { + tree rhs = gimple_assign_rhs1 (cur_stmt); + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs)))) + break; + } + continue; default: continue; } diff --git a/gcc/testsuite/gcc.dg/pr96239.c b/gcc/testsuite/gcc.dg/pr96239.c new file mode 100644 index 0000000..8af56e1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr96239.c @@ -0,0 +1,54 @@ +/* PR tree-optimization/96239 */ +/* { dg-do run { target { ilp32 || lp64 } } } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-times " r>> 8;" 1 "optimized" { target bswap } } } */ +/* { dg-final { scan-tree-dump-times " = __builtin_bswap64 " 1 "optimized" { target bswap } } } */ +/* { dg-final { scan-tree-dump-not " >> \(8\|16\|24\|32\|40\|48\|56\);" "optimized" { target bswap } } } */ + +typedef unsigned char V __attribute__((vector_size (2))); +typedef unsigned char W __attribute__((vector_size (8))); + +__attribute__((noipa)) void +foo (unsigned short x, V *p) +{ + *p = (V) { x >> 8, x }; +} + +__attribute__((noipa)) void +bar (unsigned long long x, W *p) +{ + *p = (W) { x >> 56, x >> 48, x >> 40, x >> 32, x >> 24, x >> 16, x >> 8, x }; +} + +__attribute__((noipa)) void +baz (unsigned short x, V *p) +{ + *p = (V) { x, x >> 8 }; +} + +__attribute__((noipa)) void +qux (unsigned long long x, W *p) +{ + *p = (W) { x, x >> 8, x >> 16, x >> 24, x >> 32, x >> 40, x >> 48, x >> 56 }; +} + +int +main () +{ + V a, c, e, g; + W b, d, f, h; + foo (0xcafe, &a); + bar (0xdeadbeefcafebabeULL, &b); + baz (0xdead, &c); + qux (0xfeedbac1beefdeadULL, &d); + e = (V) { 0xca, 0xfe }; + f = (W) { 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe }; + g = (V) { 0xad, 0xde }; + h = (W) { 0xad, 0xde, 0xef, 0xbe, 0xc1, 0xba, 0xed, 0xfe }; + if (__builtin_memcmp (&a, &e, sizeof (V)) + || __builtin_memcmp (&b, &f, sizeof (W)) + || __builtin_memcmp (&c, &g, sizeof (V)) 
+ || __builtin_memcmp (&d, &h, sizeof (W))) + __builtin_abort (); + return 0; +} -- 2.7.4