From: Richard Biener Date: Fri, 18 Feb 2022 13:32:14 +0000 (+0100) Subject: target/99881 - x86 vector cost of CTOR from integer regs X-Git-Tag: upstream/12.2.0~1378 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=90d693bdc9d71841f51d68826ffa5bd685d7f0bc;p=platform%2Fupstream%2Fgcc.git target/99881 - x86 vector cost of CTOR from integer regs This uses the now passed SLP node to the vectorizer costing hook to adjust vector construction costs for the cost of moving an integer component from a GPR to a vector register when that's required for building a vector from components. A crucial difference here is whether the component is loaded from memory or extracted from a vector register as in those cases no intermediate GPR is involved. The pr99881.c testcase can be Un-XFAILed with this patch, the pr91446.c testcase now produces scalar code which looks superior to me so I've adjusted it as well. 2022-02-18 Richard Biener PR tree-optimization/104582 PR target/99881 * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Cost GPR to vector register moves for integer vector construction. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c: New. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c: Likewise. * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c: Likewise. * gcc.target/i386/pr99881.c: Un-XFAIL. * gcc.target/i386/pr91446.c: Adjust to not expect vectorization. 
--- diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 0830dbd..b2bf905 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22997,7 +22997,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { @@ -23160,6 +23160,49 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } + else if (kind == vec_construct + && node + && SLP_TREE_DEF_TYPE (node) == vect_external_def + && INTEGRAL_TYPE_P (TREE_TYPE (vectype))) + { + stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + unsigned i; + tree op; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + { + if (TREE_CODE (op) != SSA_NAME + || TREE_VISITED (op)) + continue; + TREE_VISITED (op) = 1; + gimple *def = SSA_NAME_DEF_STMT (op); + tree tem; + if (is_gimple_assign (def) + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)) + && ((tem = gimple_assign_rhs1 (def)), true) + && TREE_CODE (tem) == SSA_NAME + /* A sign-change expands to nothing. */ + && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (def)), + TREE_TYPE (tem))) + def = SSA_NAME_DEF_STMT (tem); + /* When the component is loaded from memory we can directly + move it to a vector register, otherwise we have to go + via a GPR or via vpinsr which involves similar cost. + Likewise with a BIT_FIELD_REF extracting from a vector + register we can hope to avoid using a GPR. 
*/ + if (!is_gimple_assign (def) + || (!gimple_assign_load_p (def) + && (gimple_assign_rhs_code (def) != BIT_FIELD_REF + || !VECTOR_TYPE_P (TREE_TYPE + (TREE_OPERAND (gimple_assign_rhs1 (def), 0)))))) + stmt_cost += ix86_cost->sse_to_integer; + } + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) + if (TREE_CODE (op) == SSA_NAME) + TREE_VISITED (op) = 0; + } if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c new file mode 100644 index 0000000..992a845 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ + +struct S { unsigned long a, b; } s; + +void +foo (unsigned long *a, unsigned long *b) +{ + unsigned long a_ = *a; + unsigned long b_ = *b; + s.a = a_; + s.b = b_; +} + +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c new file mode 100644 index 0000000..7637cdb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ + +struct S { unsigned long a, b; } s; + +void +foo (unsigned long a, unsigned long b) +{ + s.a = a; + s.b = b; +} + +/* { dg-final { scan-tree-dump-not "basic block part vectorized" "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c new file mode 100644 index 0000000..999c490 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { 
dg-additional-options "-msse -fdump-tree-slp2-details" } */ + +struct S { double a, b; } s; + +void +foo (double a, double b) +{ + s.a = a; + s.b = b; +} + +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c new file mode 100644 index 0000000..cc471e1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-4.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ + +struct S { unsigned long a, b; } s; + +void +foo (signed long *a, unsigned long *b) +{ + unsigned long a_ = *a; + unsigned long b_ = *b; + s.a = a_; + s.b = b_; +} + +/* { dg-final { scan-tree-dump "basic block part vectorized" "slp2" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr91446.c b/gcc/testsuite/gcc.target/i386/pr91446.c index 0243ca3..067bf43 100644 --- a/gcc/testsuite/gcc.target/i386/pr91446.c +++ b/gcc/testsuite/gcc.target/i386/pr91446.c @@ -21,4 +21,4 @@ foo (unsigned long long width, unsigned long long height, bar (&t); } -/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */ +/* { dg-final { scan-assembler-times "xmm\[0-9\]" 0 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c index 3e087eb..a1ec1d1b 100644 --- a/gcc/testsuite/gcc.target/i386/pr99881.c +++ b/gcc/testsuite/gcc.target/i386/pr99881.c @@ -1,7 +1,7 @@ /* PR target/99881. */ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-Ofast -march=skylake" } */ -/* { dg-final { scan-assembler-not "xmm\[0-9\]" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not "xmm\[0-9\]" } } */ void foo (int* __restrict a, int n, int c)