PR tree-optimization/92645 (Hand written vector code is 450 times slower when...
author: Richard Biener <rguenther@suse.de>
Thu, 28 Nov 2019 12:22:04 +0000 (12:22 +0000)
committer: Richard Biener <rguenth@gcc.gnu.org>
Thu, 28 Nov 2019 12:22:04 +0000 (12:22 +0000)
2019-11-28  Richard Biener  <rguenther@suse.de>

PR tree-optimization/92645
* tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle
conversions inside a mode class.  Remove restriction on
preserving the element size.
(simplify_vector_constructor): Deal with the above and for
identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR
and VEC_PACK_TRUNC_EXPR.

* gcc.target/i386/pr92645-4.c: New testcase.

From-SVN: r278806

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/i386/pr92645-4.c [new file with mode: 0644]
gcc/tree-ssa-forwprop.c

index b1c4240..d4a66fd 100644 (file)
@@ -1,3 +1,13 @@
+2019-11-28  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/92645
+       * tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle
+       conversions inside a mode class.  Remove restriction on
+       preserving the element size.
+       (simplify_vector_constructor): Deal with the above and for
+       identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR
+       and VEC_PACK_TRUNC_EXPR.
+
 2019-11-28  Georg-Johann Lay  <avr@gjlay.de>
 
        Must use push insn to pass varargs arguments of DFmode because
index 969c8bd..a0fdcd5 100644 (file)
@@ -1,3 +1,8 @@
+2019-11-28  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/92645
+       * gcc.target/i386/pr92645-4.c: New testcase.
+
 2019-11-28  Christophe Lyon  <christophe.lyon@linaro.org>
 
        * gcc.target/arm/asm-flag-4.c: Use -mfloat-abi=softfp.
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c
new file mode 100644 (file)
index 0000000..788a97e
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */
+
+typedef unsigned int u32v4 __attribute__((vector_size(16)));
+typedef unsigned short u16v16 __attribute__((vector_size(32)));
+typedef unsigned char u8v16 __attribute__((vector_size(16)));
+
+union vec128 {
+  u8v16 u8;
+  u32v4 u32;
+};
+
+#define memcpy __builtin_memcpy
+
+static u16v16 zxt(u8v16 x)
+{
+  return (u16v16) {
+    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+  };
+}
+
+static u8v16 narrow(u16v16 x)
+{
+  return (u8v16) {
+    x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+  };
+}
+
+void f(char *dst, char *src, unsigned long n, unsigned c)
+{
+  unsigned ia = 255 - (c >> 24);
+  ia += ia >> 7;
+
+  union vec128 c4 = {0}, ia16 = {0};
+  c4.u32 += c;
+  ia16.u8 += (unsigned char)ia;
+
+  u16v16 c16 = (zxt(c4.u8) << 8) + 128;
+
+  for (; n; src += 16, dst += 16, n -= 4) {
+    union vec128 s;
+    memcpy(&s, src, sizeof s);
+    s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8);
+    memcpy(dst, &s, sizeof s);
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */
+/* We're missing an opportunity to, after later optimizations, combine
+   a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted
+   element.  */
+/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */
index f95b05b..b275a63 100644 (file)
@@ -2004,16 +2004,12 @@ get_bit_field_ref_def (tree val, enum tree_code &conv_code)
     return NULL_TREE;
   enum tree_code code = gimple_assign_rhs_code (def_stmt);
   if (code == FLOAT_EXPR
-      || code == FIX_TRUNC_EXPR)
+      || code == FIX_TRUNC_EXPR
+      || CONVERT_EXPR_CODE_P (code))
     {
       tree op1 = gimple_assign_rhs1 (def_stmt);
       if (conv_code == ERROR_MARK)
-       {
-         if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))),
-                       GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1)))))
-           return NULL_TREE;
-         conv_code = code;
-       }
+       conv_code = code;
       else if (conv_code != code)
        return NULL_TREE;
       if (TREE_CODE (op1) != SSA_NAME)
@@ -2078,9 +2074,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
          && VECTOR_TYPE_P (TREE_TYPE (ref))
          && useless_type_conversion_p (TREE_TYPE (op1),
                                        TREE_TYPE (TREE_TYPE (ref)))
-         && known_eq (bit_field_size (op1), elem_size)
          && constant_multiple_p (bit_field_offset (op1),
-                                 elem_size, &elem)
+                                 bit_field_size (op1), &elem)
          && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts))
        {
          unsigned int j;
@@ -2153,7 +2148,83 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
       if (conv_code != ERROR_MARK
          && !supportable_convert_operation (conv_code, type, conv_src_type,
                                             &conv_code))
-       return false;
+       {
+         /* Only few targets implement direct conversion patterns so try
+            some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR.  */
+         optab optab;
+         tree halfvectype, dblvectype;
+         if (CONVERT_EXPR_CODE_P (conv_code)
+             && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+                 == TYPE_PRECISION (TREE_TYPE (type)))
+             && mode_for_vector (as_a <scalar_mode>
+                                 (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))),
+                                 nelts * 2).exists ()
+             && (dblvectype
+                 = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+                                      nelts * 2))
+             && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type))
+                                              ? VEC_UNPACK_FLOAT_LO_EXPR
+                                              : VEC_UNPACK_LO_EXPR,
+                                              dblvectype,
+                                              optab_default))
+             && (optab_handler (optab, TYPE_MODE (dblvectype))
+                 != CODE_FOR_nothing))
+           {
+             gimple_seq stmts = NULL;
+             tree dbl;
+             if (refnelts == nelts)
+               {
+                 /* ???  Paradoxical subregs don't exist, so insert into
+                    the lower half of a wider zero vector.  */
+                 dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype,
+                                     build_zero_cst (dblvectype), orig[0],
+                                     bitsize_zero_node);
+               }
+             else if (refnelts == 2 * nelts)
+               dbl = orig[0];
+             else
+               dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype,
+                                   orig[0], TYPE_SIZE (dblvectype),
+                                   bitsize_zero_node);
+             gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+             gimple_assign_set_rhs_with_ops (gsi,
+                                             FLOAT_TYPE_P (TREE_TYPE (type))
+                                             ? VEC_UNPACK_FLOAT_LO_EXPR
+                                             : VEC_UNPACK_LO_EXPR,
+                                             dbl);
+           }
+         else if (CONVERT_EXPR_CODE_P (conv_code)
+                  && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0])))
+                      == 2 * TYPE_PRECISION (TREE_TYPE (type)))
+                  && mode_for_vector (as_a <scalar_mode>
+                                        (TYPE_MODE
+                                          (TREE_TYPE (TREE_TYPE (orig[0])))),
+                                      nelts / 2).exists ()
+                  && (halfvectype
+                        = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])),
+                                             nelts / 2))
+                  && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR,
+                                                   halfvectype,
+                                                   optab_default))
+                  && (optab_handler (optab, TYPE_MODE (halfvectype))
+                      != CODE_FOR_nothing))
+           {
+             gimple_seq stmts = NULL;
+             tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+                                      orig[0], TYPE_SIZE (halfvectype),
+                                      bitsize_zero_node);
+             tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype,
+                                      orig[0], TYPE_SIZE (halfvectype),
+                                      TYPE_SIZE (halfvectype));
+             gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+             gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR,
+                                             low, hig);
+           }
+         else
+           return false;
+         update_stmt (gsi_stmt (*gsi));
+         return true;
+       }
       if (nelts != refnelts)
        {
          gassign *lowpart
@@ -2178,9 +2249,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
                       ? perm_type
                       : build_vector_type (TREE_TYPE (perm_type), nelts));
       if (conv_code != ERROR_MARK
-         && (!supportable_convert_operation (conv_code, type, conv_src_type,
-                                             &conv_code)
-             || conv_code == CALL_EXPR))
+         && !supportable_convert_operation (conv_code, type, conv_src_type,
+                                            &conv_code))
        return false;
 
       /* Now that we know the number of elements of the source build the