Add a separate function to calculate cost for WIDEN_MULT_EXPR.

author liuhongt <hongtao.liu@intel.com>

Wed, 28 Jul 2021 08:24:52 +0000 (16:24 +0800)

committer liuhongt <hongtao.liu@intel.com>

Thu, 29 Jul 2021 01:06:24 +0000 (09:06 +0800)
author liuhongt <hongtao.liu@intel.com>
Wed, 28 Jul 2021 08:24:52 +0000 (16:24 +0800)
committer liuhongt <hongtao.liu@intel.com>
Thu, 29 Jul 2021 01:06:24 +0000 (09:06 +0800)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 12ae37e..a0285e6 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19845,6 +19845,44 @@ ix86_vec_cost (machine_mode mode, int cost)
    return cost;
  }
  
+/* Return the cost of a vector widening multiply in MODE, as emulated
+   through vec_widen_<s>mult_hi/lo_<mode> (those patterns are only
+   available for VI124_AVX2).
+
+   COST supplies the per-operation costs that are combined here (mulss,
+   addss and sse_op).  MODE must be an integer vector mode.  UNS_P is
+   true for the unsigned variant of the multiply.
+
+   Modes that have no costing below get a deliberately prohibitive
+   cost instead of aborting the compiler.  */
+static int
+ix86_widen_mult_cost (const struct processor_costs *cost,
+                     enum machine_mode mode, bool uns_p)
+{
+  gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
+  /* Cost charged only for particular mode/signedness combinations.  */
+  int extra_cost = 0;
+  /* Cost charged for every multiply in the given mode.  */
+  int basic_cost = 0;
+  switch (mode)
+    {
+    case V8HImode:
+    case V16HImode:
+      /* The signed variant, and V16HImode regardless of signedness,
+        pay for two additional SSE operations.  */
+      if (!uns_p || mode == V16HImode)
+       extra_cost = cost->sse_op * 2;
+      basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+      break;
+    case V4SImode:
+    case V8SImode:
+      /* pmulhw/pmullw can be used.  */
+      basic_cost = cost->mulss * 2 + cost->sse_op * 2;
+      break;
+    case V2DImode:
+      /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
+        require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
+      if (!TARGET_SSE4_1 && !uns_p)
+       extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+                     + cost->sse_op * 2;
+      /* Fallthru.  */
+    case V4DImode:
+      basic_cost = cost->mulss * 2 + cost->sse_op * 4;
+      break;
+    default:
+      /* Not implemented.  Return a cost high enough to discourage
+        this choice rather than ICE on a mode not costed above.  */
+      return 100;
+    }
+  return ix86_vec_cost (mode, basic_cost + extra_cost);
+}
+
  /* Return cost of multiplication in MODE.  */
  
  static int
@@ -22575,10 +22613,18 @@ ix86_add_stmt_cost (class vec_info *vinfo, void *data, int count,
           break;
  
         case MULT_EXPR:
-       case WIDEN_MULT_EXPR:
+         /* For MULT_HIGHPART_EXPR, x86 only supports pmulhw,
+            take it as MULT_EXPR.  */
         case MULT_HIGHPART_EXPR:
           stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
           break;
+         /* There's no direct instruction for WIDEN_MULT_EXPR,
+            take emulation into account.  */
+       case WIDEN_MULT_EXPR:
+         stmt_cost = ix86_widen_mult_cost (ix86_cost, mode,
+                                           TYPE_UNSIGNED (vectype));
+         break;
+
         case NEGATE_EXPR:
           if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
             stmt_cost = ix86_cost->sse_op;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pr39821.c b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c

new file mode 100644 (file)

index 0000000..bcd4b77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pr39821.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -mno-sse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 5 "vect" } } */
+#include<stdint.h>
+void  /* Multiply ORDER pairs of int8_t elements, storing int16_t results.  */
+vec_widen_smul8 (int16_t* __restrict v3, int8_t *v1, int8_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (int16_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
+
+void  /* Multiply ORDER pairs of uint8_t elements, storing uint16_t results.  */
+vec_widen_umul8(uint16_t* __restrict v3, uint8_t *v1, uint8_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (uint16_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
+
+void  /* Multiply ORDER pairs of int16_t elements, storing int32_t results.  */
+vec_widen_smul16(int32_t* __restrict v3, int16_t *v1, int16_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (int32_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
+
+void  /* Multiply ORDER pairs of uint16_t elements, storing uint32_t results.  */
+vec_widen_umul16(uint32_t* __restrict v3, uint16_t *v1, uint16_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (uint32_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
+
+void  /* Multiply ORDER pairs of int32_t elements, storing int64_t results.  */
+vec_widen_smul32(int64_t* __restrict v3, int32_t *v1, int32_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (int64_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
+
+void  /* Multiply ORDER pairs of uint32_t elements, storing uint64_t results.  */
+vec_widen_umul32(uint64_t* __restrict v3, uint32_t *v1, uint32_t *v2, int order)
+{
+  while (order--)  /* One element per iteration; all three pointers advance.  */
+    *v3++ = (uint64_t) *v1++ * *v2++;  /* The cast applies to *v1 only.  */
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse4-pr39821.c b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c

new file mode 100644 (file)

index 0000000..4456c31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4-pr39821.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O3 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 6 "vect"} } */
+#include "sse2-pr39821.c"
author	liuhongt <hongtao.liu@intel.com>
	Wed, 28 Jul 2021 08:24:52 +0000 (16:24 +0800)
committer	liuhongt <hongtao.liu@intel.com>
	Thu, 29 Jul 2021 01:06:24 +0000 (09:06 +0800)
gcc/config/i386/i386.c		patch \| blob \| history
gcc/testsuite/gcc.target/i386/sse2-pr39821.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/i386/sse4-pr39821.c	[new file with mode: 0644]	patch \| blob