author    jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4>
          Mon, 7 Nov 2011 16:00:08 +0000 (16:00 +0000)
committer jakub <jakub@138bc75d-0d04-0410-961f-82ee72b054a4>
          Mon, 7 Nov 2011 16:00:08 +0000 (16:00 +0000)

* config/i386/i386.c (ix86_expand_builtin): If gather mask
argument is known to have all high bits set, pass pc_rtx as
second argument to the expander instead of op0.
* config/i386/sse.md (*avx2_gathersi<mode>_2,
*avx2_gatherdi<mode>_2): New patterns.
* config/i386/avx2intrin.h (_mm256_i32gather_pd,
_mm256_i64gather_pd, _mm256_i32gather_ps): Set mask using
_mm256_cmp_pd/_mm256_cmp_ps with zero vector arguments and
_CMP_EQ_OQ instead of _mm256_set1_pd/_mm256_set1_ps.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@181090 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/config/i386/avx2intrin.h
gcc/config/i386/i386.c
gcc/config/i386/sse.md

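For orientation before the diffs: the change relies on the fact that an AVX
ordered-equal compare of a zeroed register with itself produces all-ones
lanes, i.e. every high (sign) bit set.  A minimal standalone sketch of that
idiom (illustration only, not part of the patch):

#include <immintrin.h>

/* 0.0 == 0.0 holds in every lane, and vcmppd writes all-ones for
   true lanes, so this yields a vector with every bit set without
   loading a constant from memory.  */
__m256d
all_ones_mask (void)
{
  __m256d zero = _mm256_setzero_pd ();
  return _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);
}
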
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 0d0db8a..0bfd95a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,15 @@
 2011-11-07  Jakub Jelinek  <jakub@redhat.com>
 
+       * config/i386/i386.c (ix86_expand_builtin): If gather mask
+       argument is known to have all high bits set, pass pc_rtx as
+       second argument to the expander instead of op0.
+       * config/i386/sse.md (*avx2_gathersi<mode>_2,
+       *avx2_gatherdi<mode>_2): New patterns.
+       * config/i386/avx2intrin.h (_mm256_i32gather_pd,
+       _mm256_i64gather_pd, _mm256_i32gather_ps): Set mask using
+       _mm256_cmp_pd/_mm256_cmp_ps with zero vector arguments and
+       _CMP_EQ_OQ instead of _mm256_set1_pd/_mm256_set1_ps.
+
        PR tree-optimization/50789
        * tree-vect-stmts.c (process_use): Add force argument, avoid
        exist_non_indexing_operands_for_use_p check if true.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 3c8f360..12ed05f 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -1252,7 +1252,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_pd (double const *base, __m128i index, const int scale)
 {
   __v4df src = _mm256_setzero_pd ();
-  __v4df mask = _mm256_set1_pd((double)(long long int) -1);
+  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
 
   return (__m256d) __builtin_ia32_gathersiv4df (src,
                                                base,
@@ -1304,7 +1304,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i64gather_pd (double const *base, __m256i index, const int scale)
 {
   __v4df src = _mm256_setzero_pd ();
-  __v4df mask = _mm256_set1_pd((double)(long long int) -1);
+  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
 
   return (__m256d) __builtin_ia32_gatherdiv4df (src,
                                                base,
@@ -1356,7 +1356,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_ps (float const *base, __m256i index, const int scale)
 {
   __v8sf src = _mm256_setzero_ps ();
-  __v8sf mask = _mm256_set1_ps((float)(int) -1);
+  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
 
   return (__m256) __builtin_ia32_gathersiv8sf (src,
                                               base,
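As the expander comment in the i386.c hunk below notes, a register
self-compare is a cheaper way to load all ones into a register than a load
of a constant from memory.  A hypothetical caller of one of the updated
intrinsics (assumed usage, not from this patch):

#include <immintrin.h>

/* Gathers four doubles; the intrinsic's default all-ones mask is
   now built with a self-compare instead of a constant-pool load.  */
__m256d
gather4 (double const *base, __m128i idx)
{
  return _mm256_i32gather_pd (base, idx, 8);
}
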
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4d7d2cf..4461fbb 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -29087,6 +29087,71 @@ rdrand_step:
           error ("last argument must be scale 1, 2, 4, 8");
           return const0_rtx;
        }
+
+      /* Optimize.  If mask is known to have all high bits set,
+        replace op0 with pc_rtx to signal that the instruction
+        overwrites the whole destination and doesn't use its
+        previous contents.  */
+      if (optimize)
+       {
+         if (TREE_CODE (arg3) == VECTOR_CST)
+           {
+             tree elt;
+             unsigned int negative = 0;
+             for (elt = TREE_VECTOR_CST_ELTS (arg3);
+                  elt; elt = TREE_CHAIN (elt))
+               {
+                 tree cst = TREE_VALUE (elt);
+                 if (TREE_CODE (cst) == INTEGER_CST
+                     && tree_int_cst_sign_bit (cst))
+                   negative++;
+                 else if (TREE_CODE (cst) == REAL_CST
+                          && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
+                   negative++;
+               }
+             if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
+               op0 = pc_rtx;
+           }
+         else if (TREE_CODE (arg3) == SSA_NAME)
+           {
+             /* Recognize also when mask is like:
+                __v2df src = _mm_setzero_pd ();
+                __v2df mask = _mm_cmpeq_pd (src, src);
+                or
+                __v8sf src = _mm256_setzero_ps ();
+                __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+                as that is a cheaper way to load all ones into
+                a register than having to load a constant from
+                memory.  */
+             gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
+             if (is_gimple_call (def_stmt))
+               {
+                 tree fndecl = gimple_call_fndecl (def_stmt);
+                 if (fndecl
+                     && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+                   switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+                     {
+                     case IX86_BUILTIN_CMPPD:
+                     case IX86_BUILTIN_CMPPS:
+                     case IX86_BUILTIN_CMPPD256:
+                     case IX86_BUILTIN_CMPPS256:
+                       if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
+                         break;
+                       /* FALLTHRU */
+                     case IX86_BUILTIN_CMPEQPD:
+                     case IX86_BUILTIN_CMPEQPS:
+                       if (initializer_zerop (gimple_call_arg (def_stmt, 0))
+                           && initializer_zerop (gimple_call_arg (def_stmt,
+                                                                  1)))
+                         op0 = pc_rtx;
+                       break;
+                     default:
+                       break;
+                     }
+               }
+           }
+       }
+
       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
       if (! pat)
        return const0_rtx;
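Both shapes the expander now recognizes can be reproduced at the source
level.  A sketch, assuming the masked-gather intrinsics from avx2intrin.h;
_mm256_set1_pd (-1.0) is merely an illustration of a VECTOR_CST whose
elements all have the sign bit set:

#include <immintrin.h>

/* VECTOR_CST path: every constant element is negative, so every
   high bit is set and op0 can become pc_rtx.  */
__m256d
gather_const_mask (double const *base, __m128i idx, __m256d src)
{
  __m256d mask = _mm256_set1_pd (-1.0);
  return _mm256_mask_i32gather_pd (src, base, idx, mask, 8);
}

/* SSA_NAME path: the mask is defined by a self-compare of a zeroed
   register, which the new code recognizes as all ones.  */
__m256d
gather_cmp_mask (double const *base, __m128i idx, __m256d src)
{
  __m256d zero = _mm256_setzero_pd ();
  __m256d mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);
  return _mm256_mask_i32gather_pd (src, base, idx, mask, 8);
}
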
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e3de9ec..688b5be 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx2_gathersi<mode>_2"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+       (unspec:VEC_GATHER_MODE
+         [(pc)
+          (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+            [(unspec:P
+               [(match_operand:P 2 "vsib_address_operand" "p")
+                (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "x")
+                (match_operand:SI 5 "const1248_operand" "n")]
+               UNSPEC_VSIBADDR)])
+          (mem:BLK (scratch))
+          (match_operand:VEC_GATHER_MODE 4 "register_operand" "1")]
+         UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "avx2_gatherdi<mode>"
   [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
                   (unspec:VEC_GATHER_MODE
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_2"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+       (unspec:VEC_GATHER_MODE
+         [(pc)
+          (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+            [(unspec:P
+               [(match_operand:P 2 "vsib_address_operand" "p")
+                (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x")
+                (match_operand:SI 5 "const1248_operand" "n")]
+               UNSPEC_VSIBADDR)])
+          (mem:BLK (scratch))
+          (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")]
+         UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+{
+  if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
+    return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}";
+  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
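For contrast, when the mask is not provably all ones the expander still
passes the real src operand, so the original gather patterns (which merge
masked-off lanes from src) are used; the new *_2 patterns only match the
pc_rtx form.  A hypothetical example that keeps the old path:

#include <immintrin.h>

/* Runtime mask: masked-off lanes must keep the corresponding
   elements of src, so op0 cannot be replaced with pc_rtx.  */
__m256d
gather_partial (double const *base, __m128i idx,
                __m256d src, __m256d mask)
{
  return _mm256_mask_i32gather_pd (src, base, idx, mask, 8);
}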