i386: Add pack/unpack patterns for 64bit vectors [PR89021]
author Uros Bizjak <ubizjak@gmail.com>
Thu, 24 Jun 2021 13:39:26 +0000 (15:39 +0200)
committer Uros Bizjak <ubizjak@gmail.com>
Thu, 24 Jun 2021 13:40:28 +0000 (15:40 +0200)
2021-06-24  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
PR target/89021
* config/i386/i386-expand.c (ix86_expand_sse_unpack):
Handle V8QI and V4HI modes.
* config/i386/mmx.md (sse4_1_<any_extend:code>v4qiv4hi2):
New insn pattern.
(sse4_1_<any_extend:code>v2hiv2si2): Ditto.
(mmxpackmode): New mode attribute.
(vec_pack_trunc_<mmxpackmode:mode>): New expander.
(mmxunpackmode): New mode attribute.
(vec_unpacks_lo_<mmxunpackmode:mode>): New expander.
(vec_unpacks_hi_<mmxunpackmode:mode>): Ditto.
(vec_unpacku_lo_<mmxunpackmode:mode>): Ditto.
(vec_unpacku_hi_<mmxunpackmode:mode>): Ditto.
* config/i386/i386.md (extsuffix): Move from ...
* config/i386/sse.md: ... here.

gcc/testsuite/

PR target/89021
* gcc.dg/vect/vect-nb-iter-ub-3.c (dg-additional-options):
Add --param vect-epilogues-nomask=0.
* gcc.target/i386/pr97249-1.c (foo): Add #pragma GCC unroll
to avoid loop vectorization.
(foo1): Ditto.
(foo2): Ditto.

gcc/config/i386/i386-expand.c
gcc/config/i386/i386.md
gcc/config/i386/mmx.md
gcc/config/i386/sse.md
gcc/testsuite/gcc.dg/vect/vect-nb-iter-ub-3.c
gcc/testsuite/gcc.target/i386/pr97249-1.c

index 2cb939e..e9763eb 100644 (file)
@@ -5161,6 +5161,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
          else
            unpack = gen_sse4_1_sign_extendv2siv2di2;
          break;
+       case E_V8QImode:
+         if (unsigned_p)
+           unpack = gen_sse4_1_zero_extendv4qiv4hi2;
+         else
+           unpack = gen_sse4_1_sign_extendv4qiv4hi2;
+         break;
+       case E_V4HImode:
+         if (unsigned_p)
+           unpack = gen_sse4_1_zero_extendv2hiv2si2;
+         else
+           unpack = gen_sse4_1_sign_extendv2hiv2si2;
+         break;
        default:
          gcc_unreachable ();
        }
@@ -5172,10 +5184,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
        }
       else if (high_p)
        {
-         /* Shift higher 8 bytes to lower 8 bytes.  */
-         tmp = gen_reg_rtx (V1TImode);
-         emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
-                                        GEN_INT (64)));
+         switch (GET_MODE_SIZE (imode))
+           {
+           case 16:
+             /* Shift higher 8 bytes to lower 8 bytes.  */
+             tmp = gen_reg_rtx (V1TImode);
+             emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+                                            GEN_INT (64)));
+             break;
+           case 8:
+             /* Shift higher 4 bytes to lower 4 bytes.  */
+             tmp = gen_reg_rtx (V1DImode);
+             emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
+                                           GEN_INT (32)));
+             break;
+           default:
+             gcc_unreachable ();
+           }
+
          tmp = gen_lowpart (imode, tmp);
        }
       else
@@ -5207,6 +5233,18 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
          else
            unpack = gen_vec_interleave_lowv4si;
          break;
+       case E_V8QImode:
+         if (high_p)
+           unpack = gen_mmx_punpckhbw;
+         else
+           unpack = gen_mmx_punpcklbw;
+         break;
+       case E_V4HImode:
+         if (high_p)
+           unpack = gen_mmx_punpckhwd;
+         else
+           unpack = gen_mmx_punpcklwd;
+         break;
        default:
          gcc_unreachable ();
        }
index 9043be3..9b619e2 100644 (file)
 (define_code_attr trunsuffix
   [(ss_truncate "s") (truncate "") (us_truncate "us")])
 
+;; Instruction suffix for SSE sign and zero extensions.
+(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
+
 ;; Used in signed and unsigned fix.
 (define_code_iterator any_fix [fix unsigned_fix])
 (define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
index 7a827dc..e887f03 100644 (file)
    (set_attr "type" "mmxcvt,sselog,sselog")
    (set_attr "mode" "DI,TI,TI")])
 
+(define_insn "sse4_1_<code>v4qiv4hi2"
+  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,Yw")
+       (any_extend:V4HI
+         (vec_select:V4QI
+           (match_operand:V8QI 1 "register_operand" "Yr,*x,Yw")
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "%vpmov<extsuffix>bw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_<code>v2hiv2si2"
+  [(set (match_operand:V2SI 0 "register_operand" "=Yr,*x,v")
+       (any_extend:V2SI
+         (vec_select:V2HI
+           (match_operand:V4HI 1 "register_operand" "Yr,*x,v")
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "%vpmov<extsuffix>wd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "TI")])
+
+;; Pack/unpack vector modes
+(define_mode_attr mmxpackmode
+  [(V4HI "V8QI") (V2SI "V4HI")])
+
+(define_expand "vec_pack_trunc_<mode>"
+  [(match_operand:<mmxpackmode> 0 "register_operand")
+   (match_operand:MMXMODE24 1 "register_operand")
+   (match_operand:MMXMODE24 2 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+{
+  rtx op1 = gen_lowpart (<mmxpackmode>mode, operands[1]);
+  rtx op2 = gen_lowpart (<mmxpackmode>mode, operands[2]);
+  ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0);
+  DONE;
+})
+
+(define_mode_attr mmxunpackmode
+  [(V8QI "V4HI") (V4HI "V2SI")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
+
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
+
+(define_expand "vec_unpacku_lo_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
+
+(define_expand "vec_unpacku_hi_<mode>"
+  [(match_operand:<mmxunpackmode> 0 "register_operand")
+   (match_operand:MMXMODE12 1 "register_operand")]
+  "TARGET_MMX_WITH_SSE"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
+
 (define_insn "*mmx_pinsrd"
   [(set (match_operand:V2SI 0 "register_operand" "=x,Yv")
         (vec_merge:V2SI
index 2d29877..e4f01e6 100644 (file)
  [(V8SI "si") (V8SF "ps") (V4DF "pd")
   (V16SI "si") (V16SF "ps") (V8DF "pd")])
 
-;; Instruction suffix for sign and zero extensions.
-(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
-
 ;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
 ;; i64x4 or f64x4 for 512bit modes.
 (define_mode_attr i128
index dbf5091..1666526 100644 (file)
@@ -1,4 +1,4 @@
-/* { dg-additional-options "-fdump-tree-cunroll-details" } */
+/* { dg-additional-options "-fdump-tree-cunroll-details --param vect-epilogues-nomask=0" } */
 
 #include "tree-vect.h"
 
index 4478a34..e7d1d74 100644 (file)
@@ -8,23 +8,26 @@
 void
 foo (unsigned char* p1, unsigned char* p2, short* __restrict p3)
 {
-    for (int i = 0 ; i != 8; i++)
-     p3[i] = p1[i] + p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 8
+  for (int i = 0 ; i != 8; i++)
+    p3[i] = p1[i] + p2[i];
 }
 
 void
 foo1 (unsigned short* p1, unsigned short* p2, int* __restrict p3)
 {
-    for (int i = 0 ; i != 4; i++)
-     p3[i] = p1[i] + p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 4
+  for (int i = 0 ; i != 4; i++)
+    p3[i] = p1[i] + p2[i];
 }
 
 void
 foo2 (unsigned int* p1, unsigned int* p2, long long* __restrict p3)
 {
-    for (int i = 0 ; i != 2; i++)
-      p3[i] = (long long)p1[i] + (long long)p2[i];
-     return;
+  /* Avoid loop vectorization.  */
+#pragma GCC unroll 2
+  for (int i = 0 ; i != 2; i++)
+    p3[i] = (long long)p1[i] + (long long)p2[i];
 }