From c7501e00efae2c0150db641fd5184732f52e8b88 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 26 May 2016 10:43:17 +0200 Subject: [PATCH] * config/i386/sse.md (<mask_codefor>avx512vl_shuf_<shuffletype>32x4_1<mask_name>): Rename to ... (avx512vl_shuf_<shuffletype>32x4_1<mask_name>): ... this. (*avx_vperm_broadcast_v4sf): Use v constraint instead of x. Use maybe_evex prefix instead of vex. (*avx_vperm_broadcast_<mode>): Use v constraint instead of x. Handle EXT_REX_SSE_REG_P (op0) case in the splitter. * gcc.target/i386/avx512vl-vbroadcast-3.c: New test. From-SVN: r236763 --- gcc/ChangeLog | 11 ++ gcc/config/i386/sse.md | 29 +++- gcc/testsuite/ChangeLog | 4 + .../gcc.target/i386/avx512vl-vbroadcast-3.c | 162 +++++++++++++++++++++ 4 files changed, 200 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a3c4d90..0bdd47e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2016-05-26 Jakub Jelinek + + * config/i386/sse.md + (<mask_codefor>avx512vl_shuf_<shuffletype>32x4_1<mask_name>): Rename + to ... + (avx512vl_shuf_<shuffletype>32x4_1<mask_name>): ... this. + (*avx_vperm_broadcast_v4sf): Use v constraint instead of x. Use + maybe_evex prefix instead of vex. + (*avx_vperm_broadcast_<mode>): Use v constraint instead of x. Handle + EXT_REX_SSE_REG_P (op0) case in the splitter. + 2016-05-25 Jeff Law PR tree-optimization/71272 diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2297ca2..c681098 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12399,7 +12399,7 @@ DONE; }) -(define_insn "<mask_codefor>avx512vl_shuf_<shuffletype>32x4_1<mask_name>" +(define_insn "avx512vl_shuf_<shuffletype>32x4_1<mask_name>" [(set (match_operand:VI4F_256 0 "register_operand" "=v") (vec_select:VI4F_256 (vec_concat:<ssedoublemode> @@ -17283,9 +17283,9 @@ ;; If it so happens that the input is in memory, use vbroadcast. ;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). 
(define_insn "*avx_vperm_broadcast_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=x,x,x") + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v") (vec_select:V4SF - (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x") + (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v") (match_parallel 2 "avx_vbroadcast_operand" [(match_operand 3 "const_int_operand" "C,n,n")])))] "TARGET_AVX" @@ -17307,13 +17307,13 @@ [(set_attr "type" "ssemov,ssemov,sselog1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "0,0,1") - (set_attr "prefix" "vex") + (set_attr "prefix" "maybe_evex") (set_attr "mode" "SF,SF,V4SF")]) (define_insn_and_split "*avx_vperm_broadcast_<mode>" - [(set (match_operand:VF_256 0 "register_operand" "=x,x,x") + [(set (match_operand:VF_256 0 "register_operand" "=v,v,v") (vec_select:VF_256 - (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?x") + (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v") (match_parallel 2 "avx_vbroadcast_operand" [(match_operand 3 "const_int_operand" "C,n,n")])))] "TARGET_AVX" @@ -17345,6 +17345,23 @@ /* Shuffle the lane we care about into both lanes of the dest. */ mask = (elt / (<ssescalarnum> / 2)) * 0x11; + if (EXT_REX_SSE_REG_P (op0)) + { + /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS + or VSHUFF128. */ + gcc_assert (<MODE>mode == V8SFmode); + if ((mask & 1) == 0) + emit_insn (gen_avx2_vec_dupv8sf (op0, + gen_lowpart (V4SFmode, op0))); + else + emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, + GEN_INT (4), GEN_INT (5), + GEN_INT (6), GEN_INT (7), + GEN_INT (12), GEN_INT (13), + GEN_INT (14), GEN_INT (15))); + DONE; + } + emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask))); DONE; } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8f1d5a7..fc925c3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2016-05-26 Jakub Jelinek + + * gcc.target/i386/avx512vl-vbroadcast-3.c: New test. 
+ 2016-05-26 Jiong Wang * gcc.target/aarch64/simd/vmul_elem_1.c: Force result variables to be diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c new file mode 100644 index 0000000..d981fe4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vbroadcast-3.c @@ -0,0 +1,162 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mavx512vl -masm=att" } */ + +typedef float V1 __attribute__((vector_size (16))); +typedef float V2 __attribute__((vector_size (32))); +typedef int V4 __attribute__((vector_size (16))); +typedef int V5 __attribute__((vector_size (32))); + +void +f1 (V1 x) +{ + register V1 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V4) { 0, 0, 0, 0 }); + asm volatile ("" : "+v" (a)); +} + +void +f2 (V1 x) +{ + register V1 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V4) { 1, 1, 1, 1 }); + asm volatile ("" : "+v" (a)); +} + +void +f3 (V1 x) +{ + register V1 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V4) { 2, 2, 2, 2 }); + asm volatile ("" : "+v" (a)); +} + +void +f4 (V1 x) +{ + register V1 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V4) { 3, 3, 3, 3 }); + asm volatile ("" : "+v" (a)); +} + +void +f5 (V1 *x) +{ + register V1 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V4) { 0, 0, 0, 0 }); + asm volatile ("" : "+v" (a)); +} + +void +f6 (V1 *x) +{ + register V1 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V4) { 1, 1, 1, 1 }); + asm volatile ("" : "+v" (a)); +} + +void +f7 (V1 *x) +{ + register V1 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V4) { 2, 2, 2, 2 }); + asm volatile ("" : "+v" (a)); +} + +void +f8 (V1 *x) +{ + register V1 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V4) { 3, 3, 3, 3 }); + asm volatile ("" : "+v" (a)); +} + +void +f9 (V2 x) +{ + register V2 a __asm 
("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V5) { 0, 0, 0, 0, 0, 0, 0, 0 }); + asm volatile ("" : "+v" (a)); +} + +void +f10 (V2 x) +{ + register V2 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V5) { 1, 1, 1, 1, 1, 1, 1, 1 }); + asm volatile ("" : "+v" (a)); +} + +void +f11 (V2 x) +{ + register V2 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V5) { 4, 4, 4, 4, 4, 4, 4, 4 }); + asm volatile ("" : "+v" (a)); +} + +void +f12 (V2 x) +{ + register V2 a __asm ("xmm16"); + a = x; + asm volatile ("" : "+v" (a)); + a = __builtin_shuffle (a, (V5) { 5, 5, 5, 5, 5, 5, 5, 5 }); + asm volatile ("" : "+v" (a)); +} + +void +f13 (V2 *x) +{ + register V2 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V5) { 0, 0, 0, 0, 0, 0, 0, 0 }); + asm volatile ("" : "+v" (a)); +} + +void +f14 (V2 *x) +{ + register V2 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V5) { 1, 1, 1, 1, 1, 1, 1, 1 }); + asm volatile ("" : "+v" (a)); +} + +void +f15 (V2 *x) +{ + register V2 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V5) { 4, 4, 4, 4, 4, 4, 4, 4 }); + asm volatile ("" : "+v" (a)); +} + +void +f16 (V2 *x) +{ + register V2 a __asm ("xmm16"); + a = __builtin_shuffle (*x, (V5) { 5, 5, 5, 5, 5, 5, 5, 5 }); + asm volatile ("" : "+v" (a)); +} + +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%rdi\[^\n\r]*%xmm16" 4 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%xmm16\[^\n\r]*%ymm16" 3 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\r]*%rdi\[^\n\r]*%ymm16" 3 } } */ +/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$170\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times 
"vpermilps\[^\n\r]*\\\$255\[^\n\r]*%xmm16\[^\n\r]*%xmm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$0\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 1 } } */ +/* { dg-final { scan-assembler-times "vpermilps\[^\n\r]*\\\$85\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */ +/* { dg-final { scan-assembler-times "vshuff32x4\[^\n\r]*\\\$3\[^\n\r]*%ymm16\[^\n\r]*%ymm16\[^\n\r]*%ymm16" 2 } } */ -- 2.7.4