From ad36491ebb993ed53534c02030e47b4695f8fd2d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 26 Mar 2020 20:46:11 +0000
Subject: [PATCH] [X86] Prefer PACKUS(AND(),AND()) to
 SHUFFLE(PSHUFB(),PSHUFB()) on all targets

Extends rG9d1721ce3926 to support AVX2+ targets.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         |  5 +--
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll   | 57 +++++++++----------
 llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll |  8 ++--
 llvm/test/CodeGen/X86/vector-trunc.ll           | 46 +++++++++-----------
 4 files changed, 44 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c1dc26a..e40055f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14833,14 +14833,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   //
   // If the mask is a binary compaction, we can more efficiently perform this
   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
-  // TODO: AVX2+ sees a regression as they fail to see through VBROADCAST_LOAD
-  // masks.
   //
   // FIXME: The only exceptions to the above are blends which are exact
   // interleavings with direct instructions supporting them. We currently don't
   // handle those well here.
-  if (Subtarget.hasSSSE3() &&
-      (Subtarget.hasInt256() || IsSingleInput || NumEvenDrops != 1)) {
+  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
     bool V1InUse = false;
     bool V2InUse = false;
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 8f9ad92..5e074b4 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -13,56 +13,39 @@
 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.

 define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX1-LABEL: shuffle_v32i8_to_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpand (%rdi), %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v32i8_to_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: shuffle_v32i8_to_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
+; AVX512BW-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX512BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 756add1..5ba0efd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1987,10 +1987,10 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
 ;
 ; AVX2-LABEL: PR12412:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR12412:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 3848e0f..852d017 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1690,44 +1690,36 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: trunc2x8i16_16i8:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc2x8i16_16i8:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    retq
+; AVX-LABEL: trunc2x8i16_16i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc2x8i16_16i8:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc2x8i16_16i8:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc2x8i16_16i8:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc2x8i16_16i8:
-- 
2.7.4
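
For reference, here is a minimal C++ intrinsics sketch, not part of the patch itself, contrasting the two lowerings this change chooses between for the trunc2x8i16_16i8 pattern above. The function names are hypothetical illustrations only:

// Sketch only, not from the patch: both routines truncate two <8 x i16>
// vectors into one <16 x i8>. Names are hypothetical.
#include <immintrin.h>

// Preferred form: PACKUS(AND(),AND()). The two PANDs are independent of
// each other, and the 0x00FF mask is a simple constant that can fold into
// a (broadcast) load operand.
__m128i trunc2x8i16_16i8_packus(__m128i a, __m128i b) {
  const __m128i mask = _mm_set1_epi16(0x00FF);
  return _mm_packus_epi16(_mm_and_si128(a, mask), _mm_and_si128(b, mask));
}

// Previous AVX2+ form: SHUFFLE(PSHUFB(),PSHUFB()). Each PSHUFB gathers the
// even (low) bytes of its input into the low half (the -1 entries zero the
// upper lanes, standing in for the 'u' undef lanes in the CHECK lines);
// PUNPCKLQDQ then concatenates the two low halves.
__m128i trunc2x8i16_16i8_pshufb(__m128i a, __m128i b) {
  const __m128i ctrl = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                     -1, -1, -1, -1, -1, -1, -1, -1);
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(a, ctrl),
                            _mm_shuffle_epi8(b, ctrl));
}

The PACKUS form is generally the better choice: the two PANDs can execute in parallel on any vector ALU port of typical cores, while the two PSHUFBs compete for the shuffle port and still need a third shuffle (PUNPCKLQDQ) to merge the halves.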