From 321e54f72d40aecd51f5db9e6d26c8db307e49a8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 23 Oct 2017 22:05:02 +0000 Subject: [PATCH] [X86][SSE] combineBitcastvxi1 - use PACKSSWB directly to pack v8i16 to v16i8 Avoid difficulties determining the number of sign bits later on in shuffle lowering to lower to PACKSS llvm-svn: 316383 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 ++- llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll | 86 +++++++++----------------- llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll | 52 ++++++---------- llvm/test/CodeGen/X86/bitcast-setcc-256.ll | 58 ++++++----------- llvm/test/CodeGen/X86/bitcast-setcc-512.ll | 4 +- 5 files changed, 70 insertions(+), 139 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6e4c73c..6a6f591 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29598,13 +29598,12 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, } if (SExtVT == MVT::v8i16) { - V = DAG.getBitcast(MVT::v16i8, V); - V = DAG.getVectorShuffle( - MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), - {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector"); + V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, + DAG.getUNDEF(MVT::v8i16)); } else assert(SExtVT.getScalarType() != MVT::i16 && - "Vectors of i16 must be shuffled"); + "Vectors of i16 must be packed"); if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) V = DAG.getBitcast(FPCastVT, V); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll index f4ff046..190f1c9 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -6,33 +6,22 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { -; SSE2-LABEL: v8i16: -; SSE2: # BB#0: -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: # kill: %AL %AL %EAX -; SSE2-NEXT: ret{{[l|q]}} -; -; SSSE3-LABEL: v8i16: -; SSSE3: # BB#0: -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pmovmskb %xmm2, %eax -; SSSE3-NEXT: # kill: %AL %AL %EAX -; SSSE3-NEXT: ret{{[l|q]}} +; SSE2-SSSE3-LABEL: v8i16: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: # kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: ret{{[l|q]}} ; ; AVX12-LABEL: v8i16: ; AVX12: # BB#0: ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: %AL %AL %EAX ; AVX12-NEXT: ret{{[l|q]}} @@ -754,42 +743,23 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) { } define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { -; SSE2-LABEL: v8i8: -; SSE2: # BB#0: -; SSE2-NEXT: psllw $8, %xmm3 -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: psllw $8, %xmm2 -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: psllw $8, %xmm1 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: psllw $8, %xmm0 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: # kill: %AL %AL %EAX -; SSE2-NEXT: ret{{[l|q]}} -; -; SSSE3-LABEL: v8i8: -; SSSE3: # BB#0: -; SSSE3-NEXT: psllw $8, %xmm3 -; SSSE3-NEXT: psraw $8, %xmm3 -; SSSE3-NEXT: psllw $8, %xmm2 -; SSSE3-NEXT: psraw $8, %xmm2 -; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: psllw $8, %xmm1 -; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: psllw $8, %xmm0 -; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: # kill: %AL %AL %EAX -; SSSE3-NEXT: ret{{[l|q]}} +; SSE2-SSSE3-LABEL: v8i8: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: psllw $8, %xmm3 +; SSE2-SSSE3-NEXT: psraw $8, %xmm3 +; SSE2-SSSE3-NEXT: psllw $8, %xmm2 +; SSE2-SSSE3-NEXT: psraw $8, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: psllw $8, %xmm1 +; SSE2-SSSE3-NEXT: psraw $8, %xmm1 +; SSE2-SSSE3-NEXT: psllw $8, %xmm0 +; SSE2-SSSE3-NEXT: psraw $8, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: # kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: ret{{[l|q]}} ; ; AVX12-LABEL: v8i8: ; AVX12: # BB#0: @@ -804,7 +774,7 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: %AL %AL %EAX ; AVX12-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll index 166ccc1..5c158af 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -210,34 +210,19 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; SSE2-LABEL: v8i32: -; SSE2: # BB#0: -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm4 -; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: # kill: %AL %AL %EAX -; SSE2-NEXT: ret{{[l|q]}} -; -; SSSE3-LABEL: v8i32: -; SSSE3: # BB#0: -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 -; SSSE3-NEXT: packssdw %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pmovmskb %xmm4, %eax -; SSSE3-NEXT: # kill: %AL %AL %EAX -; SSSE3-NEXT: ret{{[l|q]}} +; SSE2-SSSE3-LABEL: v8i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSE2-SSSE3-NEXT: # kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: v8i32: ; AVX1: # BB#0: @@ -252,7 +237,7 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: # kill: %AL %AL %EAX ; AVX1-NEXT: vzeroupper @@ -267,7 +252,7 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: %AL %AL %EAX ; AVX2-NEXT: vzeroupper @@ -310,8 +295,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 +; SSE2-NEXT: packsswb %xmm0, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax ; SSE2-NEXT: # kill: %AL %AL %EAX ; SSE2-NEXT: ret{{[l|q]}} @@ -330,7 +314,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; SSSE3-NEXT: pshufb %xmm1, %xmm6 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: packsswb %xmm0, %xmm6 ; SSSE3-NEXT: pmovmskb %xmm6, %eax ; SSSE3-NEXT: # kill: %AL %AL %EAX ; SSSE3-NEXT: ret{{[l|q]}} @@ -344,7 +328,7 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) ; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX12-NEXT: vpmovmskb %xmm0, %eax ; AVX12-NEXT: # kill: %AL %AL %EAX ; AVX12-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll index f4316c6..5728aea 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll @@ -50,26 +50,15 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) { } define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { -; SSE2-LABEL: v8i32: -; SSE2: # BB#0: -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: packsswb %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: # kill: %AL %AL %EAX -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8i32: -; SSSE3: # BB#0: -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: packsswb %xmm1, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSSE3-NEXT: # kill: %AL %AL %EAX -; SSSE3-NEXT: retq +; SSE2-SSSE3-LABEL: v8i32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: # kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: retq ; ; AVX1-LABEL: v8i32: ; AVX1: # BB#0: @@ -104,26 +93,15 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) { } define i8 @v8f32(<8 x float> %a, <8 x float> %b) { -; SSE2-LABEL: v8f32: -; SSE2: # BB#0: -; SSE2-NEXT: cmpltps %xmm1, %xmm3 -; SSE2-NEXT: cmpltps %xmm0, %xmm2 -; SSE2-NEXT: packsswb %xmm3, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: # kill: %AL %AL %EAX -; SSE2-NEXT: retq -; -; SSSE3-LABEL: v8f32: -; SSSE3: # BB#0: -; SSSE3-NEXT: cmpltps %xmm1, %xmm3 -; SSSE3-NEXT: cmpltps %xmm0, %xmm2 -; SSSE3-NEXT: packsswb %xmm3, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pmovmskb %xmm2, %eax -; SSSE3-NEXT: # kill: %AL %AL %EAX -; SSSE3-NEXT: retq +; SSE2-SSSE3-LABEL: v8f32: +; SSE2-SSSE3: # BB#0: +; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: # kill: %AL %AL %EAX +; SSE2-SSSE3-NEXT: retq ; ; AVX12-LABEL: v8f32: ; AVX12: # BB#0: diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll index 4262bdf..bcd0566 100644 --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -1045,7 +1045,7 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) { ; SSE-NEXT: pcmpgtq %xmm4, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: packsswb %xmm0, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq @@ -1109,7 +1109,7 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b) { ; SSE-NEXT: cmpltpd %xmm0, %xmm4 ; SSE-NEXT: packsswb %xmm5, %xmm4 ; SSE-NEXT: packsswb %xmm6, %xmm4 -; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE-NEXT: packsswb %xmm0, %xmm4 ; SSE-NEXT: pmovmskb %xmm4, %eax ; SSE-NEXT: # kill: %AL %AL %EAX ; SSE-NEXT: retq -- 2.7.4