From b97885cc2eb06a29141b6ea67413f8ff17afcc10 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 11 Jan 2019 05:44:56 +0000 Subject: [PATCH] [X86] Change vXi1 extract_vector_elt lowering to be legal if the index is 0. Add DAG combine to turn scalar_to_vector+extract_vector_elt into extract_subvector. We were lowering the last step extract_vector_elt to a bitcast+truncate. Change it to use an extract_vector_elt of index 0 instead. Add isel patterns to do the equivalent of what the bitcast would have done. Plus an isel pattern for an any_extend+extract to prevent some regressions. Finally add a DAG combine to turn v1i1 scalar_to_vector+extract_vector_elt of 0 into an extract_subvector. This fixes some of the regressions from r350800. llvm-svn: 350918 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 46 +++++++-------- llvm/lib/Target/X86/X86InstrAVX512.td | 11 ++++ llvm/test/CodeGen/X86/avx512-insert-extract.ll | 4 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 82 ++++++++++++-------- llvm/test/CodeGen/X86/avx512-schedule.ll | 48 +++++++-------- llvm/test/CodeGen/X86/masked_gather_scatter.ll | 8 +-- llvm/test/CodeGen/X86/pr33349.ll | 32 +++++----- llvm/test/CodeGen/X86/vec_floor.ll | 48 +++++---------- 8 files changed, 133 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index def4be5..c8ed78d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -16296,34 +16296,25 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, } unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0) // the operation is legal + return Op; - // If the kshift instructions of the correct width aren't natively supported - // then we need to promote the vector to the native size to get the correct - // zeroing behavior. 
- if (VecVT.getVectorNumElements() < 16) { - VecVT = MVT::v16i1; - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, - DAG.getUNDEF(VecVT), Vec, + // Extend to natively supported kshift. + unsigned NumElems = VecVT.getVectorNumElements(); + MVT WideVecVT = VecVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { + WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, + DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } - // Extracts from element 0 are always allowed. - if (IdxVal != 0) { - // Use kshiftr instruction to move to the lower element. - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - } - - // Shrink to v16i1 since that's always legal. - if (VecVT.getVectorNumElements() > 16) { - VecVT = MVT::v16i1; - Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec, - DAG.getIntPtrConstant(0, dl)); - } + // Use kshiftr instruction to move to the lower element. + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); - // Convert to a bitcast+aext/trunc. - MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements()); - return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + DAG.getIntPtrConstant(0, dl)); } SDValue @@ -41413,6 +41404,15 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Src.getOperand(0)); + // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. 
+ if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() && + Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) + if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) + if (C->isNullValue()) + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, + Src.getOperand(0), Src.getOperand(1)); + return SDValue(); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 105ca2e..97c9eac 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2889,6 +2889,11 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>; } +def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT", + SDTypeProfile<1, 2, [SDTCisVT<0, i8>, + SDTCVecEltisVT<1, i1>, + SDTCisPtrTy<2>]>>; + let Predicates = [HasAVX512] in { multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> { def : Pat<(maskVT (scalar_to_vector GR32:$src)), @@ -2896,6 +2901,12 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; + + def : Pat<(i8 (X86kextract maskRC:$src, (iPTR 0))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; + + def : Pat<(i32 (anyext (i8 (X86kextract maskRC:$src, (iPTR 0))))), + (i32 (COPY_TO_REGCLASS maskRC:$src, GR32))>; } defm : operation_gpr_mask_copy_lowering<VK1, v1i1>; diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 05542ac..3d19447 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -270,7 +270,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { ; SKX: ## %bb.0: ; SKX-NEXT: movq %rdi, %rax ; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 -; SKX-NEXT: kshiftrw $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 ; SKX-NEXT: kmovd %k0, %ecx ; SKX-NEXT: testb 
$1, %cl ; SKX-NEXT: cmoveq %rsi, %rax @@ -976,7 +976,7 @@ define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) { ; SKX-LABEL: test_extractelement_v4i1: ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0 -; SKX-NEXT: kshiftrw $3, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 122be67..500672c 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1281,19 +1281,15 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: kshiftrw $6, %k0, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $6, %k0, %k3 +; KNL-NEXT: kxorw %k1, %k3, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $9, %k1, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $7, %k2, %k1 ; KNL-NEXT: korw %k1, %k0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 @@ -1303,37 +1299,37 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; ; SKX-LABEL: test18: ; SKX: ## %bb.0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: kmovd %esi, %k2 -; SKX-NEXT: kshiftrw $8, %k2, %k0 -; SKX-NEXT: kshiftrw $9, %k2, %k2 -; SKX-NEXT: kshiftrb $6, %k1, %k3 -; SKX-NEXT: kxorb %k2, %k3, %k2 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $1, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 -; SKX-NEXT: kshiftlb $1, %k1, %k1 +; SKX-NEXT: kmovd %edi, %k0 +; SKX-NEXT: kmovd %esi, %k1 +; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: kshiftrw $9, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k0, %k3 +; 
SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k1 -; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: korb %k0, %k1, %k0 +; SKX-NEXT: kxorb %k1, %k0, %k0 +; SKX-NEXT: kshiftlb $1, %k0, %k0 +; SKX-NEXT: kshiftrb $1, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k2, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: vpmovm2w %k0, %xmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test18: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: kmovd %esi, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: kxorw %k2, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512BW-NEXT: kxorw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $9, %k1, %k1 +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 +; AVX512BW-NEXT: kxorw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1344,19 +1340,15 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: kmovw %edi, %k0 ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 -; AVX512DQ-NEXT: kmovw %k2, %eax ; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kmovw %k1, %ecx -; AVX512DQ-NEXT: kshiftrb $6, %k0, %k1 -; AVX512DQ-NEXT: kmovw %ecx, %k2 -; AVX512DQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3 +; AVX512DQ-NEXT: kxorb %k1, %k3, %k1 ; 
AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1 ; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -1368,16 +1360,16 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; X86: ## %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: kshiftrw $9, %k1, %k2 -; X86-NEXT: kshiftrw $8, %k1, %k1 -; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrw $8, %k1, %k2 +; X86-NEXT: kshiftrw $9, %k1, %k1 ; X86-NEXT: kshiftrb $6, %k0, %k3 -; X86-NEXT: kxorb %k2, %k3, %k2 -; X86-NEXT: kshiftlb $7, %k2, %k2 -; X86-NEXT: kshiftrb $1, %k2, %k2 -; X86-NEXT: kxorb %k2, %k0, %k0 +; X86-NEXT: kxorb %k1, %k3, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $1, %k1, %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 ; X86-NEXT: kshiftlb $1, %k0, %k0 ; X86-NEXT: kshiftrb $1, %k0, %k0 +; X86-NEXT: kshiftlb $7, %k2, %k1 ; X86-NEXT: korb %k1, %k0, %k0 ; X86-NEXT: vpmovm2w %k0, %xmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll index 2c9b6e1..bc9e6f7 100755 --- a/llvm/test/CodeGen/X86/avx512-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-schedule.ll @@ -7300,37 +7300,37 @@ define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { define <8 x i1> @vmov_test18(i8 %a, i16 %y) { ; GENERIC-LABEL: vmov_test18: ; GENERIC: # %bb.0: -; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33] -; GENERIC-NEXT: kmovd %esi, %k2 # sched: [1:0.33] -; GENERIC-NEXT: kshiftrw $8, %k2, %k0 # sched: [1:1.00] -; GENERIC-NEXT: kshiftrw $9, %k2, %k2 # sched: [1:1.00] -; GENERIC-NEXT: kshiftrb $6, %k1, %k3 # sched: [1:1.00] -; GENERIC-NEXT: kxorb %k2, %k3, %k2 # sched: [1:0.33] -; GENERIC-NEXT: kshiftlb $7, 
%k2, %k2 # sched: [1:1.00] -; GENERIC-NEXT: kshiftrb $1, %k2, %k2 # sched: [1:1.00] -; GENERIC-NEXT: kxorb %k2, %k1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: kshiftlb $1, %k1, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kshiftrw $8, %k1, %k2 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrw $9, %k1, %k1 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrb $6, %k0, %k3 # sched: [1:1.00] +; GENERIC-NEXT: kxorb %k1, %k3, %k1 # sched: [1:0.33] +; GENERIC-NEXT: kshiftlb $7, %k1, %k1 # sched: [1:1.00] ; GENERIC-NEXT: kshiftrb $1, %k1, %k1 # sched: [1:1.00] -; GENERIC-NEXT: kshiftlb $7, %k0, %k0 # sched: [1:1.00] -; GENERIC-NEXT: korb %k0, %k1, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:0.33] +; GENERIC-NEXT: kshiftlb $1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftrb $1, %k0, %k0 # sched: [1:1.00] +; GENERIC-NEXT: kshiftlb $7, %k2, %k1 # sched: [1:1.00] +; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: vmov_test18: ; SKX: # %bb.0: -; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; SKX-NEXT: kmovd %esi, %k2 # sched: [1:1.00] -; SKX-NEXT: kshiftrw $8, %k2, %k0 # sched: [3:1.00] -; SKX-NEXT: kshiftrw $9, %k2, %k2 # sched: [3:1.00] -; SKX-NEXT: kshiftrb $6, %k1, %k3 # sched: [3:1.00] -; SKX-NEXT: kxorb %k2, %k3, %k2 # sched: [1:1.00] -; SKX-NEXT: kshiftlb $7, %k2, %k2 # sched: [3:1.00] -; SKX-NEXT: kshiftrb $1, %k2, %k2 # sched: [3:1.00] -; SKX-NEXT: kxorb %k2, %k1, %k1 # sched: [1:1.00] -; SKX-NEXT: kshiftlb $1, %k1, %k1 # sched: [3:1.00] +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kshiftrw $8, %k1, %k2 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $9, %k1, %k1 # sched: [3:1.00] +; SKX-NEXT: kshiftrb $6, %k0, %k3 # sched: [3:1.00] +; SKX-NEXT: kxorb %k1, %k3, %k1 # sched: [1:1.00] +; SKX-NEXT: 
kshiftlb $7, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: kshiftrb $1, %k1, %k1 # sched: [3:1.00] -; SKX-NEXT: kshiftlb $7, %k0, %k0 # sched: [3:1.00] -; SKX-NEXT: korb %k0, %k1, %k0 # sched: [1:1.00] +; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kshiftlb $1, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftrb $1, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlb $7, %k2, %k1 # sched: [3:1.00] +; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25] ; SKX-NEXT: retq # sched: [7:1.00] %b = bitcast i8 %a to <8 x i1> diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 8212053..5e6d88a 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -1743,7 +1743,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX-NEXT: vmovq %xmm0, %rax ; SKX-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3 ; SKX-NEXT: .LBB31_2: # %else -; SKX-NEXT: kshiftrw $1, %k0, %k1 +; SKX-NEXT: kshiftrb $1, %k0, %k1 ; SKX-NEXT: kmovw %k1, %eax ; SKX-NEXT: testb $1, %al ; SKX-NEXT: je .LBB31_4 @@ -1751,7 +1751,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX-NEXT: vpextrq $1, %xmm0, %rax ; SKX-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3 ; SKX-NEXT: .LBB31_4: # %else2 -; SKX-NEXT: kshiftrw $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 ; SKX-NEXT: kmovw %k0, %eax ; SKX-NEXT: testb $1, %al ; SKX-NEXT: je .LBB31_6 @@ -1781,7 +1781,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x ; SKX_32-NEXT: vmovd %xmm1, %eax ; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0 ; SKX_32-NEXT: .LBB31_2: # %else -; SKX_32-NEXT: kshiftrw $1, %k0, %k1 +; SKX_32-NEXT: kshiftrb $1, %k0, %k1 ; SKX_32-NEXT: kmovw %k1, %eax ; SKX_32-NEXT: testb $1, %al ; SKX_32-NEXT: je .LBB31_4 @@ -1789,7 +1789,7 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x 
i1> %mask, <3 x ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0 ; SKX_32-NEXT: .LBB31_4: # %else2 -; SKX_32-NEXT: kshiftrw $2, %k0, %k0 +; SKX_32-NEXT: kshiftrb $2, %k0, %k0 ; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: testb $1, %al ; SKX_32-NEXT: je .LBB31_6 diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll index ec9f238..63edae0 100644 --- a/llvm/test/CodeGen/X86/pr33349.ll +++ b/llvm/test/CodeGen/X86/pr33349.ll @@ -12,32 +12,32 @@ target triple = "x86_64-unknown-linux-gnu" ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k1 ; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kshiftrw $2, %k0, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k2 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: testb $1, %al ; KNL-NEXT: fld1 ; KNL-NEXT: fldz ; KNL-NEXT: fld %st(0) ; KNL-NEXT: fcmovne %st(2), %st(0) -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k2 -; KNL-NEXT: kmovw %k2, %eax -; KNL-NEXT: testb $1, %al +; KNL-NEXT: testb $1, %cl ; KNL-NEXT: fld %st(1) ; KNL-NEXT: fcmovne %st(3), %st(0) -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: kmovw %k2, %eax ; KNL-NEXT: testb $1, %al ; KNL-NEXT: fld %st(2) ; KNL-NEXT: fcmovne %st(4), %st(0) -; KNL-NEXT: kmovw %k1, %eax +; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $1, %al ; KNL-NEXT: fxch %st(3) ; KNL-NEXT: fcmovne %st(4), %st(0) ; KNL-NEXT: fstp %st(4) ; KNL-NEXT: fxch %st(3) -; KNL-NEXT: fstpt 20(%rdi) -; KNL-NEXT: fxch %st(1) ; KNL-NEXT: fstpt (%rdi) ; KNL-NEXT: fxch %st(1) ; KNL-NEXT: fstpt 30(%rdi) +; KNL-NEXT: fxch %st(1) +; KNL-NEXT: fstpt 20(%rdi) ; KNL-NEXT: fstpt 10(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -46,20 +46,20 @@ target triple = "x86_64-unknown-linux-gnu" ; SKX: # %bb.0: # %bb ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vpmovd2m %xmm0, %k0 +; SKX-NEXT: kshiftrb $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %eax ; SKX-NEXT: kshiftrb $2, %k0, %k1 -; SKX-NEXT: kshiftrw $1, %k1, %k2 -; SKX-NEXT: kmovd %k2, %eax +; SKX-NEXT: 
kshiftrb $1, %k1, %k2 +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: testb $1, %al ; SKX-NEXT: fld1 ; SKX-NEXT: fldz ; SKX-NEXT: fld %st(0) ; SKX-NEXT: fcmovne %st(2), %st(0) -; SKX-NEXT: kmovd %k1, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: fld %st(1) ; SKX-NEXT: fcmovne %st(3), %st(0) -; SKX-NEXT: kshiftrw $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %eax +; SKX-NEXT: kmovd %k2, %eax ; SKX-NEXT: testb $1, %al ; SKX-NEXT: fld %st(2) ; SKX-NEXT: fcmovne %st(4), %st(0) @@ -71,10 +71,10 @@ target triple = "x86_64-unknown-linux-gnu" ; SKX-NEXT: fxch %st(3) ; SKX-NEXT: fstpt (%rdi) ; SKX-NEXT: fxch %st(1) -; SKX-NEXT: fstpt 10(%rdi) +; SKX-NEXT: fstpt 30(%rdi) ; SKX-NEXT: fxch %st(1) ; SKX-NEXT: fstpt 20(%rdi) -; SKX-NEXT: fstpt 30(%rdi) +; SKX-NEXT: fstpt 10(%rdi) ; SKX-NEXT: retq bb: %tmp = select <4 x i1> %m, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index 7a5f626..ef499af 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -1694,8 +1694,7 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512F-NEXT: vmovaps %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1703,8 +1702,7 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo ; ; AVX512VL-LABEL: floor_mask_ss_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 ; AVX512VL-NEXT: retq @@ -1755,16 +1753,14 @@ define 
<4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwin ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: floor_maskz_ss_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y @@ -1808,8 +1804,7 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512F-NEXT: vmovapd %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1817,8 +1812,7 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x ; ; AVX512VL-LABEL: floor_mask_sd_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0 ; AVX512VL-NEXT: retq @@ -1869,16 +1863,14 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 
; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: floor_maskz_sd_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y @@ -2766,8 +2758,7 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512F-NEXT: vmovaps %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -2775,8 +2766,7 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa ; ; AVX512VL-LABEL: ceil_mask_ss_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 ; AVX512VL-NEXT: retq @@ -2827,16 +2817,14 @@ define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: ceil_maskz_ss_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscaless $2, 
%xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq %mask1 = fcmp oeq <4 x float> %x, %y @@ -2880,8 +2868,7 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512F-NEXT: vmovapd %xmm2, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -2889,8 +2876,7 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d ; ; AVX512VL-LABEL: ceil_mask_sd_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} ; AVX512VL-NEXT: vmovapd %xmm2, %xmm0 ; AVX512VL-NEXT: retq @@ -2941,16 +2927,14 @@ define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounw ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %k1 +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: ceil_maskz_sd_mask8: ; AVX512VL: ## %bb.0: -; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 -; AVX512VL-NEXT: kmovw %k0, %k1 +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 ; AVX512VL-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm0 {%k1} {z} ; AVX512VL-NEXT: retq %mask1 = fcmp oeq <2 x double> %x, %y -- 2.7.4