From f51913155c54eefd6ddb5baad291e8935b98be3a Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 26 Sep 2017 16:43:57 +0000
Subject: [PATCH] [X86] Add support for v16i32 UMUL_LOHI/SMUL_LOHI

Summary: This patch extends the v8i32/v4i32 custom lowering to support v16i32

Reviewers: zvi, RKSimon

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D38274

llvm-svn: 314221
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  37 +--
 llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 404 ++------------------------
 llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 356 ++---------------------
 3 files changed, 68 insertions(+), 729 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 02e8f9d..23c0ce3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1298,6 +1298,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

     setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
@@ -1306,7 +1310,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
     setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

-    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

     // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
     setOperationAction(ISD::ABS, MVT::v4i64, Legal);
@@ -21800,7 +21803,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
   }

   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
-         (VT == MVT::v8i32 && Subtarget.hasInt256()));
+         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+         (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+
+  int NumElts = VT.getVectorNumElements();

   // PMULxD operations multiply each even value (starting at 0) of LHS with
   // the related value of RHS and produce a widen result.
@@ -21814,17 +21820,17 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
   //
   // Place the odd value at an even position (basically, shift all values 1
   // step to the left):
-  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
   // <a|b|c|d> => <b|undef|d|undef>
   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
-                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+                                      makeArrayRef(&Mask[0], NumElts));
   // <e|f|g|h> => <f|undef|h|undef>
   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
-                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+                                      makeArrayRef(&Mask[0], NumElts));

   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   // ints.
-  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   unsigned Opcode =
       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

   // Shuffle it back into the right order.
- SDValue Highs, Lows; - if (VT == MVT::v8i32) { - const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; - Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; - Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); - } else { - const int HighMask[] = {1, 5, 3, 7}; - Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 4, 2, 6}; - Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + SmallVector HighMask(NumElts); + SmallVector LowMask(NumElts); + for (int i = 0; i != NumElts; ++i) { + HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; + LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts); } + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 540540c..c582ebf 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -84,172 +84,17 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { ; AVX-LABEL: test_div7_16i32: ; AVX: # BB#0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrd $1, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $2, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq 
$-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $31, %ecx -; AVX-NEXT: sarl $2, %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = 
[2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 +; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 +; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; AVX-NEXT: vpsrld $31, %zmm0, %zmm1 +; AVX-NEXT: vpsrad $2, %zmm0, %zmm0 +; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = sdiv <16 x i32> %a, ret <16 x i32> %res @@ -1159,220 +1004,19 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-LABEL: test_rem7_16i32: ; AVX: # BB#0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrd $1, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $2, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: leal (,%rdx,8), %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: subl %esi, %ecx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $2, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $2, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: leal (,%rdx,8), %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: subl %esi, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd 
$1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $2, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: leal (,%rdx,8), %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: subl %esi, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movslq %ecx, %rcx -; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: movl %edx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $2, %edx -; AVX-NEXT: addl %esi, %edx -; AVX-NEXT: leal (,%rdx,8), %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: subl %esi, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: cltq -; 
AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: cltq -; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $2, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: leal (,%rcx,8), %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: subl %edx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 +; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1 +; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1 +; AVX-NEXT: vpsrld $31, %zmm1, %zmm2 +; AVX-NEXT: vpsrad $2, %zmm1, %zmm1 +; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = srem <16 x i32> %a, ret <16 x i32> %res diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index c2db722..a8ddf75 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -92,140 +92,17 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind { define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { ; AVX-LABEL: test_div7_16i32: ; AVX: # BB#0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrd $1, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $2, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax 
-; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: shrl %eax -; AVX-NEXT: addl %ecx, %eax -; AVX-NEXT: shrl $2, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX-NEXT: vpmuludq %zmm1, %zmm0, 
%zmm2 +; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 +; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0 +; AVX-NEXT: vpsrld $1, %zmm0, %zmm0 +; AVX-NEXT: vpaddd %zmm3, %zmm0, %zmm0 +; AVX-NEXT: vpsrld $2, %zmm0, %zmm0 ; AVX-NEXT: retq %res = udiv <16 x i32> %a, ret <16 x i32> %res @@ -1013,204 +890,19 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind { define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { ; AVX-LABEL: test_rem7_16i32: ; AVX: # BB#0: -; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX-NEXT: vpextrd $1, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: shrl %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: shrl $2, %esi -; AVX-NEXT: leal (,%rsi,8), %edx -; AVX-NEXT: subl %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm2 -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $2, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpextrd $3, %xmm1, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: shrl %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: shrl $2, %esi -; AVX-NEXT: leal (,%rsi,8), %edx -; AVX-NEXT: subl %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: 
vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm2, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: shrl %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: shrl $2, %esi -; AVX-NEXT: leal (,%rsi,8), %edx -; AVX-NEXT: subl %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm2, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 -; AVX-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rdx -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: subl %edx, %esi -; AVX-NEXT: shrl %esi -; AVX-NEXT: addl %edx, %esi -; AVX-NEXT: shrl $2, %esi -; AVX-NEXT: leal (,%rsi,8), %edx -; AVX-NEXT: subl %esi, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm3 -; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 -; AVX-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925 -; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $2, %edx -; AVX-NEXT: 
leal (,%rdx,8), %ecx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0 -; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1 +; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 +; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1 +; AVX-NEXT: vpsrld $1, %zmm1, %zmm1 +; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1 +; AVX-NEXT: vpsrld $2, %zmm1, %zmm1 +; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX-NEXT: retq %res = urem <16 x i32> %a, ret <16 x i32> %res -- 2.7.4
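
The mask formula introduced by this patch can be sanity-checked outside of LLVM. The short standalone C++ program below (illustrative only, not part of the patch) recomputes HighMask/LowMask for 4, 8 and 16 elements; the 4- and 8-element results match the hard-coded {1, 5, 3, 7} / {0, 4, 2, 6} and {1, 9, 3, 11, 5, 13, 7, 15} / {0, 8, 2, 10, 4, 12, 6, 14} tables the patch removes, and the 16-element HighMask matches the [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] permute constant in the updated vpermi2d CHECK lines above.

  // Standalone illustration: recompute the interleave masks built by the new
  // loop in LowerMUL_LOHI and print them for v4i32, v8i32 and v16i32.
  #include <cstdio>
  #include <vector>

  int main() {
    for (int NumElts : {4, 8, 16}) {
      std::vector<int> HighMask(NumElts), LowMask(NumElts);
      for (int i = 0; i != NumElts; ++i) {
        // Even result lanes read from the first shuffle operand (Mul1), odd
        // result lanes from the second (Mul2); indices >= NumElts select Mul2.
        HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
        LowMask[i]  = (i / 2) * 2 + ((i % 2) * NumElts);
      }
      std::printf("NumElts=%d\n  HighMask:", NumElts);
      for (int M : HighMask)
        std::printf(" %d", M);
      std::printf("\n  LowMask: ");
      for (int M : LowMask)
        std::printf(" %d", M);
      std::printf("\n");
    }
    return 0;
  }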