From b16ccde7a42f8e2b1870338e93c4608660fbd2a2 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sat, 15 Dec 2012 16:47:44 +0000 Subject: [PATCH] X86: Add a couple of target-specific dag combines that turn VSELECTS into psubus if possible. We match the pattern "x >= y ? x-y : 0" into "subus x, y" and two special cases if y is a constant. DAGCombiner canonicalizes those so we first have to undo the canonicalization for those cases. The pattern occurs in gzip when the loop vectorizer is enabled. Part of PR14613. llvm-svn: 170273 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 68 ++++++ llvm/lib/Target/X86/X86ISelLowering.h | 3 + llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 1 + llvm/lib/Target/X86/X86InstrSSE.td | 34 ++- llvm/test/CodeGen/X86/psubus.ll | 340 +++++++++++++++++++++++++++ 5 files changed, 428 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/X86/psubus.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4ee985..70089fc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10097,6 +10097,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + // SSE2/AVX2 sub with unsigned saturation intrinsics + case Intrinsic::x86_sse2_psubus_b: + case Intrinsic::x86_sse2_psubus_w: + case Intrinsic::x86_avx2_psubus_b: + case Intrinsic::x86_avx2_psubus_w: + return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // SSE3/AVX horizontal add/sub intrinsics case Intrinsic::x86_sse3_hadd_ps: case Intrinsic::x86_sse3_hadd_pd: @@ -11961,6 +11969,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; @@ -14913,6 +14922,65 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Match VSELECTs into subs with unsigned saturation. + if (!DCI.isBeforeLegalize() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. + ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + + // Check if one of the arms of the VSELECT is a zero vector. If it's on the + // left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { + Other = LHS; + } + + if (Other.getNode() && Other->getNumOperands() == 2 && + DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { + SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); + SDValue CondRHS = Cond->getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? x-y : 0 --> subus x, y + // x > y ? x-y : 0 --> subus x, y + if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && + Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + + // If the RHS is a constant we have to reverse the const canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { + APInt A = cast(OpRHS.getOperand(0))->getAPIntValue(); + if (CondRHS.getConstantOperandVal(0) == -A-1) { + SmallVector V(VT.getVectorNumElements(), + DAG.getConstant(-A, VT.getScalarType())); + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, + V.data(), V.size())); + } + } + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + isSplatVector(OpRHS.getNode())) { + APInt A = cast(OpRHS.getOperand(0))->getAPIntValue(); + if (A.isSignBit()) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + } + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index cbe83a7..60f3409 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -182,6 +182,9 @@ namespace llvm { /// BLENDI - Blend where the selector is an immediate. BLENDI, + // SUBUS - Integer sub with unsigned saturation. + SUBUS, + /// HADD - Integer horizontal add. HADD, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 09ab995..7d16d27 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -128,6 +128,7 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 1912a93..54032fe 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -3724,6 +3724,12 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSB : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V; +defm VPSUBUSW : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v8i16, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V; defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; @@ -3735,12 +3741,6 @@ defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; @@ -3804,6 +3804,12 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L; +defm VPSUBUSBY : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v32i8, VR256, + memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V, VEX_L; +defm VPSUBUSWY : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v16i16, VR256, + memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V, VEX_L; defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, VR256, memopv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; @@ -3815,12 +3821,6 @@ defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; -defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; -defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L; @@ -3884,6 +3884,10 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, i128mem, SSE_INTALUQ_ITINS_P>; +defm PSUBUSB : PDI_binop_rm<0xD8, "psubusb", X86subus, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; +defm PSUBUSW : PDI_binop_rm<0xD9, "psubusw", X86subus, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; @@ -3894,12 +3898,6 @@ defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; -defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll new file mode 100644 index 0000000..aff4afb --- /dev/null +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -0,0 +1,340 @@ +; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2 +; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1 +; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @test1(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %2 = load <8 x i16>* %1, align 2 + %3 = icmp slt <8 x i16> %2, zeroinitializer + %4 = xor <8 x i16> %2, + %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer + store <8 x i16> %5, <8 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test1 +; SSE2: psubusw LCPI0_0(%rip), %xmm0 + +; AVX1: @test1 +; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0 + +; AVX2: @test1 +; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0 +} + +define void @test2(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %2 = load <8 x i16>* %1, align 2 + %3 = icmp ugt <8 x i16> %2, + %4 = add <8 x i16> %2, + %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer + store <8 x i16> %5, <8 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test2 +; SSE2: psubusw LCPI1_0(%rip), %xmm0 + +; AVX1: @test2 +; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0 + +; AVX2: @test2 +; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0 +} + +define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <8 x i16> undef, i16 %w, i32 0 + %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i16* %head, i64 %index + %2 = bitcast i16* %1 to <8 x i16>* + %3 = load <8 x i16>* %2, align 2 + %4 = icmp ult <8 x i16> %3, %broadcast15 + %5 = sub <8 x i16> %3, %broadcast15 + %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5 + store <8 x i16> %6, <8 x i16>* %2, align 2 + %index.next = add i64 %index, 8 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test3 +; SSE2: psubusw %xmm0, %xmm1 + +; AVX1: @test3 +; AVX1: vpsubusw %xmm0, %xmm1, %xmm1 + +; AVX2: @test3 +; AVX2: vpsubusw %xmm0, %xmm1, %xmm1 +} + +define void @test4(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %2 = load <16 x i8>* %1, align 1 + %3 = icmp slt <16 x i8> %2, zeroinitializer + %4 = xor <16 x i8> %2, + %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer + store <16 x i8> %5, <16 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test4 +; SSE2: psubusb LCPI3_0(%rip), %xmm0 + +; AVX1: @test4 +; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0 + +; AVX2: @test4 +; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0 +} + +define void @test5(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %2 = load <16 x i8>* %1, align 1 + %3 = icmp ugt <16 x i8> %2, + %4 = add <16 x i8> %2, + %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer + store <16 x i8> %5, <16 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test5 +; SSE2: psubusb LCPI4_0(%rip), %xmm0 + +; AVX1: @test5 +; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0 + +; AVX2: @test5 +; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0 +} + +define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <16 x i8> undef, i8 %w, i32 0 + %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i8* %head, i64 %index + %2 = bitcast i8* %1 to <16 x i8>* + %3 = load <16 x i8>* %2, align 1 + %4 = icmp ult <16 x i8> %3, %broadcast15 + %5 = sub <16 x i8> %3, %broadcast15 + %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5 + store <16 x i8> %6, <16 x i8>* %2, align 1 + %index.next = add i64 %index, 16 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test6 +; SSE2: psubusb %xmm0, %xmm1 + +; AVX1: @test6 +; AVX1: vpsubusb %xmm0, %xmm1, %xmm1 + +; AVX2: @test6 +; AVX2: vpsubusb %xmm0, %xmm1, %xmm1 +} + +define void @test7(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <16 x i16>* + %2 = load <16 x i16>* %1, align 2 + %3 = icmp slt <16 x i16> %2, zeroinitializer + %4 = xor <16 x i16> %2, + %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer + store <16 x i16> %5, <16 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test7 +; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0 +} + +define void @test8(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <16 x i16>* + %2 = load <16 x i16>* %1, align 2 + %3 = icmp ugt <16 x i16> %2, + %4 = add <16 x i16> %2, + %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer + store <16 x i16> %5, <16 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test8 +; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0 +} + +define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <16 x i16> undef, i16 %w, i32 0 + %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i16* %head, i64 %index + %2 = bitcast i16* %1 to <16 x i16>* + %3 = load <16 x i16>* %2, align 2 + %4 = icmp ult <16 x i16> %3, %broadcast15 + %5 = sub <16 x i16> %3, %broadcast15 + %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5 + store <16 x i16> %6, <16 x i16>* %2, align 2 + %index.next = add i64 %index, 8 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + + +; AVX2: @test9 +; AVX2: vpsubusw %ymm0, %ymm1, %ymm1 +} + +define void @test10(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %2 = load <32 x i8>* %1, align 1 + %3 = icmp slt <32 x i8> %2, zeroinitializer + %4 = xor <32 x i8> %2, + %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer + store <32 x i8> %5, <32 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + + +; AVX2: @test10 +; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0 +} + +define void @test11(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %2 = load <32 x i8>* %1, align 1 + %3 = icmp ugt <32 x i8> %2, + %4 = add <32 x i8> %2, + %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer + store <32 x i8> %5, <32 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test11 +; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0 +} + +define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <32 x i8> undef, i8 %w, i32 0 + %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i8* %head, i64 %index + %2 = bitcast i8* %1 to <32 x i8>* + %3 = load <32 x i8>* %2, align 1 + %4 = icmp ult <32 x i8> %3, %broadcast15 + %5 = sub <32 x i8> %3, %broadcast15 + %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5 + store <32 x i8> %6, <32 x i8>* %2, align 1 + %index.next = add i64 %index, 16 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test12 +; AVX2: vpsubusb %ymm0, %ymm1, %ymm1 +} -- 2.7.4