%res = call i4 @llvm.smul.fix.sat.i4(i4 2, i4 4, i32 1) ; %res = 4 (1 x 2 = 2)
+'``llvm.umul.fix.sat.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.umul.fix.sat``
+on any integer bit width or vectors of integers.
+
+::
+
+ declare i16 @llvm.umul.fix.sat.i16(i16 %a, i16 %b, i32 %scale)
+ declare i32 @llvm.umul.fix.sat.i32(i32 %a, i32 %b, i32 %scale)
+ declare i64 @llvm.umul.fix.sat.i64(i64 %a, i64 %b, i32 %scale)
+ declare <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview:
+"""""""""
+
+The '``llvm.umul.fix.sat``' family of intrinsic functions perform unsigned
+saturating fixed point multiplication on 2 arguments of the same scale.
+
+Arguments:
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs unsigned fixed point multiplication on the 2
+arguments, both of which use the scale given in the third argument. The
+result is returned in that same scale.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+The maximum value this operation can clamp to is the largest unsigned value
+representable by the bit width of the first 2 arguments. The minimum value is the
+smallest unsigned value representable by this bit width (zero).
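+
+As an illustration only (not part of the definition above; the helper name is
+invented for the example, and it resolves the unspecified rounding by
+truncating), the ``i32`` variant can be modelled in C++ as:
+
+.. code-block:: c++
+
+      #include <cstdint>
+
+      uint32_t umul_fix_sat_i32(uint32_t a, uint32_t b, unsigned scale) {
+        uint64_t wide = (uint64_t)a * (uint64_t)b;  // exact double-width product
+        uint64_t res = wide >> scale;               // rescale back to the operand scale
+        return res > UINT32_MAX ? UINT32_MAX        // clamp to the largest unsigned value
+                                : (uint32_t)res;
+      }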
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 2, i32 0) ; %res = 6 (2 x 3 = 6)
+ %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 2, i32 1) ; %res = 3 (1.5 x 1 = 1.5)
+
+ ; The result in the following could be rounded down to 2 or up to 2.5
+ %res = call i4 @llvm.umul.fix.sat.i4(i4 3, i4 3, i32 1) ; %res = 4 (or 5) (1.5 x 1.5 = 2.25)
+
+ ; Saturation
+ %res = call i4 @llvm.umul.fix.sat.i4(i4 8, i4 2, i32 0) ; %res = 15 (8 x 2 -> clamped to 15)
+ %res = call i4 @llvm.umul.fix.sat.i4(i4 8, i4 8, i32 2) ; %res = 15 (2 x 2 -> clamped to 3.75)
+
+ ; Scale can affect the saturation result
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 4, i4 4, i32 0) ; %res = 15 (4 x 4 -> clamped to 15)
+      %res = call i4 @llvm.umul.fix.sat.i4(i4 4, i4 4, i32 1) ; %res = 8 (2 x 2 = 4)
+
+
Specialised Arithmetic Intrinsics
---------------------------------
/// Same as the corresponding unsaturated fixed point instructions, but the
/// result is clamped between the min and max values representable by the
/// bits of the first 2 operands.
- SMULFIXSAT,
+ SMULFIXSAT, UMULFIXSAT,
/// Simple binary floating point operators.
FADD, FSUB, FMUL, FDIV, FREM,
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
Supported = isSupportedFixedPointOperation(Op, VT, Scale);
break;
}
/// method accepts integers as its arguments.
SDValue expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const;
- /// Method for building the DAG expansion of ISD::SMULFIX. This method accepts
- /// integers as its arguments.
+ /// Method for building the DAG expansion of ISD::[U|S]MULFIX[SAT]. This
+ /// method accepts integers as its arguments.
SDValue expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::U(ADD|SUB)O. Expansion
def int_smul_fix_sat : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
+def int_umul_fix_sat : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative, ImmArg<2>]>;
//===------------------------- Memory Use Markers -------------------------===//
//
def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def smulfixsat : SDNode<"ISD::SMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
+def umulfixsat : SDNode<"ISD::UMULFIXSAT", SDTIntScaledBinOp, [SDNPCommutative]>;
def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
case Intrinsic::umul_fix:
+ case Intrinsic::umul_fix_sat:
case Intrinsic::sqrt: // Begin floating-point.
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
case Intrinsic::umul_fix:
+ case Intrinsic::umul_fix_sat:
return (ScalarOpdIdx == 2);
default:
return false;
case ISD::SUBCARRY: return visitSUBCARRY(N);
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: return visitMULFIX(N);
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: return visitMULFIX(N);
case ISD::MUL: return visitMUL(N);
case ISD::SDIV: return visitSDIV(N);
case ISD::UDIV: return visitUDIV(N);
return SDValue();
}
-// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT and UMULFIX here.
+// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
+// UMULFIXSAT here.
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
}
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: {
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
Results.push_back(TLI.expandFixedPointMul(Node, DAG));
break;
case ISD::ADDCARRY:
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break;
+
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: Res = PromoteIntRes_MULFIX(N); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
+
case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
case ISD::ATOMIC_LOAD:
SDValue Op1Promoted, Op2Promoted;
bool Signed =
N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating =
+ N->getOpcode() == ISD::SMULFIXSAT || N->getOpcode() == ISD::UMULFIXSAT;
if (Signed) {
Op1Promoted = SExtPromotedInteger(N->getOperand(0));
Op2Promoted = SExtPromotedInteger(N->getOperand(1));
unsigned DiffSize =
PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits();
- bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
if (Saturating) {
// Promoting the operand and result values changes the saturation width,
     // which extends the values that we clamp to on saturation. This could be
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: Res = PromoteIntOp_MULFIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
uint64_t Scale = N->getConstantOperandVal(2);
- bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating = (N->getOpcode() == ISD::SMULFIXSAT ||
+ N->getOpcode() == ISD::UMULFIXSAT);
bool Signed = (N->getOpcode() == ISD::SMULFIX ||
N->getOpcode() == ISD::SMULFIXSAT);
Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
} else {
EVT BoolVT = getSetCCResultType(VT);
- Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ unsigned MulOp = Signed ? ISD::SMULO : ISD::UMULO;
+ Result = DAG.getNode(MulOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
SDValue Product = Result.getValue(0);
SDValue Overflow = Result.getValue(1);
- assert(Signed && "Unsigned saturation not supported (yet).");
- APInt MinVal = APInt::getSignedMinValue(VTSize);
- APInt MaxVal = APInt::getSignedMaxValue(VTSize);
- SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
- SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
- SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
- Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
- Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ if (Signed) {
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+ Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else {
+ // For unsigned multiplication, we only need to check the max since we
+ // can't really overflow towards zero.
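+      // (ISD::UMULO's overflow result is set exactly when the product does
+      // not fit in VT, so selecting SatMax on overflow below yields the
+      // saturated value directly.)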
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ Result = DAG.getSelect(dl, VT, Overflow, SatMax, Product);
+ }
}
SplitInteger(Result, Lo, Hi);
return;
}
+ // For SMULFIX[SAT] we only expect to find Scale<VTSize, but this assert will
+ // cover for unhandled cases below, while still being valid for UMULFIX[SAT].
+ assert(Scale <= VTSize && "Scale can't be larger than the value type size.");
+
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue LL, LH, RL, RH;
GetExpandedInteger(LHS, LL, LH);
if (!Saturating)
return;
+ // Can not overflow when there is no integer part.
+ if (Scale == VTSize)
+ return;
+
// To handle saturation we must check for overflow in the multiplication.
//
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of Result)
+ // aren't all zeroes.
+ //
// Signed overflow happened if the upper (VTSize - Scale + 1) bits (of Result)
// aren't all ones or all zeroes.
//
// We cannot overflow past HH when multiplying 2 ints of size VTSize, so the
- // highest bit of HH determines saturation direction in the event of
+ // highest bit of HH determines saturation direction in the event of signed
// saturation.
SDValue ResultHL = Result[2];
SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
EVT BoolNVT = getSetCCResultType(NVT);
- if (!Signed)
- llvm_unreachable("Unsigned saturation not supported (yet).");
+ if (!Signed) {
+ if (Scale < NVTSize) {
+ // Overflow happened if ((HH | (HL >> Scale)) != 0).
+ SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getConstant(Scale, dl, ShiftTy));
+ SDValue Tmp = DAG.getNode(ISD::OR, dl, NVT, HLAdjusted, ResultHH);
+ SatMax = DAG.getSetCC(dl, BoolNVT, Tmp, NVTZero, ISD::SETNE);
+ } else if (Scale == NVTSize) {
+ // Overflow happened if (HH != 0).
+ SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETNE);
+ } else if (Scale < VTSize) {
+ // Overflow happened if ((HH >> (Scale - NVTSize)) != 0).
+ SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getConstant(Scale - NVTSize, dl,
+ ShiftTy));
+ SatMax = DAG.getSetCC(dl, BoolNVT, HLAdjusted, NVTZero, ISD::SETNE);
+ } else
+      llvm_unreachable("Scale must be less or equal to VTSize for UMULFIXSAT "
+                       "(and saturation can't happen with Scale==VTSize).");
+
+ Hi = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Lo);
+ return;
+ }
if (Scale < NVTSize) {
// The number of overflow bits we can check are VTSize - Scale + 1 (we
break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: {
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
case ISD::UMULFIX:
return ExpandFixedPointMul(Op);
case ISD::SMULFIXSAT:
- // FIXME: We do not expand SMULFIXSAT here yet, not sure why. Maybe it
- // results in worse codegen compared to the default unroll? This should
- // probably be investigated. And if we still prefer to unroll an explanation
- // could be helpful, otherwise it just looks like something that hasn't been
- // "implemented" yet.
+ case ISD::UMULFIXSAT:
+ // FIXME: We do not expand SMULFIXSAT/UMULFIXSAT here yet, not sure exactly
+ // why. Maybe it results in worse codegen compared to the unroll for some
+ // targets? This should probably be investigated. And if we still prefer to
+ // unroll an explanation could be helpful.
return DAG.UnrollVectorOp(Op.getNode());
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
R = ScalarizeVecRes_MULFIX(N);
break;
}
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
SplitVecRes_MULFIX(N, Lo, Hi);
break;
}
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
// These are binary operations, but with an extra operand that shouldn't
// be widened (the scale).
Res = WidenVecRes_BinaryWithExtraScalarOp(N);
Op3));
return;
}
+ case Intrinsic::umul_fix_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ setValue(&I, DAG.getNode(ISD::UMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
+ Op3));
+ return;
+ }
case Intrinsic::stacksave: {
SDValue Op = getRoot();
Res = DAG.getNode(
case ISD::SMULFIX: return "smulfix";
case ISD::SMULFIXSAT: return "smulfixsat";
case ISD::UMULFIX: return "umulfix";
+ case ISD::UMULFIXSAT: return "umulfixsat";
// Conversion operators.
case ISD::SIGN_EXTEND: return "sign_extend";
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
assert((Node->getOpcode() == ISD::SMULFIX ||
Node->getOpcode() == ISD::UMULFIX ||
- Node->getOpcode() == ISD::SMULFIXSAT) &&
+ Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT) &&
"Expected a fixed point multiplication opcode");
SDLoc dl(Node);
SDValue RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
unsigned Scale = Node->getConstantOperandVal(2);
- bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating = (Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT);
+ bool Signed = (Node->getOpcode() == ISD::SMULFIX ||
+ Node->getOpcode() == ISD::SMULFIXSAT);
EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
unsigned VTSize = VT.getScalarSizeInBits();
if (!Scale) {
// [us]mul.fix(a, b, 0) -> mul(a, b)
- if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
- return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
- } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) {
+ if (!Saturating) {
+ if (isOperationLegalOrCustom(ISD::MUL, VT))
+ return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ } else if (Signed && isOperationLegalOrCustom(ISD::SMULO, VT)) {
SDValue Result =
DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
SDValue Product = Result.getValue(0);
SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
return DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) {
+ SDValue Result =
+ DAG.getNode(ISD::UMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ return DAG.getSelect(dl, VT, Overflow, SatMax, Product);
}
}
- bool Signed =
- Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
"Expected scale to be less than the number of bits if signed or at "
"most the number of bits if unsigned.");
if (Scale == VTSize)
// Result is just the top half since we'd be shifting by the width of the
- // operand.
+ // operand. Overflow impossible so this works for both UMULFIX and
+ // UMULFIXSAT.
return Hi;
// The result will need to be shifted right by the scale since both operands
if (!Saturating)
return Result;
- unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
- SDValue HiMask =
- DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
- SDValue LoMask = DAG.getConstant(
- APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
- APInt MaxVal = APInt::getSignedMaxValue(VTSize);
- APInt MinVal = APInt::getSignedMinValue(VTSize);
+ if (!Signed) {
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of the
+ // widened multiplication) aren't all zeroes.
+
+ // Saturate to max if ((Hi >> Scale) != 0),
+ // which is the same as if (Hi > ((1 << Scale) - 1))
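+    // (E.g. for an i8 operation with Scale == 4, Hi is the top 8 bits of the
+    // 16-bit product, so we saturate to 0xff whenever Hi > 0x0f.)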
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask,
+ DAG.getConstant(MaxVal, dl, VT), Result,
+ ISD::SETUGT);
- Result = DAG.getSelectCC(dl, Hi, LoMask,
+ return Result;
+ }
+
+ // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of the
+ // widened multiplication) aren't all ones or all zeroes. We handled Scale==0
+  // above so all the bits to examine are in Hi.
+
+ // Saturate to max if ((Hi >> (Scale - 1)) > 0),
+ // which is the same as if (Hi > (1 << (Scale - 1)) - 1)
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask,
DAG.getConstant(MaxVal, dl, VT), Result,
ISD::SETGT);
- return DAG.getSelectCC(dl, Hi, HiMask,
- DAG.getConstant(MinVal, dl, VT), Result,
- ISD::SETLT);
+  // Saturate to min if ((Hi >> (Scale - 1)) < -1),
+  // which is the same as if (Hi < (-1 << (Scale - 1)))
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ SDValue HighMask =
+ DAG.getConstant(APInt::getHighBitsSet(VTSize, VTSize - Scale + 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, HighMask,
+ DAG.getConstant(MinVal, dl, VT), Result,
+ ISD::SETLT);
+ return Result;
}
void TargetLowering::expandUADDSUBO(
setOperationAction(ISD::SMULFIX, VT, Expand);
setOperationAction(ISD::SMULFIXSAT, VT, Expand);
setOperationAction(ISD::UMULFIX, VT, Expand);
+ setOperationAction(ISD::UMULFIXSAT, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
}
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
- case Intrinsic::umul_fix: {
+ case Intrinsic::umul_fix:
+ case Intrinsic::umul_fix_sat: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Assert(Op1->getType()->isIntOrIntVectorTy(),
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ppc32 | FileCheck %s
+
+declare i32 @llvm.umul.fix.sat.i32(i32, i32, i32)
+
+define i32 @func1(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: func1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li 5, -1
+; CHECK-NEXT: mulhwu. 6, 3, 4
+; CHECK-NEXT: mullw 3, 3, 4
+; CHECK-NEXT: bclr 12, 2, 0
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ori 3, 5, 0
+; CHECK-NEXT: blr
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i32 @func2(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: func2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mulhwu 6, 3, 4
+; CHECK-NEXT: li 5, -1
+; CHECK-NEXT: cmplwi 6, 1
+; CHECK-NEXT: mullw 3, 3, 4
+; CHECK-NEXT: rotlwi 3, 3, 31
+; CHECK-NEXT: rlwimi 3, 6, 31, 0, 0
+; CHECK-NEXT: bc 12, 1, .LBB1_1
+; CHECK-NEXT: blr
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: addi 3, 5, 0
+; CHECK-NEXT: blr
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 1)
+ ret i32 %tmp
+}
declare i32 @llvm.smul.fix.i32(i32, i32, i32 immarg)
declare i32 @llvm.umul.fix.i32(i32, i32, i32 immarg)
declare i32 @llvm.smul.fix.sat.i32(i32, i32, i32 immarg)
+declare i32 @llvm.umul.fix.sat.i32(i32, i32, i32 immarg)
declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32 immarg)
declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32 immarg)
declare <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32 immarg)
+declare <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32 immarg)
define i32 @smulfix_undef(i32 %y) nounwind {
; CHECK-LABEL: smulfix_undef:
ret i32 %tmp
}
+define i32 @umulfixsat_undef(i32 %y) nounwind {
+; CHECK-LABEL: umulfixsat_undef:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 undef, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i32 @umulfixsat_zero(i32 %y) nounwind {
+; CHECK-LABEL: umulfixsat_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 0, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
define <4 x i32> @vec_smulfix_undef(<4 x i32> %y) nounwind {
; CHECK-LABEL: vec_smulfix_undef:
; CHECK: # %bb.0:
%tmp = call <4 x i32> @llvm.smul.fix.sat.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y, i32 2)
ret <4 x i32> %tmp
}
+
+define <4 x i32> @vec_umulfixsat_undef(<4 x i32> %y) nounwind {
+; CHECK-LABEL: vec_umulfixsat_undef:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> undef, <4 x i32> %y, i32 2)
+ ret <4 x i32> %tmp
+}
+
+define <4 x i32> @vec_umulfixsat_zero(<4 x i32> %y) nounwind {
+; CHECK-LABEL: vec_umulfixsat_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y, i32 2)
+ ret <4 x i32> %tmp
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.umul.fix.sat.i4 (i4, i4, i32)
+declare i32 @llvm.umul.fix.sat.i32 (i32, i32, i32)
+declare i64 @llvm.umul.fix.sat.i64 (i64, i64, i32)
+declare <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: imulq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: shrdl $2, %eax, %ecx
+; X64-NEXT: cmpl $3, %eax
+; X64-NEXT: movl $-1, %eax
+; X64-NEXT: cmovbel %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: shrdl $2, %edx, %eax
+; X86-NEXT: cmpl $3, %edx
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmoval %ecx, %eax
+; X86-NEXT: retl
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 2)
+ ret i32 %tmp
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func2:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $2, %rdx, %rax
+; X64-NEXT: cmpq $3, %rdx
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovaq %rcx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shrdl $2, %eax, %ecx
+; X86-NEXT: shrdl $2, %edx, %eax
+; X86-NEXT: shrl $2, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl $-1, %edx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: cmovel %eax, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 2)
+ ret i64 %tmp
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func3:
+; X64: # %bb.0:
+; X64-NEXT: andl $15, %esi
+; X64-NEXT: shlb $4, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: imull %esi, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrb $2, %cl
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: movl %eax, %edx
+; X64-NEXT: shlb $6, %dl
+; X64-NEXT: orb %cl, %dl
+; X64-NEXT: movzbl %dl, %ecx
+; X64-NEXT: cmpb $3, %al
+; X64-NEXT: movl $255, %eax
+; X64-NEXT: cmovbel %ecx, %eax
+; X64-NEXT: shrb $4, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func3:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: andb $15, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: shlb $4, %cl
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: imull %edx, %eax
+; X86-NEXT: movb %ah, %cl
+; X86-NEXT: shlb $6, %cl
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: cmpb $3, %ah
+; X86-NEXT: movl $255, %eax
+; X86-NEXT: cmovbel %ecx, %eax
+; X86-NEXT: shrb $4, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 2)
+ ret i4 %tmp
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64: # %bb.0:
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT: movd %xmm2, %ecx
+; X64-NEXT: imulq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: shrdl $2, %eax, %ecx
+; X64-NEXT: cmpl $3, %eax
+; X64-NEXT: movl $-1, %eax
+; X64-NEXT: cmoval %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: movd %xmm3, %ecx
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: movd %xmm3, %edx
+; X64-NEXT: imulq %rcx, %rdx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: shrdl $2, %ecx, %edx
+; X64-NEXT: cmpl $3, %ecx
+; X64-NEXT: cmoval %eax, %edx
+; X64-NEXT: movd %edx, %xmm3
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: movd %xmm0, %edx
+; X64-NEXT: imulq %rcx, %rdx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: shrdl $2, %ecx, %edx
+; X64-NEXT: cmpl $3, %ecx
+; X64-NEXT: cmoval %eax, %edx
+; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movd %xmm0, %edx
+; X64-NEXT: imulq %rcx, %rdx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: shrdl $2, %ecx, %edx
+; X64-NEXT: cmpl $3, %ecx
+; X64-NEXT: cmoval %eax, %edx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shrdl $2, %edx, %esi
+; X86-NEXT: cmpl $3, %edx
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmoval %ecx, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: shrdl $2, %edx, %ebp
+; X86-NEXT: cmpl $3, %edx
+; X86-NEXT: cmoval %ecx, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl $2, %edx, %ebx
+; X86-NEXT: cmpl $3, %edx
+; X86-NEXT: cmoval %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: shrdl $2, %edx, %eax
+; X86-NEXT: cmpl $3, %edx
+; X86-NEXT: cmoval %ecx, %eax
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %ebx, 8(%edi)
+; X86-NEXT: movl %ebp, 4(%edi)
+; X86-NEXT: movl %esi, (%edi)
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
+ ret <4 x i32> %tmp
+}
+
+; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func4:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: mull %esi
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: cmovol %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func4:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovol %ecx, %eax
+; X86-NEXT: retl
+ %tmp = call i32 @llvm.umul.fix.sat.i32(i32 %x, i32 %y, i32 0)
+ ret i32 %tmp
+}
+
+define i64 @func5(i64 %x, i64 %y) {
+; X64-LABEL: func5:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovoq %rcx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func5:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: pushl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: pushl %edi
+; X86-NEXT: .cfi_def_cfa_offset 16
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 20
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: setne %dl
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: setne %cl
+; X86-NEXT: andb %dl, %cl
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: seto %bl
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: seto %ch
+; X86-NEXT: orb %bl, %ch
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: setb %bl
+; X86-NEXT: orb %ch, %bl
+; X86-NEXT: orb %cl, %bl
+; X86-NEXT: movl $-1, %ecx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
+; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 0)
+ ret i64 %tmp
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func6:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $15, %sil
+; X64-NEXT: shlb $4, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: mulb %sil
+; X64-NEXT: movzbl %al, %ecx
+; X64-NEXT: movl $255, %eax
+; X64-NEXT: cmovnol %ecx, %eax
+; X64-NEXT: shrb $4, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func6:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $15, %cl
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: shlb $4, %al
+; X86-NEXT: mulb %cl
+; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: movl $255, %eax
+; X86-NEXT: cmovnol %ecx, %eax
+; X86-NEXT: shrb $4, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.umul.fix.sat.i4(i4 %x, i4 %y, i32 0)
+ ret i4 %tmp
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec2:
+; X64: # %bb.0:
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; X64-NEXT: movd %xmm2, %ecx
+; X64-NEXT: mull %ecx
+; X64-NEXT: movl $-1, %ecx
+; X64-NEXT: cmovol %ecx, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; X64-NEXT: movd %xmm3, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X64-NEXT: movd %xmm3, %edx
+; X64-NEXT: mull %edx
+; X64-NEXT: cmovol %ecx, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %edx
+; X64-NEXT: mull %edx
+; X64-NEXT: cmovol %ecx, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-NEXT: movd %xmm0, %edx
+; X64-NEXT: mull %edx
+; X64-NEXT: cmovol %ecx, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: vec2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl $-1, %esi
+; X86-NEXT: cmovol %esi, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: cmovol %esi, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: cmovol %esi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: cmovol %esi, %eax
+; X86-NEXT: movl %eax, 12(%ecx)
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
+; X86-NEXT: movl %ebp, (%ecx)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.umul.fix.sat.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0)
+ ret <4 x i32> %tmp
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func7:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $32, %rdx, %rax
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: cmpq %rcx, %rdx
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovaq %rcx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func7:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl $1, %ebx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: notl %ecx
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 32)
+ ret i64 %tmp
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func8:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $63, %rdx, %rax
+; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: cmpq %rcx, %rdx
+; X64-NEXT: movq $-1, %rcx
+; X64-NEXT: cmovaq %rcx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: shrdl $31, %edx, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: cmpl $1, %esi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 63)
+ ret i64 %tmp
+}
declare <4 x i16> @llvm.smul.fix.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
declare <4 x i16> @llvm.umul.fix.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
declare <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
+declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
define <4 x i16> @smulfix(<4 x i16> %a) {
; CHECK-LABEL: smulfix:
}
+define <4 x i16> @umulfixsat(<4 x i16> %a) {
+; CHECK-LABEL: umulfixsat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-NEXT: leal (%rax,%rax,2), %eax
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shldw $1, %ax, %cx
+; CHECK-NEXT: cmpl $32767, %edx # imm = 0x7FFF
+; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
+; CHECK-NEXT: cmoval %eax, %ecx
+; CHECK-NEXT: pextrw $1, %xmm0, %edx
+; CHECK-NEXT: addl %edx, %edx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shldw $1, %dx, %di
+; CHECK-NEXT: cmpl $32767, %esi # imm = 0x7FFF
+; CHECK-NEXT: cmoval %eax, %edi
+; CHECK-NEXT: movd %xmm0, %edx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: shldw $1, %dx, %si
+; CHECK-NEXT: movl $32767, %edx # imm = 0x7FFF
+; CHECK-NEXT: negl %edx
+; CHECK-NEXT: cmoval %eax, %esi
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: pinsrw $0, %esi, %xmm1
+; CHECK-NEXT: pinsrw $1, %edi, %xmm1
+; CHECK-NEXT: pinsrw $2, %ecx, %xmm1
+; CHECK-NEXT: pextrw $3, %xmm0, %ecx
+; CHECK-NEXT: shll $2, %ecx
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shldw $1, %cx, %si
+; CHECK-NEXT: cmpl $32767, %edx # imm = 0x7FFF
+; CHECK-NEXT: cmoval %eax, %esi
+; CHECK-NEXT: pinsrw $3, %esi, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %t = call <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
+ ret <4 x i16> %t
+}
; Binary int plus constant scalar operand
declare <2 x i32> @llvm.smul.fix.sat.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.umul.fix.sat.v2i32(<2 x i32>, <2 x i32>, i32)
; CHECK-LABEL: @scalarize_sqrt_v2f32(
%smulfixsat = call <2 x i32> @llvm.smul.fix.sat.v2i32(<2 x i32> %x, <2 x i32> <i32 5, i32 19>, i32 31)
ret <2 x i32> %smulfixsat
}
+
+; CHECK-LABEL: @scalarize_umul_fix_sat_v2i32(
+; CHECK: %umulfixsat.i0 = call i32 @llvm.umul.fix.sat.i32(i32 %x.i0, i32 5, i32 31)
+; CHECK: %umulfixsat.i1 = call i32 @llvm.umul.fix.sat.i32(i32 %x.i1, i32 19, i32 31)
+; CHECK: %umulfixsat.upto0 = insertelement <2 x i32> undef, i32 %umulfixsat.i0, i32 0
+; CHECK: %umulfixsat = insertelement <2 x i32> %umulfixsat.upto0, i32 %umulfixsat.i1, i32 1
+; CHECK: ret <2 x i32> %umulfixsat
+define <2 x i32> @scalarize_umul_fix_sat_v2i32(<2 x i32> %x) #0 {
+ %umulfixsat = call <2 x i32> @llvm.umul.fix.sat.v2i32(<2 x i32> %x, <2 x i32> <i32 5, i32 19>, i32 31)
+ ret <2 x i32> %umulfixsat
+}
ret i64 %ret
}
+declare i64 @llvm.smul.fix.sat.i64(i64, i64, i32)
+define i64 @smul_fix_sat(i64 %arg0, i64 %arg1, i32 %arg2) {
+ ; CHECK: immarg operand has non-immediate parameter
+ ; CHECK-NEXT: i32 %arg2
+ ; CHECK-NEXT: %ret = call i64 @llvm.smul.fix.sat.i64(i64 %arg0, i64 %arg1, i32 %arg2)
+ %ret = call i64 @llvm.smul.fix.sat.i64(i64 %arg0, i64 %arg1, i32 %arg2)
+ ret i64 %ret
+}
+
declare i64 @llvm.umul.fix.i64(i64, i64, i32)
define i64 @umul_fix(i64 %arg0, i64 %arg1, i32 %arg2) {
; CHECK: immarg operand has non-immediate parameter
ret i64 %ret
}
+declare i64 @llvm.umul.fix.sat.i64(i64, i64, i32)
+define i64 @umul_fix_sat(i64 %arg0, i64 %arg1, i32 %arg2) {
+ ; CHECK: immarg operand has non-immediate parameter
+ ; CHECK-NEXT: i32 %arg2
+ ; CHECK-NEXT: %ret = call i64 @llvm.umul.fix.sat.i64(i64 %arg0, i64 %arg1, i32 %arg2)
+ %ret = call i64 @llvm.umul.fix.sat.i64(i64 %arg0, i64 %arg1, i32 %arg2)
+ ret i64 %ret
+}
+
declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
define <2 x double> @masked_load(<2 x i1> %mask, <2 x double>* %addr, <2 x double> %dst, i32 %align) {
; CHECK: immarg operand has non-immediate parameter