unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
- // TODO: Assume we don't know anything for now.
- if (VT.isScalableVector())
- return 1;
-
- APInt DemandedElts = VT.isVector()
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts = VT.isFixedLengthVector()
? APInt::getAllOnes(VT.getVectorNumElements())
: APInt(1, 1);
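  // For example, for an nxv4i32 operand the mask is APInt(1, 1): whatever the
  // analysis proves for that single tracked bit (say, 25 sign bits for a
  // splat of (sext i8 to i32)) holds for every lane, at any vscale.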
return ComputeNumSignBits(Op, DemandedElts, Depth);
}

unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
                                          unsigned Depth) const {
EVT VT = Op.getValueType();
unsigned VTBits = VT.getScalarSizeInBits();
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
if (Depth >= MaxRecursionDepth)
return 1; // Limit search depth.
- if (!DemandedElts || VT.isScalableVector())
+ if (!DemandedElts)
return 1; // No demanded elts, better to assume we don't know anything.
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::MERGE_VALUES:
return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts,
Depth + 1);
+ case ISD::SPLAT_VECTOR: {
+ // Check if the sign bits of the source go down as far as the truncated value.
+ unsigned NumSrcBits = Op.getOperand(0).getValueSizeInBits();
+ unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
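+ // For example, splatting an i32 source that has 20 sign bits as nxv8i16:
+ // NumSrcBits = 32, VTBits = 16, and 20 > (32 - 16), so every lane of the
+ // truncating splat keeps 20 - (32 - 16) = 4 sign bits.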
+ if (NumSrcSignBits > (NumSrcBits - VTBits))
+ return NumSrcSignBits - (NumSrcBits - VTBits);
+ break;
+ }
case ISD::BUILD_VECTOR:
+ assert(!VT.isScalableVector());
Tmp = VTBits;
for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
    if (!DemandedElts[i])
      continue;
}
case ISD::BITCAST: {
+ if (VT.isScalableVector())
+ return 1;
SDValue N0 = Op.getOperand(0);
EVT SrcVT = N0.getValueType();
unsigned SrcBits = SrcVT.getScalarSizeInBits();
Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
return std::max(Tmp, Tmp2);
case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ if (VT.isScalableVector())
+ return 1;
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
APInt DemandedSrcElts = DemandedElts.zext(SrcVT.getVectorNumElements());
break;
}
case ISD::EXTRACT_ELEMENT: {
+ if (VT.isScalableVector())
+ return 1;
const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
const int BitWidth = Op.getValueSizeInBits();
const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;
// Get reverse index (starting from 1), Op1 value indexes elements from
// little end. Sign starts at big end.
const int rIndex = Items - 1 - Op.getConstantOperandVal(1);
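// Worked example: an i128 known to have 70 sign bits, split into i64
// elements, gives Items = 2; element 1 (rIndex = 0) yields
// clamp(70 - 0, 0, 64) = 64 sign bits, element 0 (rIndex = 1) yields
// clamp(70 - 64, 0, 64) = 6.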
return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth);
}
case ISD::INSERT_VECTOR_ELT: {
+ if (VT.isScalableVector())
+ return 1;
// If we know the element index, split the demand between the
// source vector and the inserted element, otherwise assume we need
// the original demanded vector elements and the value.
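  // For example, with a known insert index 2 into a 4-lane vector where all
  // lanes are demanded, lane 2 comes from the scalar, so the result is
  // min(SignBits(vector over lanes {0,1,3}), SignBits(scalar)).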
return Tmp;
}
case ISD::EXTRACT_VECTOR_ELT: {
+ if (VT.isScalableVector())
+ return 1;
SDValue InVec = Op.getOperand(0);
SDValue EltNo = Op.getOperand(1);
EVT VecVT = InVec.getValueType();
return ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
}
case ISD::CONCAT_VECTORS: {
+ if (VT.isScalableVector())
+ return 1;
// Determine the minimum number of sign bits across all demanded
// elts of the input vectors. Early out if the result is already 1.
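  // For example, for concat_vectors(v2i32 A, v2i32 B) with
  // DemandedElts = 0b0011 only A's lanes are demanded, so the result is just
  // ComputeNumSignBits(A); B never enters the minimum.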
Tmp = std::numeric_limits<unsigned>::max();
return Tmp;
}
case ISD::INSERT_SUBVECTOR: {
+ if (VT.isScalableVector())
+ return 1;
// Demand any elements from the subvector and the remainder from the src it's
// inserted into.
SDValue Src = Op.getOperand(0);
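  // For example, inserting a v2i32 subvector at index 2 of a v4i32 source
  // splits a demanded mask of 0b1111 into lanes {2,3} for the subvector and
  // lanes {0,1} for Src; the result is the minimum of the two queries.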
// We only need to handle vectors - computeKnownBits should handle
// scalar cases.
Type *CstTy = Cst->getType();
- if (CstTy->isVectorTy() &&
+ if (CstTy->isVectorTy() && !VT.isScalableVector() &&
(NumElts * VTBits) == CstTy->getPrimitiveSizeInBits() &&
VTBits == CstTy->getScalarSizeInBits()) {
Tmp = VTBits;
Opcode == ISD::INTRINSIC_WO_CHAIN ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::INTRINSIC_VOID) {
+ // TODO: This can probably be removed once target code is audited. This
+ // is here purely to reduce patch size and review complexity.
+ if (VT.isScalableVector())
+ return 1;
unsigned NumBits =
TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
if (NumBits > 1)
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z1.d, p0/m, z1.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: asr z1.d, z0.d, #63
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxtb z3.d, p0/m, z0.d
-; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: cmpne p0.d, p0/z, z3.d, z0.d
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxtb z1.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
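; Note: with ComputeNumSignBits handling scalable vectors, the sxtb'd
; operands are known to carry enough sign bits that the widened multiply
; cannot overflow its element type, so the smulh-based high-half check folds
; away and only the i8 truncation check (sxtb + cmpne) remains. The same
; fold applies to the wider-element tests below.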
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxtb z1.s, p0/m, z1.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: asr z1.s, z0.s, #31
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxtb z3.s, p0/m, z0.s
-; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, z1.s
-; CHECK-NEXT: cmpne p0.s, p0/z, z3.s, z0.s
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxtb z1.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEXT: sxtb z0.h, p0/m, z0.h
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.h, p0/m, z2.h, z1.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: asr z1.h, z0.h, #15
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxtb z3.h, p0/m, z0.h
-; CHECK-NEXT: cmpne p1.h, p0/z, z2.h, z1.h
-; CHECK-NEXT: cmpne p0.h, p0/z, z3.h, z0.h
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxtb z1.h, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, z0.h
; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z1.d, p0/m, z1.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: asr z1.d, z0.d, #63
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxth z3.d, p0/m, z0.d
-; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: cmpne p0.d, p0/z, z3.d, z0.d
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxth z1.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z1.s, p0/m, z1.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.s, p0/m, z2.s, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: asr z1.s, z0.s, #31
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxth z3.s, p0/m, z0.s
-; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, z1.s
-; CHECK-NEXT: cmpne p0.s, p0/z, z3.s, z0.s
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxth z1.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, z0.s
; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z1.d, p0/m, z1.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
-; CHECK-NEXT: movprfx z2, z0
-; CHECK-NEXT: smulh z2.d, p0/m, z2.d, z1.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: asr z1.d, z0.d, #63
-; CHECK-NEXT: movprfx z3, z0
-; CHECK-NEXT: sxtw z3.d, p0/m, z0.d
-; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: cmpne p0.d, p0/z, z3.d, z0.d
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: sxtw z1.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, z0.d
; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
; CHECK-NEXT: ret
%a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.smul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)