STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
-
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
+ case AArch64ISD::VSLI: return "AArch64ISD::VSLI";
+ case AArch64ISD::VSRI: return "AArch64ISD::VSRI";
case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
+
+ case Intrinsic::aarch64_neon_vsri:
+ case Intrinsic::aarch64_neon_vsli: {
+ EVT Ty = Op.getValueType();
+
+ if (!Ty.isVector())
+ report_fatal_error("Unexpected type for aarch64_neon_vsli");
+
+    // Fold the operands directly into the assert: in NDEBUG builds the
+    // assertion compiles away, and named locals used only inside it would
+    // trigger -Wunused-variable warnings.
+    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits() &&
+           "shift amount out of range for SLI/SRI intrinsic");
+
+ bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+ unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3));
+ }
}
}
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTOR with constant element C1, C2 is a constant, and:
+// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
+// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
+// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
- // Is the first op an AND?
- const SDValue And = N->getOperand(0);
- if (And.getOpcode() != ISD::AND)
+ SDValue And;
+ SDValue Shift;
+
+ SDValue FirstOp = N->getOperand(0);
+ unsigned FirstOpc = FirstOp.getOpcode();
+ SDValue SecondOp = N->getOperand(1);
+ unsigned SecondOpc = SecondOp.getOpcode();
+
+ // Is one of the operands an AND or a BICi? The AND may have been optimised to
+ // a BICi in order to use an immediate instead of a register.
+  // Is the other operand a shl or lshr? This will have been turned into:
+ // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+ if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
+ (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+ And = FirstOp;
+ Shift = SecondOp;
+
+ } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
+ (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+ And = SecondOp;
+ Shift = FirstOp;
+ } else
return SDValue();
- // Is the second op an shl or lshr?
- SDValue Shift = N->getOperand(1);
- // This will have been turned into: AArch64ISD::VSHL vector, #shift
- // or AArch64ISD::VLSHR vector, #shift
- unsigned ShiftOpc = Shift.getOpcode();
- if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
- return SDValue();
- bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+ bool IsAnd = And.getOpcode() == ISD::AND;
+ bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
- // Is the and mask vector all constant?
uint64_t C1;
- if (!isAllConstantBuildVector(And.getOperand(1), C1))
- return SDValue();
+ if (IsAnd) {
+ // Is the and mask vector all constant?
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+ } else {
+ // Reconstruct the corresponding AND immediate from the two BICi immediates.
+ ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
+ ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
+ assert(C1nodeImm && C1nodeShift);
+ C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+ }
- // Is C1 == ~C2, taking into account how much one can shift elements of a
- // particular size?
+ // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
+ // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
+ // how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
- unsigned ElemMask = (1 << ElemSizeInBits) - 1;
- if ((C1 & ElemMask) != (~C2 & ElemMask))
+
+ APInt C1AsAPInt(ElemSizeInBits, C1);
+ APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
+ : APInt::getLowBitsSet(ElemSizeInBits, C2);
+ if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
- unsigned Intrin =
- IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
- SDValue ResultSLI =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
- Shift.getOperand(1));
+ unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
- if (EnableAArch64SlrGeneration) {
- if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
- return Res;
- }
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+ return Res;
EVT VT = Op.getValueType();
-; RUN: llc < %s -aarch64-shift-insert-generation=true -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
-define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
-; CHECK-LABEL: testLeftGood:
-; CHECK: sli.16b v0, v1, #3
- %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+define void @testLeftGood8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.8b v0, v1, #3
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i8> %src1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %vshl_n = shl <8 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <8 x i8> %and.i, %vshl_n
+ store <8 x i8> %result, <8 x i8>* %dest, align 8
+ ret void
+}
+
+define void @testLeftBad8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.8b v2, #165
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: shl.8b v1, v1, #1
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = shl <8 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <8 x i8> %and.i, %vshl_n
+ store <8 x i8> %result, <8 x i8>* %dest, align 8
+ ret void
+}
+
+define void @testRightGood8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightGood8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.8b v0, v1, #3
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i8> %src1, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
+ %vshl_n = lshr <8 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <8 x i8> %and.i, %vshl_n
+ store <8 x i8> %result, <8 x i8>* %dest, align 8
+ ret void
+}
+
+define void @testRightBad8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightBad8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.8b v2, #165
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: ushr.8b v1, v1, #1
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = lshr <8 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <8 x i8> %and.i, %vshl_n
+ store <8 x i8> %result, <8 x i8>* %dest, align 8
+ ret void
+}
+
+define void @testLeftGood16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.16b v0, v1, #3
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <16 x i8> %src1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%vshl_n = shl <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
%result = or <16 x i8> %and.i, %vshl_n
store <16 x i8> %result, <16 x i8>* %dest, align 16
ret void
}
-define void @testLeftBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
-; CHECK-LABEL: testLeftBad:
-; CHECK-NOT: sli
+define void @testLeftBad16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.16b v2, #165
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: shl.16b v1, v1, #1
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
%and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
%vshl_n = shl <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 %result = or <16 x i8> %and.i, %vshl_n
 store <16 x i8> %result, <16 x i8>* %dest, align 16
 ret void
}
-define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
-; CHECK-LABEL: testRightGood:
-; CHECK: sri.16b v0, v1, #3
- %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+define void @testRightGood16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightGood16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.16b v0, v1, #3
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <16 x i8> %src1, <i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224, i8 224>
%vshl_n = lshr <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
%result = or <16 x i8> %and.i, %vshl_n
store <16 x i8> %result, <16 x i8>* %dest, align 16
ret void
}
-define void @testRightBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
-; CHECK-LABEL: testRightBad:
-; CHECK-NOT: sri
+define void @testRightBad16x8(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightBad16x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi.16b v2, #165
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: ushr.16b v1, v1, #1
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
%and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
%vshl_n = lshr <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%result = or <16 x i8> %and.i, %vshl_n
store <16 x i8> %result, <16 x i8>* %dest, align 16
ret void
}
+
+define void @testLeftGood4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.4h v0, v1, #14
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i16> %src1, <i16 16383, i16 16383, i16 16383, i16 16383>
+ %vshl_n = shl <4 x i16> %src2, <i16 14, i16 14, i16 14, i16 14>
+ %result = or <4 x i16> %and.i, %vshl_n
+ store <4 x i16> %result, <4 x i16>* %dest, align 8
+ ret void
+}
+
+define void @testLeftBad4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: dup.4h v2, w8
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: shl.4h v1, v1, #14
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i16> %src1, <i16 16500, i16 16500, i16 16500, i16 16500>
+ %vshl_n = shl <4 x i16> %src2, <i16 14, i16 14, i16 14, i16 14>
+ %result = or <4 x i16> %and.i, %vshl_n
+ store <4 x i16> %result, <4 x i16>* %dest, align 8
+ ret void
+}
+
+define void @testRightGood4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testRightGood4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.4h v0, v1, #14
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i16> %src1, <i16 65532, i16 65532, i16 65532, i16 65532>
+ %vshl_n = lshr <4 x i16> %src2, <i16 14, i16 14, i16 14, i16 14>
+ %result = or <4 x i16> %and.i, %vshl_n
+ store <4 x i16> %result, <4 x i16>* %dest, align 8
+ ret void
+}
+
+define void @testRightBad4x16(<4 x i16> %src1, <4 x i16> %src2, <4 x i16>* %dest) nounwind {
+; CHECK-LABEL: testRightBad4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: dup.4h v2, w8
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: ushr.4h v1, v1, #14
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i16> %src1, <i16 16500, i16 16500, i16 16500, i16 16500>
+ %vshl_n = lshr <4 x i16> %src2, <i16 14, i16 14, i16 14, i16 14>
+ %result = or <4 x i16> %and.i, %vshl_n
+ store <4 x i16> %result, <4 x i16>* %dest, align 8
+ ret void
+}
+
+define void @testLeftGood8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.8h v0, v1, #14
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i16> %src1, <i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383, i16 16383>
+ %vshl_n = shl <8 x i16> %src2, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
+ %result = or <8 x i16> %and.i, %vshl_n
+ store <8 x i16> %result, <8 x i16>* %dest, align 16
+ ret void
+}
+
+define void @testLeftBad8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: dup.8h v2, w8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: shl.8h v1, v1, #14
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i16> %src1, <i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500>
+ %vshl_n = shl <8 x i16> %src2, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
+ %result = or <8 x i16> %and.i, %vshl_n
+ store <8 x i16> %result, <8 x i16>* %dest, align 16
+ ret void
+}
+
+define void @testRightGood8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testRightGood8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.8h v0, v1, #14
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i16> %src1, <i16 65532, i16 65532, i16 65532, i16 65532, i16 65532, i16 65532, i16 65532, i16 65532>
+ %vshl_n = lshr <8 x i16> %src2, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
+ %result = or <8 x i16> %and.i, %vshl_n
+ store <8 x i16> %result, <8 x i16>* %dest, align 16
+ ret void
+}
+
+define void @testRightBad8x16(<8 x i16> %src1, <8 x i16> %src2, <8 x i16>* %dest) nounwind {
+; CHECK-LABEL: testRightBad8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: dup.8h v2, w8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: ushr.8h v1, v1, #14
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i16> %src1, <i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500, i16 16500>
+ %vshl_n = lshr <8 x i16> %src2, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
+ %result = or <8 x i16> %and.i, %vshl_n
+ store <8 x i16> %result, <8 x i16>* %dest, align 16
+ ret void
+}
+
+define void @testLeftGood2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.2s v0, v1, #22
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i32> %src1, <i32 4194303, i32 4194303>
+ %vshl_n = shl <2 x i32> %src2, <i32 22, i32 22>
+ %result = or <2 x i32> %and.i, %vshl_n
+ store <2 x i32> %result, <2 x i32>* %dest, align 8
+ ret void
+}
+
+define void @testLeftBad2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: dup.2s v2, w8
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: shl.2s v1, v1, #22
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i32> %src1, <i32 4194300, i32 4194300>
+ %vshl_n = shl <2 x i32> %src2, <i32 22, i32 22>
+ %result = or <2 x i32> %and.i, %vshl_n
+ store <2 x i32> %result, <2 x i32>* %dest, align 8
+ ret void
+}
+
+define void @testRightGood2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testRightGood2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.2s v0, v1, #22
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i32> %src1, <i32 4294966272, i32 4294966272>
+ %vshl_n = lshr <2 x i32> %src2, <i32 22, i32 22>
+ %result = or <2 x i32> %and.i, %vshl_n
+ store <2 x i32> %result, <2 x i32>* %dest, align 8
+ ret void
+}
+
+define void @testRightBad2x32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>* %dest) nounwind {
+; CHECK-LABEL: testRightBad2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: dup.2s v2, w8
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: ushr.2s v1, v1, #22
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i32> %src1, <i32 4194300, i32 4194300>
+ %vshl_n = lshr <2 x i32> %src2, <i32 22, i32 22>
+ %result = or <2 x i32> %and.i, %vshl_n
+ store <2 x i32> %result, <2 x i32>* %dest, align 8
+ ret void
+}
+
+define void @testLeftGood4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.4s v0, v1, #22
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i32> %src1, <i32 4194303, i32 4194303, i32 4194303, i32 4194303>
+ %vshl_n = shl <4 x i32> %src2, <i32 22, i32 22, i32 22, i32 22>
+ %result = or <4 x i32> %and.i, %vshl_n
+ store <4 x i32> %result, <4 x i32>* %dest, align 16
+ ret void
+}
+
+define void @testLeftBad4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: dup.4s v2, w8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: shl.4s v1, v1, #22
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i32> %src1, <i32 4194300, i32 4194300, i32 4194300, i32 4194300>
+ %vshl_n = shl <4 x i32> %src2, <i32 22, i32 22, i32 22, i32 22>
+ %result = or <4 x i32> %and.i, %vshl_n
+ store <4 x i32> %result, <4 x i32>* %dest, align 16
+ ret void
+}
+
+define void @testRightGood4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testRightGood4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.4s v0, v1, #22
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i32> %src1, <i32 4294966272, i32 4294966272, i32 4294966272, i32 4294966272>
+ %vshl_n = lshr <4 x i32> %src2, <i32 22, i32 22, i32 22, i32 22>
+ %result = or <4 x i32> %and.i, %vshl_n
+ store <4 x i32> %result, <4 x i32>* %dest, align 16
+ ret void
+}
+
+define void @testRightBad4x32(<4 x i32> %src1, <4 x i32> %src2, <4 x i32>* %dest) nounwind {
+; CHECK-LABEL: testRightBad4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: dup.4s v2, w8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: ushr.4s v1, v1, #22
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <4 x i32> %src1, <i32 4194300, i32 4194300, i32 4194300, i32 4194300>
+ %vshl_n = lshr <4 x i32> %src2, <i32 22, i32 22, i32 22, i32 22>
+ %result = or <4 x i32> %and.i, %vshl_n
+ store <4 x i32> %result, <4 x i32>* %dest, align 16
+ ret void
+}
+
+define void @testLeftGood2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sli.2d v0, v1, #48
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i64> %src1, <i64 281474976710655, i64 281474976710655>
+ %vshl_n = shl <2 x i64> %src2, <i64 48, i64 48>
+ %result = or <2 x i64> %and.i, %vshl_n
+ store <2 x i64> %result, <2 x i64>* %dest, align 16
+ ret void
+}
+
+define void @testLeftBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #10
+; CHECK-NEXT: movk x8, #1, lsl #48
+; CHECK-NEXT: dup.2d v2, x8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: shl.2d v1, v1, #48
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i64> %src1, <i64 281474976710666, i64 281474976710666>
+ %vshl_n = shl <2 x i64> %src2, <i64 48, i64 48>
+ %result = or <2 x i64> %and.i, %vshl_n
+ store <2 x i64> %result, <2 x i64>* %dest, align 16
+ ret void
+}
+
+define void @testRightGood2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind {
+; CHECK-LABEL: testRightGood2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sri.2d v0, v1, #48
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i64> %src1, <i64 18446744073709486080, i64 18446744073709486080>
+ %vshl_n = lshr <2 x i64> %src2, <i64 48, i64 48>
+ %result = or <2 x i64> %and.i, %vshl_n
+ store <2 x i64> %result, <2 x i64>* %dest, align 16
+ ret void
+}
+
+define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, <2 x i64>* %dest) nounwind {
+; CHECK-LABEL: testRightBad2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, #10
+; CHECK-NEXT: movk x8, #1, lsl #48
+; CHECK-NEXT: dup.2d v2, x8
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: ushr.2d v1, v1, #48
+; CHECK-NEXT: orr.16b v0, v0, v1
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <2 x i64> %src1, <i64 281474976710666, i64 281474976710666>
+ %vshl_n = lshr <2 x i64> %src2, <i64 48, i64 48>
+ %result = or <2 x i64> %and.i, %vshl_n
+ store <2 x i64> %result, <2 x i64>* %dest, align 16
+ ret void
+}
+
+define void @testLeftShouldNotCreateSLI1x128(<1 x i128> %src1, <1 x i128> %src2, <1 x i128>* %dest) nounwind {
+; CHECK-LABEL: testLeftShouldNotCreateSLI1x128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfi x1, x2, #6, #58
+; CHECK-NEXT: stp x0, x1, [x4]
+; CHECK-NEXT: ret
+ %and.i = and <1 x i128> %src1, <i128 1180591620717411303423>
+ %vshl_n = shl <1 x i128> %src2, <i128 70>
+ %result = or <1 x i128> %and.i, %vshl_n
+ store <1 x i128> %result, <1 x i128>* %dest, align 16
+ ret void
+}
+
+define void @testLeftNotAllConstantBuildVec8x8(<8 x i8> %src1, <8 x i8> %src2, <8 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftNotAllConstantBuildVec8x8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI29_0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI29_0]
+; CHECK-NEXT: shl.8b v1, v1, #3
+; CHECK-NEXT: and.8b v0, v0, v2
+; CHECK-NEXT: orr.8b v0, v0, v1
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %and.i = and <8 x i8> %src1, <i8 7, i8 7, i8 255, i8 7, i8 7, i8 7, i8 255, i8 7>
+ %vshl_n = shl <8 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <8 x i8> %and.i, %vshl_n
+ store <8 x i8> %result, <8 x i8>* %dest, align 8
+ ret void
+}