// Saturating doubling multiply-high by lane: expands to
// vqdmulh($p0, splat_lane($p1, $p2)) — lane $p2 of $p1 is splatted across a
// vector and multiplied with $p0.
def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
// Rounding variant of the above, expanding to vqrdmulh with a splatted lane.
def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
// Scalar-broadcast variant: vqrdmulh($p0, dup($p1)) — the scalar $p1 is
// duplicated into every lane before the multiply.
def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>;
-def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>;
-def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
-def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
-def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
+def OP_QRDMLAH_LN : Op<(call "vqrdmlah", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
+def OP_QRDMLSH_LN : Op<(call "vqrdmlsh", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
// Fused multiply-subtract by lane: implemented as vfma_lane with $p1 negated,
// i.e. $p0 + (-$p1) * lane($p2, $p3).
def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>;
// Same as OP_FMS_LN but the lane source $p2 is a 128-bit (Q) vector.
def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>;
def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
// Scalar-by-lane saturating doubling multiply-high, built via the
// ScalarMulOp helper around the vqdmulh intrinsic.
def OP_SCALAR_QDMULH_LN : ScalarMulOp<"vqdmulh">;
// Rounding variant, built via ScalarMulOp around vqrdmulh.
def OP_SCALAR_QRDMULH_LN : ScalarMulOp<"vqrdmulh">;
-def OP_SCALAR_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1,
- (call "vget_lane", $p2, $p3)))>;
-def OP_SCALAR_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1,
- (call "vget_lane", $p2, $p3)))>;
+def OP_SCALAR_QRDMLAH_LN : Op<(call "vqrdmlah", $p0, $p1,
+ (call "vget_lane", $p2, $p3))>;
+def OP_SCALAR_QRDMLSH_LN : Op<(call "vqrdmlsh", $p0, $p1,
+ (call "vget_lane", $p2, $p3))>;
def OP_SCALAR_HALF_GET_LN : Op<(bitcast "float16_t",
(call "vget_lane",
def VQRDMULH : SInst<"vqrdmulh", "...", "siQsQi">;
let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
-def VQRDMLAH : SOpInst<"vqrdmlah", "....", "siQsQi", OP_QRDMLAH>;
-def VQRDMLSH : SOpInst<"vqrdmlsh", "....", "siQsQi", OP_QRDMLSH>;
+def VQRDMLAH : SInst<"vqrdmlah", "....", "siQsQi">;
+def VQRDMLSH : SInst<"vqrdmlsh", "....", "siQsQi">;
}
def VQDMLAL : SInst<"vqdmlal", "(>Q)(>Q)..", "si">;
let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
////////////////////////////////////////////////////////////////////////////////
// Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
-def SCALAR_SQRDMLAH : SOpInst<"vqrdmlah", "1111", "SsSi", OP_QRDMLAH>;
+def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">;
////////////////////////////////////////////////////////////////////////////////
// Signed Saturating Rounding Doubling Multiply Subtract Returning High Half
-def SCALAR_SQRDMLSH : SOpInst<"vqrdmlsh", "1111", "SsSi", OP_QRDMLSH>;
+def SCALAR_SQRDMLSH : SInst<"vqrdmlsh", "1111", "SsSi">;
}
////////////////////////////////////////////////////////////////////////////////
NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
+ NEONMAP1(vqrdmlah_v, arm_neon_vqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlahq_v, arm_neon_vqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlsh_v, arm_neon_vqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmlshq_v, arm_neon_vqrdmlsh, Add1ArgType),
NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmlah_v, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlahq_v, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlsh_v, aarch64_neon_sqrdmlsh, Add1ArgType),
+ NEONMAP1(vqrdmlshq_v, aarch64_neon_sqrdmlsh, Add1ArgType),
NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
+ NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
+ NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
+ NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
-// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vqrdmlah_laneq_s16(a, b, v, 7);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vqrdmlah_laneq_s32(a, b, v, 3);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vqrdmlahq_laneq_s16(a, b, v, 7);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vqrdmlahq_laneq_s32(a, b, v, 3);
// CHECK-LABEL: @test_vqrdmlahh_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) {
return vqrdmlahh_s16(a, b, c);
// CHECK-LABEL: @test_vqrdmlahs_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQADDS_S32_I]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
return vqrdmlahs_s32(a, b, c);
// CHECK-LABEL: @test_vqrdmlahh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
return vqrdmlahh_lane_s16(a, b, c, 3);
// CHECK-LABEL: @test_vqrdmlahs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
-// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQADDS_S32_I]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
return vqrdmlahs_lane_s32(a, b, c, 1);
// CHECK-LABEL: @test_vqrdmlahh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
return vqrdmlahh_laneq_s16(a, b, c, 7);
// CHECK-LABEL: @test_vqrdmlahs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
-// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQADDS_S32_I]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
return vqrdmlahs_laneq_s32(a, b, c, 3);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
return vqrdmlsh_laneq_s16(a, b, v, 7);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
return vqrdmlsh_laneq_s32(a, b, v, 3);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
return vqrdmlshq_laneq_s16(a, b, v, 7);
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
return vqrdmlshq_laneq_s32(a, b, v, 3);
// CHECK-LABEL: @test_vqrdmlshh_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) {
return vqrdmlshh_s16(a, b, c);
// CHECK-LABEL: @test_vqrdmlshs_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VQRDMULHS_S32_I_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
return vqrdmlshs_s32(a, b, c);
// CHECK-LABEL: @test_vqrdmlshh_lane_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
return vqrdmlshh_lane_s16(a, b, c, 3);
// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
-// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
return vqrdmlshs_lane_s32(a, b, c, 1);
// CHECK-LABEL: @test_vqrdmlshh_laneq_s16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
-// CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i64 0
-// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]]) #[[ATTR4]]
-// CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
-// CHECK-NEXT: ret i16 [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP3]]
//
int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
return vqrdmlshh_laneq_s16(a, b, c, 7);
// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
-// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[VQRDMULHS_S32_I]]) #[[ATTR4]]
-// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
return vqrdmlshs_laneq_s32(a, b, c, 3);
// CHECK-ARM-LABEL: @test_vqrdmlah_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4:[0-9]+]]
-// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
-// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VQRDMULH_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR4]]
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQRDMULHQ_V2_I]]) #[[ATTR3]]
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
[LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
[IntrNoMem]>;
+ class AdvSIMD_3IntArg_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
class AdvSIMD_3VectorArg_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
+ def int_aarch64_neon_sqrdmlah : AdvSIMD_3IntArg_Intrinsic;
+ def int_aarch64_neon_sqrdmlsh : AdvSIMD_3IntArg_Intrinsic;
+
// Vector Polynominal Multiply
def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic;
def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic;
+def int_arm_neon_vqrdmlah : Neon_3Arg_Intrinsic;
+def int_arm_neon_vqrdmlsh : Neon_3Arg_Intrinsic;
+
// Armv8.2-A dot product instructions
class Neon_Dot_Intrinsic
: Intrinsic<[llvm_anyvector_ty],
pattern> {
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
- SDPatternOperator Accum> {
+ SDPatternOperator op> {
def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
[(set (v4i16 V64:$dst),
- (Accum (v4i16 V64:$Rd),
- (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
- (v4i16 V64:$Rm)))))]>;
+ (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
- (Accum (v8i16 V128:$Rd),
- (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
- (v8i16 V128:$Rm)))))]>;
+ (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
[(set (v2i32 V64:$dst),
- (Accum (v2i32 V64:$Rd),
- (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
- (v2i32 V64:$Rm)))))]>;
+ (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
- (v4i32 V128:$Rm)))))]>;
+ (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
- SDPatternOperator Accum> {
+ SDPatternOperator op> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
- (Accum (v4i16 V64:$Rd),
- (v4i16 (int_aarch64_neon_sqrdmulh
- (v4i16 V64:$Rn),
- (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))))]> {
+ (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
- (Accum (v8i16 V128:$Rd),
- (v8i16 (int_aarch64_neon_sqrdmulh
- (v8i16 V128:$Rn),
- (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))))]> {
+ (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
- (Accum (v2i32 V64:$Rd),
- (v2i32 (int_aarch64_neon_sqrdmulh
- (v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
- // an intermediate EXTRACT_SUBREG would be untyped.
- // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
- // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (insert_subvector
- (undef),
- (v2i32 (int_aarch64_neon_sqrdmulh
- (v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32
- (v4i32 V128:$Rm),
- VectorIndexS:$idx)))),
- (i64 0))),
- (i64 0))))),
- (EXTRACT_SUBREG
- (v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
- (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
- V64:$Rn,
- V128:$Rm,
- VectorIndexS:$idx)),
- ssub)>;
-
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqrdmulh
- (v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
- // an intermediate EXTRACT_SUBREG would be untyped.
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (int_aarch64_neon_sqrdmulh
- (v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32
- (v4i32 V128:$Rm),
- VectorIndexS:$idx)))),
- (i64 0))))),
- (EXTRACT_SUBREG
- (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
- (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
- V128:$Rn,
- V128:$Rm,
- VectorIndexS:$idx)),
- ssub)>;
-
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
VectorIndexH, asm, ".h", "", "", ".h",
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
- (Accum (i32 FPR32Op:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh
- (i32 FPR32Op:$Rn),
- (i32 (vector_extract (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
- int_aarch64_neon_sqadd>;
+ int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
- int_aarch64_neon_sqsub>;
+ int_aarch64_neon_sqrdmlsh>;
// Extra saturate patterns, other than the intrinsics matches above
defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
- def : Pat<(i32 (int_aarch64_neon_sqadd
- (i32 FPR32:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+ def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))),
(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(i32 (int_aarch64_neon_sqsub
- (i32 FPR32:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+ def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))),
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
- int_aarch64_neon_sqadd>;
+ int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
- int_aarch64_neon_sqsub>;
+ int_aarch64_neon_sqrdmlsh>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (saddsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
- (v4i16 DPR:$Vm))))),
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))),
(v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (saddsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
- (v2i32 DPR:$Vm))))),
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))),
(v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (saddsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
- (v8i16 QPR:$Vm))))),
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))),
(v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (saddsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
- (v4i32 QPR:$Vm))))),
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))),
(v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (saddsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1),
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
- def : Pat<(v2i32 (saddsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1),
(v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (saddsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v8i16 (ARMvduplane (v8i16 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (saddsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v4i32 (ARMvduplane (v4i32 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG
defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (ssubsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
- (v4i16 DPR:$Vm))))),
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))),
(v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (ssubsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
- (v2i32 DPR:$Vm))))),
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))),
(v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (ssubsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
- (v8i16 QPR:$Vm))))),
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))),
(v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (ssubsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
- (v4i32 QPR:$Vm))))),
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))),
(v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (ssubsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1),
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
- def : Pat<(v2i32 (ssubsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1),
(v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (ssubsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v8i16 (ARMvduplane (v8i16 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (ssubsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v4i32 (ARMvduplane (v4i32 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmlah.i32(i32, i32, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32, i32, i32)
+
+; The sqadd/sqsub intrinsics in this file previously transformed into sqrdmlah/
+; sqrdmlsh where they shouldn't. They should produce sqrdmulh plus sqadd/sqsub.
+
;-----------------------------------------------------------------------------
; RDMA Vector
; test for SIMDThreeSameVectorSQRDMLxHTiedHS
define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.2s
+; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.2s
+; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-LABEL: test_sqrdmlah_lane_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.h[2]
+; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-NEXT: sqadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-LABEL: test_sqrdmlah_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-NEXT: sqadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.h[2]
+; CHECK-NEXT: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-NEXT: sqsub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[1]
+; CHECK-NEXT: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.s[0]
+; CHECK-NEXT: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-NEXT: sqsub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: sqrdmlah v2.4h, v0.4h, v1.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlah v2.8h, v0.8h, v1.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: sqrdmlah v2.2s, v0.2s, v1.s[0]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlah v2.4s, v0.4s, v1.s[0]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: sqrdmlsh v2.4h, v0.4h, v1.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh v2.8h, v0.8h, v1.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: sqrdmlsh v2.2s, v0.2s, v1.s[0]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh v2.4s, v0.4s, v1.s[0]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlah v2.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
%y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlah v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
%y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh v2.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
%y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: fmov s1, w2
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
%y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
; CHECK-LABEL: test_sqrdmlah_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w2
+; CHECK-NEXT: sqrdmulh s0, s0, s1
; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqrdmlah s1, s0, s2
-; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-LABEL: test_sqrdmlsh_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w2
+; CHECK-NEXT: sqrdmulh s0, s0, s1
; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w2
-; CHECK-NEXT: sqrdmlsh s1, s0, s2
-; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-LABEL: test_sqrdmlah_extract_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.4h, v1.4h, v0.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
%x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
; CHECK-LABEL: test_sqrdmlah_extract_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlah s2, s1, v0.s[3]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%extract = extractelement <4 x i32> %rhs, i32 3
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh v2.8h, v1.8h, v0.h[1]
-; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: sqrdmulh v0.8h, v1.8h, v0.h[1]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
%x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w1
-; CHECK-NEXT: fmov s2, w0
-; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[3]
-; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: sqrdmulh s0, s1, v0.s[3]
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%extract = extractelement <4 x i32> %rhs, i32 3
%prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
%retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
ret i32 %retval
}
+
+
+;-----------------------------------------------------------------------------
+; Using the sqrdmlah/sqrdmlsh intrinsics directly
+
+define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmlah_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
+ ret <4 x i16> %vqrdmlah_v3.i
+}
+
+define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmlah_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+ %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
+ ret <2 x i32> %vqrdmlah_v3.i
+}
+
+define <8 x i16> @test_vqrdmlahq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmlahq_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
+ ret <8 x i16> %vqrdmlahq_v3.i
+}
+
+define <4 x i32> @test_vqrdmlahq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmlahq_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlah v0.4s, v1.4s, v2.s[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
+ ret <4 x i32> %vqrdmlahq_v3.i
+}
+
+define i16 @test_vqrdmlahh_s16(i16 %a, i16 %b, i16 %c) {
+; CHECK-LABEL: test_vqrdmlahh_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqrdmlah v1.4h, v0.4h, v2.4h
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = insertelement <4 x i16> undef, i16 %c, i64 0
+ %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_vqrdmlahs_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqrdmlah s1, s0, s2
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: ret
+entry:
+ %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4
+ ret i32 %vqrdmlahs_s32.i
+}
+
+define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlahh_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[3]
+; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+ %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlah s2, s1, v0.s[1]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+entry:
+ %vget_lane = extractelement <2 x i32> %c, i64 1
+ %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
+ ret i32 %vqrdmlahs_s32.i
+}
+
+define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: sqrdmlah v2.4h, v1.4h, v0.h[7]
+; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
+ %vqrdmlahh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlahh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlahs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlahs_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: sqrdmlah s2, s1, v0.s[3]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+entry:
+ %vgetq_lane = extractelement <4 x i32> %c, i64 3
+ %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
+ ret i32 %vqrdmlahs_s32.i
+}
+
+define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
+ ret <4 x i16> %vqrdmlsh_v3.i
+}
+
+define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+ %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
+ ret <2 x i32> %vqrdmlsh_v3.i
+}
+
+define <8 x i16> @test_vqrdmlshq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmlshq_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #4
+ ret <8 x i16> %vqrdmlshq_v3.i
+}
+
+define <4 x i32> @test_vqrdmlshq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmlshq_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmlsh v0.4s, v1.4s, v2.s[3]
+; CHECK-NEXT: ret
+entry:
+ %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #4
+ ret <4 x i32> %vqrdmlshq_v3.i
+}
+
+define i16 @test_vqrdmlshh_s16(i16 %a, i16 %b, i16 %c) {
+; CHECK-LABEL: test_vqrdmlshh_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqrdmlsh v1.4h, v0.4h, v2.4h
+; CHECK-NEXT: umov w0, v1.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = insertelement <4 x i16> undef, i16 %c, i64 0
+ %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: test_vqrdmlshs_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqrdmlsh s1, s0, s2
+; CHECK-NEXT: fmov w0, s1
+; CHECK-NEXT: ret
+entry:
+ %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4
+ ret i32 %vqrdmlshs_s32.i
+}
+
+define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlshh_lane_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlsh v2.4h, v1.4h, v0.h[3]
+; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+ %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[1]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+entry:
+ %vget_lane = extractelement <2 x i32> %c, i64 1
+ %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
+ ret i32 %vqrdmlshs_s32.i
+}
+
+define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: sqrdmlsh v2.4h, v1.4h, v0.h[7]
+; CHECK-NEXT: umov w0, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+ %0 = insertelement <4 x i16> undef, i16 %a, i64 0
+ %1 = insertelement <4 x i16> undef, i16 %b, i64 0
+ %2 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 undef, i32 undef, i32 undef>
+ %vqrdmlshh_s16.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %0, <4 x i16> %1, <4 x i16> %2) #4
+ %3 = extractelement <4 x i16> %vqrdmlshh_s16.i, i64 0
+ ret i16 %3
+}
+
+define i32 @test_vqrdmlshs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlshs_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s1, w1
+; CHECK-NEXT: fmov s2, w0
+; CHECK-NEXT: sqrdmlsh s2, s1, v0.s[3]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+entry:
+ %vgetq_lane = extractelement <4 x i32> %c, i64 3
+ %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
+ ret i32 %vqrdmlshs_s32.i
+}
declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+declare <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+
+; The sadd.sat/ssub.sat intrinsics in this file previously transformed into vqrdmlah/
+; vqrdmlsh where they shouldn't. They should produce vqrdmulh plus vqadd/vqsub.
+
define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmulah_v4i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2
+; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2
+; CHECK-NEXT: vqadd.s16 d0, d0, d16
; CHECK-NEXT: bx lr
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmulah_v8i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlah.s16 q0, q1, q2
+; CHECK-NEXT: vqrdmulh.s16 q8, q1, q2
+; CHECK-NEXT: vqadd.s16 q0, q0, q8
; CHECK-NEXT: bx lr
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmulah_v2i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2
+; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2
+; CHECK-NEXT: vqadd.s32 d0, d0, d16
; CHECK-NEXT: bx lr
%prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmulah_v4i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlah.s32 q0, q1, q2
+; CHECK-NEXT: vqrdmulh.s32 q8, q1, q2
+; CHECK-NEXT: vqadd.s32 q0, q0, q8
; CHECK-NEXT: bx lr
%prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmulsh_v4i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2
+; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2
+; CHECK-NEXT: vqsub.s16 d0, d0, d16
; CHECK-NEXT: bx lr
%prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
%retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod)
define arm_aapcs_vfpcc <8 x i16> @test_vqrdmulsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_vqrdmulsh_v8i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlsh.s16 q0, q1, q2
+; CHECK-NEXT: vqrdmulh.s16 q8, q1, q2
+; CHECK-NEXT: vqsub.s16 q0, q0, q8
; CHECK-NEXT: bx lr
%prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
%retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod)
define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmulsh_v2i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2
+; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2
+; CHECK-NEXT: vqsub.s32 d0, d0, d16
; CHECK-NEXT: bx lr
%prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
%retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod)
define arm_aapcs_vfpcc <4 x i32> @test_vqrdmulsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_vqrdmulsh_v4i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vqrdmlsh.s32 q0, q1, q2
+; CHECK-NEXT: vqrdmulh.s32 q8, q1, q2
+; CHECK-NEXT: vqsub.s32 q0, q0, q8
; CHECK-NEXT: bx lr
%prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
%retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulah_lane_s16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2[3]
+; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2[3]
+; CHECK-NEXT: vqadd.s16 d0, d0, d16
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
; CHECK-LABEL: test_vqrdmulahq_lane_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
-; CHECK-NEXT: vqrdmlah.s16 q0, q1, d4[2]
+; CHECK-NEXT: vqrdmulh.s16 q8, q1, d4[2]
+; CHECK-NEXT: vqadd.s16 q0, q0, q8
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulah_lane_s32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2[1]
+; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2[1]
+; CHECK-NEXT: vqadd.s32 d0, d0, d16
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; CHECK-LABEL: test_vqrdmulahq_lane_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
-; CHECK-NEXT: vqrdmlah.s32 q0, q1, d4[0]
+; CHECK-NEXT: vqrdmulh.s32 q8, q1, d4[0]
+; CHECK-NEXT: vqadd.s32 q0, q0, q8
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
define arm_aapcs_vfpcc <4 x i16> @test_vqrdmulsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_vqrdmulsh_lane_s16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2[3]
+; CHECK-NEXT: vqrdmulh.s16 d16, d1, d2[3]
+; CHECK-NEXT: vqsub.s16 d0, d0, d16
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
; CHECK-LABEL: test_vqrdmulshq_lane_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
-; CHECK-NEXT: vqrdmlsh.s16 q0, q1, d4[2]
+; CHECK-NEXT: vqrdmulh.s16 q8, q1, d4[2]
+; CHECK-NEXT: vqsub.s16 q0, q0, q8
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
define arm_aapcs_vfpcc <2 x i32> @test_vqrdmulsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_vqrdmulsh_lane_s32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2[1]
+; CHECK-NEXT: vqrdmulh.s32 d16, d1, d2[1]
+; CHECK-NEXT: vqsub.s32 d0, d0, d16
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
; CHECK-LABEL: test_vqrdmulshq_lane_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
-; CHECK-NEXT: vqrdmlsh.s32 q0, q1, d4[0]
+; CHECK-NEXT: vqrdmulh.s32 q8, q1, d4[0]
+; CHECK-NEXT: vqsub.s32 q0, q0, q8
; CHECK-NEXT: bx lr
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
%retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod)
ret <4 x i32> %retval
}
+
+
+
+; The target-specific @llvm.arm.neon.vqrdmlah intrinsic selects directly to the
+; fused vqrdmlah.s16 (64-bit, d-register) instruction.
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlah_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlah_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlah_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #3
+  ret <4 x i16> %vqrdmlah_v3.i
+}
+
+; Same as above for the 2 x i32 (d-register) variant.
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlah_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlah_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlah_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #3
+  ret <2 x i32> %vqrdmlah_v3.i
+}
+
+; 128-bit (q-register) vqrdmlah.s16 from the v8i16 intrinsic.
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlahq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #3
+  ret <8 x i16> %vqrdmlahq_v3.i
+}
+
+; 128-bit (q-register) vqrdmlah.s32 from the v4i32 intrinsic.
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlahq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #3
+  ret <4 x i32> %vqrdmlahq_v3.i
+}
+
+; A lane-splat shufflevector feeding the intrinsic folds into the scalar
+; (by-lane) form: vqrdmlah.s16 d0, d1, d2[3].
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlah_lane_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s16 d0, d1, d2[3]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqrdmlah_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #3
+  ret <4 x i16> %vqrdmlah_v3.i
+}
+
+; Lane-splat fold for the 2 x i32 variant: vqrdmlah.s32 d0, d1, d2[1].
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlah_lane_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlah.s32 d0, d1, d2[1]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %vqrdmlah_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #3
+  ret <2 x i32> %vqrdmlah_v3.i
+}
+
+; q-register by-lane form: the <4 x i16> lane source arrives in d4, widened by
+; the register allocator (the "kill" line), giving vqrdmlah.s16 q0, q1, d4[3].
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlahq_lane_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vqrdmlah.s16 q0, q1, d4[3]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %vqrdmlahq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #3
+  ret <8 x i16> %vqrdmlahq_v3.i
+}
+
+; q-register by-lane form for i32: vqrdmlah.s32 q0, q1, d4[1].
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlahq_lane_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vqrdmlah.s32 q0, q1, d4[1]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vqrdmlahq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #3
+  ret <4 x i32> %vqrdmlahq_v3.i
+}
+
+; Subtracting counterpart: @llvm.arm.neon.vqrdmlsh selects the fused
+; vqrdmlsh.s16 (d-register) instruction.
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlsh_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlsh_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #3
+  ret <4 x i16> %vqrdmlsh_v3.i
+}
+
+; vqrdmlsh.s32, 2 x i32 (d-register) variant.
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlsh_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlsh_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #3
+  ret <2 x i32> %vqrdmlsh_v3.i
+}
+
+; 128-bit (q-register) vqrdmlsh.s16 from the v8i16 intrinsic.
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlshq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlshq_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s16 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #3
+  ret <8 x i16> %vqrdmlshq_v3.i
+}
+
+; 128-bit (q-register) vqrdmlsh.s32 from the v4i32 intrinsic.
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlshq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlshq_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s32 q0, q1, q2
+; CHECK-NEXT: bx lr
+entry:
+  %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #3
+  ret <4 x i32> %vqrdmlshq_v3.i
+}
+
+; Lane-splat fold into the by-lane form: vqrdmlsh.s16 d0, d1, d2[3].
+define arm_aapcs_vfpcc <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlsh_lane_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s16 d0, d1, d2[3]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #3
+  ret <4 x i16> %vqrdmlsh_v3.i
+}
+
+; Lane-splat fold for the 2 x i32 variant: vqrdmlsh.s32 d0, d1, d2[1].
+define arm_aapcs_vfpcc <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlsh_lane_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vqrdmlsh.s32 d0, d1, d2[1]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+  %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #3
+  ret <2 x i32> %vqrdmlsh_v3.i
+}
+
+; q-register by-lane form: lane source in d4, selected as
+; vqrdmlsh.s16 q0, q1, d4[3].
+define arm_aapcs_vfpcc <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) {
+; CHECK-LABEL: test_vqrdmlshq_lane_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vqrdmlsh.s16 q0, q1, d4[3]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %vqrdmlshq_v3.i = tail call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %lane) #3
+  ret <8 x i16> %vqrdmlshq_v3.i
+}
+
+; q-register by-lane form for i32: vqrdmlsh.s32 q0, q1, d4[1].
+define arm_aapcs_vfpcc <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: test_vqrdmlshq_lane_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $d4 killed $d4 def $q2
+; CHECK-NEXT: vqrdmlsh.s32 q0, q1, d4[1]
+; CHECK-NEXT: bx lr
+entry:
+  %lane = shufflevector <2 x i32> %c, <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vqrdmlshq_v3.i = tail call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %lane) #3
+  ret <4 x i32> %vqrdmlshq_v3.i
+}