let Inst{20} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
- // intermediate EXTRACT_SUBREG would be untyped.
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract (v4i32
- (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
- (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx)))),
- (i64 0))))),
- (EXTRACT_SUBREG
- (!cast<Instruction>(NAME # v4i16_indexed)
- (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
- V128_lo:$Rm, VectorIndexH:$idx),
- ssub)>;
-
def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
V128, V128,
V128_lo, VectorIndexH,
let Inst{20} = idx{0};
}
+ // Select the scalar v1i32_indexed form when Accum combines $Rd with lane 0
+ // of a v4i32 sqdmull of two plain 64-bit vector operands.
+ // Only lane 0 of the product is consumed, so $Rn is narrowed to its hsub
+ // sub-register, and $Rm is placed in the dsub part of an otherwise undefined
+ // wider register so it can be used as the indexed operand with index 0.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 V64:$Rm))),
+ (i64 0))))),
+ (!cast<Instruction>(NAME # v1i32_indexed)
+ FPR32Op:$Rd,
+ (EXTRACT_SUBREG V64:$Rn, hsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
+ (i64 0))>;
+
+ // Same lane-0 accumulate shape, but the second multiplicand is a
+ // duplicated lane (AArch64duplane16) of a 128-bit register. The source
+ // register and lane index are passed straight through to the
+ // v1i32_indexed instruction; $Rn is narrowed to its hsub sub-register.
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+ (i32 (vector_extract
+ (v4i32 (int_aarch64_neon_sqdmull
+ (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16
+ (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))),
+ (i64 0))))),
+ (!cast<Instruction>(NAME # v1i32_indexed)
+ FPR32Op:$Rd,
+ (EXTRACT_SUBREG V64:$Rn, hsub),
+ V128_lo:$Rm,
+ VectorIndexH:$idx)>;
def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
FPR64Op, FPR32Op, V128, VectorIndexS,
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlal.4s v2, v1, v0[1]
+; CHECK-NEXT: sqdmlal.h s2, h1, v0[1]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
; CHECK-NEXT: fmov s1, w1
; CHECK-NEXT: fmov s2, w0
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqdmlsl.4s v2, v1, v0[1]
+; CHECK-NEXT: sqdmlsl.h s2, h1, v0[1]
; CHECK-NEXT: fmov w0, s2
; CHECK-NEXT: ret
%lhs = insertelement <4 x i16> undef, i16 %B, i32 0
}
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+; Lane-1 case: sqadd consumes lane 1 (not lane 0) of the sqdmull result.
+; The expected output keeps the full-vector sqdmull.4s, moves lane 1 out
+; through a GPR, and finishes with a plain scalar sqadd instead of a fused
+; sqdmlal.
+define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: sqadd_lane1_sqdmull4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmull.4s v0, v0, v1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: mov.s w8, v0[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: sqadd s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+ %prod = extractelement <4 x i32> %prod.vec, i32 1
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+
+; Lane-1 case: sqsub consumes lane 1 (not lane 0) of the sqdmull result.
+; The expected output keeps the full-vector sqdmull.4s, moves lane 1 out
+; through a GPR, and finishes with a plain scalar sqsub instead of a fused
+; sqdmlsl.
+define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
+; CHECK-LABEL: sqsub_lane1_sqdmull4s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sqdmull.4s v0, v0, v1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: mov.s w8, v0[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: sqsub s0, s1, s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
+ %prod = extractelement <4 x i32> %prod.vec, i32 1
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_1d:
; CHECK: // %bb.0:
ret <1 x double> %prod
}
+; Scalar i16 x i16 -> i32 saturating doubling multiply-accumulate.
+; %A and %B are each inserted into lane 0 of an otherwise-undef <4 x i16>;
+; lane 0 of the sqdmull result is then added to %C with sqadd.
+; The expected output is a single scalar sqdmlal.h using element v0[0].
+define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlal_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqdmlal.h s2, h1, v0[0]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+ %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+ %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+ %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+ %tmp5 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %C, i32 %tmp4)
+ ret i32 %tmp5
+}
+
define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlal_d:
; CHECK: // %bb.0:
ret i64 %tmp5
}
+; Scalar i16 x i16 -> i32 saturating doubling multiply-subtract.
+; %A and %B are each inserted into lane 0 of an otherwise-undef <4 x i16>;
+; lane 0 of the sqdmull result is then subtracted from %C with sqsub.
+; The expected output is a single scalar sqdmlsl.h using element v0[0].
+define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
+; CHECK-LABEL: sqdmlsl_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: fmov s1, w0
+; CHECK-NEXT: fmov s2, w2
+; CHECK-NEXT: sqdmlsl.h s2, h1, v0[0]
+; CHECK-NEXT: fmov w0, s2
+; CHECK-NEXT: ret
+ %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
+ %tmp2 = insertelement <4 x i16> undef, i16 %B, i64 0
+ %tmp3 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = extractelement <4 x i32> %tmp3, i64 0
+ %tmp5 = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %C, i32 %tmp4)
+ ret i32 %tmp5
+}
+
define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
; CHECK-LABEL: sqdmlsl_d:
; CHECK: // %bb.0: