; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, <8 x i8>* %A
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlal_v8i8_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: mla v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlal_v4i16_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlal_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: amlsl_v8i8_v8i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, <8 x i16>* %A
define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: amlsl_v4i16_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: mls v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, <4 x i32>* %A
%tmp2 = load <4 x i16>, <4 x i16>* %B
define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: amlsl_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: ldr d1, [x2]
-; CHECK-NEXT: ldr q2, [x0]
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x2]
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v8i8_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: movi v1.8h, #12
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: movi v1.8b, #12
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: bic v0.8h, #255, lsl #8
; CHECK-NEXT: ret
%tmp3 = zext <8 x i8> %arg to <8 x i16>
; CHECK-LABEL: amull_extvec_v4i16_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v1.4h, w8
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: amull_extvec_v2i32_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x10, x8
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mov v0.d[1], x8
+; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; CHECK-LABEL: amull2_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: mul v1.8h, v2.8h, v3.8h
+; CHECK-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEXT: bic v2.8h, #255, lsl #8
; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
%arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; CHECK-LABEL: amull2_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v3.4s, v1.8h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mul v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: umull v2.4s, v0.4h, v1.4h
+; CHECK-NEXT: umull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.2d, v0.4s, #0
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll2 v3.2d, v1.4s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d1
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: fmov x13, d3
-; CHECK-NEXT: fmov x14, d2
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mov x11, v3.d[1]
-; CHECK-NEXT: mov x12, v2.d[1]
-; CHECK-NEXT: mul x13, x14, x13
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: mul x9, x12, x11
-; CHECK-NEXT: fmov d1, x13
-; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: mov v1.d[1], x9
-; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: umull v2.2d, v0.2s, v1.2s
+; CHECK-NEXT: umull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
+; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
+; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: mla v2.4s, v1.4s, v0.4s
+; CHECK-NEXT: umlal v2.4s, v1.4h, v0.4h
; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
+; CHECK-NEXT: umlal v1.4s, v0.4h, v2.4h
+; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <4 x i16> %vec0 to <4 x i32>
define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: mul x10, x11, x10
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: fmov d1, x10
-; CHECK-NEXT: ushll v0.2d, v2.2s, #0
-; CHECK-NEXT: mov v1.d[1], x8
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: umlal v2.2d, v1.2s, v0.2s
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <2 x i32> %vec0 to <2 x i64>
define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_and:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: fmov x9, d2
-; CHECK-NEXT: fmov x11, d0
-; CHECK-NEXT: mov x8, v2.d[1]
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: mul x9, x11, x9
-; CHECK-NEXT: mul x8, x10, x8
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x8
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: umlal v1.2d, v0.2s, v2.2s
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%v0 = sext <2 x i32> %vec0 to <2 x i64>