--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+sve | FileCheck %s
+
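+; Check that a mul feeding an add or sub of fixed-length 64-bit element
+; vectors is combined into the predicated SVE mla/mls (multiply-accumulate)
+; form rather than lowered as a separate mul followed by an add/sub.
+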
+define <2 x i64> @test_mul_add_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: test_mul_add_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %mul = mul <2 x i64> %b, %c
+ %add = add <2 x i64> %a, %mul
+ ret <2 x i64> %add
+}
+
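+; The <1 x i64> case, with the add operands commuted (%mul + %a); mla
+; should match either operand order of the add.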
+define <1 x i64> @test_mul_add_1x64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) {
+; CHECK-LABEL: test_mul_add_1x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: ret
+ %mul = mul <1 x i64> %b, %c
+ %add = add <1 x i64> %mul, %a
+ ret <1 x i64> %add
+}
+
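+; sub %a, %mul is the accumulator-minus-product form, which maps directly
+; onto mls.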
+define <2 x i64> @test_mul_sub_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
+; CHECK-LABEL: test_mul_sub_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mls z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %mul = mul <2 x i64> %b, %c
+ %sub = sub <2 x i64> %a, %mul
+ ret <2 x i64> %sub
+}
+
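+; Here the product is the minuend (%mul - %div), which mls (acc - product)
+; cannot express, so a separate mul and sub are expected.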
+define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
+; CHECK-LABEL: test_mul_sub_2x64_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: sub v0.2d, v2.2d, v0.2d
+; CHECK-NEXT: ret
+ %div = sdiv <2 x i64> %a, %b
+ %mul = mul <2 x i64> %c, %d
+ %sub = sub <2 x i64> %mul, %div
+ ret <2 x i64> %sub
+}
+
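+; With the operands swapped back (%div - %mul), the sdiv result acts as the
+; accumulator and the multiply folds into mls.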
+define <2 x i64> @test_mul_sub_2x64_3(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
+; CHECK-LABEL: test_mul_sub_2x64_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
+; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %div = sdiv <2 x i64> %a, %b
+ %mul = mul <2 x i64> %c, %d
+ %sub = sub <2 x i64> %div, %mul
+ ret <2 x i64> %sub
+}
+
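+; As in test_mul_sub_2x64_2, the product is the minuend (%mul - %a), so a
+; plain mul followed by a sub is expected instead of mls.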
+define <1 x i64> @test_mul_sub_1x64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) {
+; CHECK-LABEL: test_mul_sub_1x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
+; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT: sub d0, d1, d0
+; CHECK-NEXT: ret
+ %mul = mul <1 x i64> %b, %c
+ %sub = sub <1 x i64> %mul, %a
+ ret <1 x i64> %sub
+}
;
; VBITS_GE_256-LABEL: srem_v16i32:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
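+; i64 srem has no direct instruction and is expanded to sdiv + mls,
+; reusing the dividend as the accumulator (op1 - quotient * op2).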
define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = srem <1 x i64> %op1, %op2
ret <1 x i64> %res
define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: srem_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = srem <2 x i64> %op1, %op2
ret <2 x i64> %res
define void @srem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: srem_v8i64:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
-; VBITS_GE_128-NEXT: ptrue p0.d, vl2
-; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT: movprfx z16, z3
-; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z5.d
-; VBITS_GE_128-NEXT: movprfx z17, z2
-; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z4.d
-; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z3.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
+; VBITS_GE_128-NEXT: movprfx z3, z0
+; VBITS_GE_128-NEXT: sdiv z3.d, p0/m, z3.d, z2.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
+; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z5
; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d
-; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
-; VBITS_GE_128-NEXT: movprfx z17, z0
-; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z7.d
-; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
-; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
-; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
-; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
-; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT: movprfx z2, z4
+; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z7.d
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
-; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: movprfx z0, z4
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
+; VBITS_GE_128-NEXT: movprfx z1, z5
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: srem_v8i64:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
;
; VBITS_GE_256-LABEL: urem_v16i32:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
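+; Likewise, i64 urem is expanded to udiv + mls (op1 - quotient * op2).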
define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: sub d0, d0, d1
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = urem <1 x i64> %op1, %op2
ret <1 x i64> %res
define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
; CHECK-LABEL: urem_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = urem <2 x i64> %op1, %op2
ret <2 x i64> %res
define void @urem_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_128-LABEL: urem_v8i64:
; VBITS_GE_128: // %bb.0:
-; VBITS_GE_128-NEXT: ldp q4, q5, [x1]
-; VBITS_GE_128-NEXT: ptrue p0.d, vl2
-; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT: movprfx z16, z3
-; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z5.d
-; VBITS_GE_128-NEXT: movprfx z17, z2
-; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z4.d
-; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
; VBITS_GE_128-NEXT: movprfx z16, z1
+; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z3.d
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
+; VBITS_GE_128-NEXT: movprfx z3, z0
+; VBITS_GE_128-NEXT: udiv z3.d, p0/m, z3.d, z2.d
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
+; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
+; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
+; VBITS_GE_128-NEXT: movprfx z16, z5
; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d
-; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
-; VBITS_GE_128-NEXT: movprfx z17, z0
-; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z7.d
-; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
-; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
-; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
-; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
-; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
+; VBITS_GE_128-NEXT: movprfx z2, z4
+; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z7.d
; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
-; VBITS_GE_128-NEXT: stp q2, q0, [x0]
+; VBITS_GE_128-NEXT: movprfx z0, z4
+; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
+; VBITS_GE_128-NEXT: movprfx z1, z5
+; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT: stp q0, q1, [x0]
; VBITS_GE_128-NEXT: ret
;
; VBITS_GE_256-LABEL: urem_v8i64:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]