define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) {
; CHECK-LABEL: fast_float_mul:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB0_11
; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
; CHECK-NEXT: cmp r6, #3
; CHECK-NEXT: bhs .LBB0_6
; CHECK-NEXT: @ %bb.3:
-; CHECK-NEXT: mov r8, r7
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: letp lr, .LBB0_5
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new
-; CHECK-NEXT: subs r3, r3, r7
-; CHECK-NEXT: mov r8, r7
+; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r12, r3, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vstr s0, [r6, #12]
; CHECK-NEXT: le lr, .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r8, .LBB0_11
+; CHECK-NEXT: wls lr, r7, .LBB0_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
; CHECK-NEXT: mvn r3, #3
-; CHECK-NEXT: mov lr, r8
+; CHECK-NEXT: mov lr, r7
; CHECK-NEXT: add.w r3, r3, r12, lsl #2
; CHECK-NEXT: add r1, r3
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, #4
; CHECK-NEXT: le lr, .LBB0_10
; CHECK-NEXT: .LBB0_11: @ %for.cond.cleanup
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck
define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_half_mac:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cbz r2, .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI9_0
-; CHECK-NEXT: mov r5, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB9_6
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: vldr s0, .LCPI9_0
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new
-; CHECK-NEXT: subs r2, r2, r4
+; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI9_0
-; CHECK-NEXT: mov r5, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: sub.w r3, r0, #8
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r5, .LBB9_9
+; CHECK-NEXT: wls lr, r4, .LBB9_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: mvn r2, #1
-; CHECK-NEXT: mov lr, r5
+; CHECK-NEXT: mov lr, r4
; CHECK-NEXT: add.w r2, r2, r12, lsl #1
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI9_0:
define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_half_acc:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cbz r2, .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI10_0
-; CHECK-NEXT: mov r5, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB10_6
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: b .LBB10_9
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
-; CHECK-NEXT: subs r2, r2, r4
+; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI10_0
-; CHECK-NEXT: mov r5, r4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: sub.w r3, r0, #8
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r5, .LBB10_9
+; CHECK-NEXT: wls lr, r4, .LBB10_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: mvn r2, #1
-; CHECK-NEXT: mov lr, r5
+; CHECK-NEXT: mov lr, r4
; CHECK-NEXT: add.w r2, r2, r12, lsl #1
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI10_0:
define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_short_mac:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI11_0
-; CHECK-NEXT: mov r8, r7
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB11_6
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: b .LBB11_9
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
-; CHECK-NEXT: subs r2, r2, r7
+; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI11_0
-; CHECK-NEXT: mov r8, r7
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
-; CHECK-NEXT: wls lr, r8, .LBB11_9
+; CHECK-NEXT: wls lr, r7, .LBB11_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: mvn r3, #1
; CHECK-NEXT: add.w r2, r3, r12, lsl #1
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: add r1, r2
-; CHECK-NEXT: mov lr, r8
+; CHECK-NEXT: mov lr, r7
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r2, [r1, #2]!
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:
; X86-LABEL: t0_32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andl $15, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: andl $-16, %eax
; X86-NEXT: retl
;
; X64-LABEL: t0_32:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: andl $15, %ecx
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: andl $-16, %eax
; X64-NEXT: retq
%bias = and i32 %ptr, 15
%r = sub i32 %ptr, %bias
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andl $15, %ecx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: andl $-16, %eax
; X86-NEXT: retl
;
; X64-LABEL: t1_64:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: andl $15, %ecx
-; X64-NEXT: subq %rcx, %rax
+; X64-NEXT: andq $-16, %rax
; X64-NEXT: retq
%bias = and i64 %ptr, 15
%r = sub i64 %ptr, %bias
; X86-LABEL: t2_powerof2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andl $16, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: andl $-17, %eax
; X86-NEXT: retl
;
; X64-LABEL: t2_powerof2:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: andl $16, %ecx
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: andl $-17, %eax
; X64-NEXT: retq
%bias = and i32 %ptr, 16
%r = sub i32 %ptr, %bias
; X86-LABEL: t3_random_constant:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andl $42, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: andl $-43, %eax
; X86-NEXT: retl
;
; X64-LABEL: t3_random_constant:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: andl $42, %ecx
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: andl $-43, %eax
; X64-NEXT: retq
%bias = and i32 %ptr, 42
%r = sub i32 %ptr, %bias
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: movl %edx, (%ecx)
-; X86-NEXT: subl %edx, %eax
+; X86-NEXT: andl $-16, %eax
; X86-NEXT: retl
;
; X64-LABEL: t4_extrause:
; X64-NEXT: movl %edi, %ecx
; X64-NEXT: andl $15, %ecx
; X64-NEXT: movl %ecx, (%rsi)
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: andl $-16, %eax
; X64-NEXT: retq
%bias = and i32 %ptr, 15
store i32 %bias, i32* %bias_storage