}
}
+ // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
+ // world the [w|d]lstp instruction would be last instruction in the preheader
+ // and so it would only affect instructions within the loop body. But due to
+ // scheduling, and/or the logic in this pass (above), the insertion point can
+ // be moved earlier. So if the Loop Start isn't the last instruction in the
+ // preheader, and if the initial element count is smaller than the vector
+ // width, the Loop Start instruction will immediately generate one or more
+ // false lane mask which can, incorrectly, affect the proceeding MVE
+ // instructions in the preheader.
+ auto cannotInsertWDLSTPBetween = [](MachineInstr *Begin,
+ MachineInstr *End) {
+ auto I = MachineBasicBlock::iterator(Begin);
+ auto E = MachineBasicBlock::iterator(End);
+ for (; I != E; ++I)
+ if (shouldInspect(*I))
+ return true;
+ return false;
+ };
+
+ if (cannotInsertWDLSTPBetween(StartInsertPt, &InsertBB->back()))
+ return false;
+
// Especially in the case of while loops, InsertBB may not be the
// preheader, so we need to check that the register isn't redefined
// before entering the loop.
; CHECK: bb.1.vector.ph:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: liveins: $r0, $r1, $r2
+ ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg
; CHECK: renamable $q2 = MVE_VMOVimmi32 1, 0, $noreg, undef renamable $q2
+ ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
; CHECK: renamable $q3 = MVE_VMOVimmi32 4, 0, $noreg, undef renamable $q3
- ; CHECK: $lr = MVE_DLSTP_32 renamable $r2
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: dead $lr = t2DLS renamable $r3
+ ; CHECK: $r4 = tMOVr killed $r3, 14 /* CC::al */, $noreg
; CHECK: renamable $r3 = tLEApcrel %const.0, 14 /* CC::al */, $noreg
; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r3, 0, 0, $noreg :: (load 16 from constant-pool)
- ; CHECK: renamable $r3, dead $cpsr = tLSRri killed renamable $r2, 1, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tLSRri renamable $r2, 1, 14 /* CC::al */, $noreg
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
- ; CHECK: liveins: $lr, $q0, $q1, $q2, $q3, $r0, $r1
- ; CHECK: MVE_VPTv4u32 2, renamable $q1, renamable $q0, 8, implicit-def $vpr
+ ; CHECK: liveins: $q0, $q1, $q2, $q3, $r0, $r1, $r2, $r4
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+ ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r4, dead $cpsr = nsw tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
+ ; CHECK: MVE_VPST 1, implicit $vpr
+ ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q1, renamable $q0, 8, 1, killed renamable $vpr
; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, renamable $q2, 2, 1, killed renamable $vpr
; CHECK: renamable $r1, renamable $q4 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv35, align 4)
; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q4, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv12, align 4)
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, renamable $q3, 0, $noreg, undef renamable $q0
- ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
+ ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.for.cond.cleanup:
; CHECK: $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9
; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
- ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2
+ ; CHECK: $lr = t2DLS killed renamable $lr
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
- ; CHECK: liveins: $lr, $q1, $r0, $r1
- ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4)
- ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4)
- ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg
- ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
+ ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+ ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
+ ; CHECK: MVE_VPST 2, implicit $vpr
+ ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4)
+ ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4)
+ ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.middle.block:
; CHECK: liveins: $q1
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ ; CHECK: renamable $r3, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+ ; CHECK: renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
; CHECK: renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg
- ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2
+ ; CHECK: $lr = t2DLS killed renamable $lr
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1
; CHECK: bb.2.vector.body:
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
- ; CHECK: liveins: $lr, $q1, $r0, $r1
- ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4)
- ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4)
- ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg
- ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
+ ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+ ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
+ ; CHECK: MVE_VPST 2, implicit $vpr
+ ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4)
+ ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4)
+ ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
; CHECK: bb.3.middle.block:
; CHECK: liveins: $q1
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8
+ ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+ ; CHECK: tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr
+ ; CHECK: t2IT 10, 8, implicit-def $itstate
+ ; CHECK: renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate
+ ; CHECK: renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+ ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg
+ ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg
; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg
; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3
+ ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg
; CHECK: bb.1.do.body.i:
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
- ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r12
+ ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12
; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4)
; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
; CHECK: bb.2.arm_mean_f32_mve.exit:
; CHECK: successors: %bb.3(0x80000000)
- ; CHECK: liveins: $q0, $r0, $r1, $r2
+ ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4
; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg
- ; CHECK: $lr = MVE_DLSTP_32 $r1
+ ; CHECK: $lr = t2DLS killed $r4
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0
; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg
; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg
; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1
- ; CHECK: dead $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
+ ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg
; CHECK: bb.3.do.body:
; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000)
- ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2
- ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4)
- ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2
- ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg
- ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.3
+ ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3
+ ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
+ ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
+ ; CHECK: MVE_VPST 2, implicit $vpr
+ ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.01, align 4)
+ ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2
+ ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 1, killed renamable $vpr
+ ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.3
; CHECK: bb.4.do.end:
; CHECK: liveins: $q0, $r1, $r2
; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq.w .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
+; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: adr r7, .LCPI3_5
+; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: vmov.i32 q0, #0x8000
+; CHECK-NEXT: sub.w r12, r3, #4
+; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: adr r6, .LCPI3_4
; CHECK-NEXT: adr r5, .LCPI3_3
+; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: adr r4, .LCPI3_2
-; CHECK-NEXT: dlstp.32 lr, r2
+; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: adr.w r8, .LCPI3_1
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload
-; CHECK-NEXT: vldrb.u32 q4, [r0, q0]
+; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrbt.u32 q4, [r0, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload
-; CHECK-NEXT: vldrb.u32 q7, [r0, q0]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrbt.u32 q7, [r0, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.i32 q6, q7, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vldrb.u32 q1, [r0, q5]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrbt.u32 q1, [r0, q5]
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmul.i32 q3, q4, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload
; CHECK-NEXT: vshr.u32 q1, q1, #16
-; CHECK-NEXT: vstrb.32 q1, [r1, q0]
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrbt.32 q1, [r1, q0]
; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload
-; CHECK-NEXT: vstrb.32 q2, [r1, q0]
-; CHECK-NEXT: vstrb.32 q6, [r1, q5]
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vstrbt.32 q2, [r1, q0]
+; CHECK-NEXT: vstrbt.32 q6, [r1, q5]
; CHECK-NEXT: adds r1, #12
-; CHECK-NEXT: letp lr, .LBB3_2
+; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #216
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}