From: David Green Date: Tue, 8 Jun 2021 19:51:33 +0000 (+0100) Subject: [ARM] Generate VDUP(Const) from constant buildvectors X-Git-Tag: llvmorg-14-init~4523 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d7853bae941006cece63013f09d524e72bbbec45;p=platform%2Fupstream%2Fllvm.git [ARM] Generate VDUP(Const) from constant buildvectors If we cannot otherwise use a VMOVimm/VMOVFPimm/VMVNimm, fall back to producing a VDUP(const) as opposed to a constant pool load. This will at least be smaller codesize and can allow the VDUP to be folded into other instructions. Differential Revision: https://reviews.llvm.org/D103808 --- diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e6e495d..c2376e9 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -7635,6 +7635,18 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } + + // If we are under MVE, generate a VDUP(constant), bitcast to the original + // type. + if (ST->hasMVEIntegerOps() && + (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) { + EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 + : SplatBitSize == 16 ? MVT::v8i16 + : MVT::v16i8; + SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); + SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); + } } } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll index 01da3ce..c5d63a2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/predicated-liveout-unknown-lanes.ll @@ -7,8 +7,9 @@ define arm_aapcs_vfpcc <4 x float> @arm_max_no_idx_f32_mve(float* %pSrc, i32 %bl ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: subs r2, r1, #4 -; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: movw r3, #0 +; CHECK-NEXT: movt r3, #65408 +; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -17,13 +18,6 @@ define arm_aapcs_vfpcc <4 x float> @arm_max_no_idx_f32_mve(float* %pSrc, i32 %bl ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0xff800000 @ float -Inf -; CHECK-NEXT: .long 0xff800000 @ float -Inf -; CHECK-NEXT: .long 0xff800000 @ float -Inf -; CHECK-NEXT: .long 0xff800000 @ float -Inf entry: br label %do.body diff --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll index c6e4dd5..a02f6b8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll @@ -7,24 +7,17 @@ define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, 
q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #8 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1], #8 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -52,28 +45,21 @@ define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: adr r2, .LCPI1_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q1, [r0], #32 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1, #8] +; CHECK-NEXT: vldrw.u32 q0, [r0], #32 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -101,36 +87,29 @@ define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: adr r2, .LCPI2_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #24] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1, #8] -; CHECK-NEXT: vldrw.u32 q1, [r0], #64 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #32 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1, #24] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1, #8] +; CHECK-NEXT: vldrw.u32 q0, [r0], #64 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 
q0, [r1], #32 ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -158,24 +137,17 @@ define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: adr r2, .LCPI3_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0], #8 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vldrh.u32 q0, [r0], #8 +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -203,28 +175,21 @@ define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: adr r2, .LCPI4_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vldrh.u32 q0, [r0], #16 +; CHECK-NEXT: vldrh.u32 q1, [r0, #-8] +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q0, q0, r2 ; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vldrh.u32 q1, [r0], #16 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #32 +; CHECK-NEXT: vstrw.32 q0, [r1], #32 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI4_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -252,36 +217,29 @@ define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: adr r2, .LCPI5_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0, #24] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] -; CHECK-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: 
vldrh.u32 q1, [r0, #8] +; CHECK-NEXT: vldrh.u32 q0, [r0], #32 +; CHECK-NEXT: vldrh.u32 q1, [r0, #-24] +; CHECK-NEXT: vldrh.u32 q2, [r0, #-16] +; CHECK-NEXT: vldrh.u32 q3, [r0, #-8] +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 ; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vcvtb.f32.f16 q2, q2 +; CHECK-NEXT: vcvtb.f32.f16 q3, q3 +; CHECK-NEXT: vmul.f32 q2, q2, r2 +; CHECK-NEXT: vmul.f32 q3, q3, r2 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vldrh.u32 q1, [r0], #32 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vstrw.32 q1, [r1], #64 +; CHECK-NEXT: vstrw.32 q0, [r1], #64 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -309,25 +267,18 @@ define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: adr r2, .LCPI6_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q1, [r0], #8 -; CHECK-NEXT: vcvtb.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vstrh.32 q1, [r1], #8 +; CHECK-NEXT: vldrh.u32 q0, [r0], #8 +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q0, q0 +; CHECK-NEXT: vstrh.32 q0, [r1], #8 ; CHECK-NEXT: le lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI6_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -356,28 +307,21 @@ define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: adr r2, .LCPI7_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #16 -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 
0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -406,36 +350,29 @@ define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 -; CHECK-NEXT: adr r2, .LCPI8_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #16] -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vstrh.16 q2, [r1], #32 +; CHECK-NEXT: vldrh.u16 q0, [r0, #16] +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: vstrh.16 q1, [r1, #16] +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vstrh.16 q1, [r1], #32 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body @@ -464,28 +401,21 @@ define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: adr r2, .LCPI9_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0], #16 -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vldrh.u16 q0, [r0], #16 +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label 
%vector.body @@ -519,36 +449,29 @@ define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y) ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #128 -; CHECK-NEXT: adr r2, .LCPI10_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: movw r2, #26214 +; CHECK-NEXT: movt r2, #16390 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0] -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #16]! -; CHECK-NEXT: vstrh.16 q2, [r1] -; CHECK-NEXT: vcvtb.f32.f16 q2, q1 -; CHECK-NEXT: vcvtt.f32.f16 q1, q1 -; CHECK-NEXT: vmul.f32 q2, q2, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q0 -; CHECK-NEXT: vcvtb.f16.f32 q2, q2 -; CHECK-NEXT: vcvtt.f16.f32 q2, q1 -; CHECK-NEXT: vstrb.8 q2, [r1, #16]! +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vldrh.u16 q0, [r0, #16]! +; CHECK-NEXT: vstrh.16 q1, [r1] +; CHECK-NEXT: vcvtb.f32.f16 q1, q0 +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmul.f32 q0, q0, r2 +; CHECK-NEXT: vcvtb.f16.f32 q1, q1 +; CHECK-NEXT: vcvtt.f16.f32 q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1, #16]! ; CHECK-NEXT: le lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 -; CHECK-NEXT: .long 0x40066666 @ float 2.0999999 entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index cba123c..e885a7a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -221,161 +221,89 @@ end: ; preds = %middle.block define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocapture %w, i32 %N) { ; CHECK-LABEL: justoffsets: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #216 -; CHECK-NEXT: sub sp, #216 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq.w .LBB3_3 +; CHECK-NEXT: beq .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: adr r5, .LCPI3_2 +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: adr r4, .LCPI3_1 +; CHECK-NEXT: movw r5, #50417 +; CHECK-NEXT: adr r3, .LCPI3_0 +; CHECK-NEXT: movw r7, #32769 +; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: movw r4, #47888 +; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q1, #0x7fff ; CHECK-NEXT: vmov.i32 q0, #0x8000 -; CHECK-NEXT: adr r7, .LCPI3_5 -; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: 
vldrw.u32 q0, [r7] -; CHECK-NEXT: adr r6, .LCPI3_4 -; CHECK-NEXT: adr r5, .LCPI3_3 -; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr.w r8, .LCPI3_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adr.w lr, .LCPI3_1 -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r8] -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [lr] -; CHECK-NEXT: adr.w r12, .LCPI3_0 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r12] -; CHECK-NEXT: adr r7, .LCPI3_7 -; CHECK-NEXT: adr r5, .LCPI3_10 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q0, #0x7fff -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: adr r6, .LCPI3_9 -; CHECK-NEXT: adr r4, .LCPI3_6 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adr r7, .LCPI3_8 -; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: movw r12, #7471 +; CHECK-NEXT: movw r9, #19595 +; CHECK-NEXT: movw r8, #38470 +; CHECK-NEXT: movt r4, #65535 +; CHECK-NEXT: movt r5, #65535 +; CHECK-NEXT: movw r6, #19485 +; CHECK-NEXT: movt r7, #65535 +; CHECK-NEXT: movw r3, #13282 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q4, [r0, q0] -; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q7, [r0, q0] -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 q6, q7, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vldrb.u32 q1, [r0, q5] -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 q3, q4, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrb.u32 q7, [r0, q1] +; CHECK-NEXT: vldrb.u32 q5, [r0, q2] +; CHECK-NEXT: vmul.i32 q4, q5, r8 +; CHECK-NEXT: vmla.u32 q4, q7, r9 +; CHECK-NEXT: vldrb.u32 q6, [r0, q3] +; CHECK-NEXT: vmla.u32 q4, q6, r12 ; CHECK-NEXT: adds r0, #12 -; CHECK-NEXT: vmul.i32 q6, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q6 -; CHECK-NEXT: vadd.i32 q3, q3, q0 -; CHECK-NEXT: vshr.u32 q6, q3, #16 -; CHECK-NEXT: vmul.i32 q3, q7, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 q2, q4, q2 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 q3, q1, q3 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, q0 -; CHECK-NEXT: vmul.i32 q3, q7, q3 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vshr.u32 q2, q2, #16 -; CHECK-NEXT: vmul.i32 q4, q4, q7 -; CHECK-NEXT: vadd.i32 q3, q4, q3 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmul.i32 
q1, q1, q4 -; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: vshr.u32 q4, q4, #16 +; CHECK-NEXT: vstrb.32 q4, [r1, q1] +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmul.i32 q4, q7, q1 +; CHECK-NEXT: vmul.i32 q1, q5, r7 +; CHECK-NEXT: vmla.u32 q1, q7, r3 +; CHECK-NEXT: vmla.u32 q4, q5, r5 +; CHECK-NEXT: vmla.u32 q1, q6, r6 +; CHECK-NEXT: vmla.u32 q4, q6, r4 ; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q0 ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vstrb.32 q1, [r1, q0] -; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vstrb.32 q2, [r1, q0] -; CHECK-NEXT: vstrb.32 q6, [r1, q5] +; CHECK-NEXT: vshr.u32 q4, q4, #16 +; CHECK-NEXT: vstrb.32 q4, [r1, q2] +; CHECK-NEXT: vstrb.32 q1, [r1, q3] ; CHECK-NEXT: adds r1, #12 ; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #216 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 4294952177 @ 0xffffc4f1 -; CHECK-NEXT: .long 4294952177 @ 0xffffc4f1 -; CHECK-NEXT: .long 4294952177 @ 0xffffc4f1 -; CHECK-NEXT: .long 4294952177 @ 0xffffc4f1 -; CHECK-NEXT: .LCPI3_1: -; CHECK-NEXT: .long 19485 @ 0x4c1d -; CHECK-NEXT: .long 19485 @ 0x4c1d -; CHECK-NEXT: .long 19485 @ 0x4c1d -; CHECK-NEXT: .long 19485 @ 0x4c1d -; CHECK-NEXT: .LCPI3_2: ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 8 @ 0x8 ; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI3_3: -; CHECK-NEXT: .long 13282 @ 0x33e2 -; CHECK-NEXT: .long 13282 @ 0x33e2 -; CHECK-NEXT: .long 13282 @ 0x33e2 -; CHECK-NEXT: .long 13282 @ 0x33e2 -; CHECK-NEXT: .LCPI3_4: -; CHECK-NEXT: .long 4294934529 @ 0xffff8001 -; CHECK-NEXT: .long 4294934529 @ 0xffff8001 -; CHECK-NEXT: .long 4294934529 @ 0xffff8001 -; CHECK-NEXT: .long 4294934529 @ 0xffff8001 -; CHECK-NEXT: .LCPI3_5: +; CHECK-NEXT: .LCPI3_1: ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 7 @ 0x7 ; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .LCPI3_6: +; CHECK-NEXT: .LCPI3_2: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .LCPI3_7: -; CHECK-NEXT: .long 4294949648 @ 0xffffbb10 -; CHECK-NEXT: .long 4294949648 @ 0xffffbb10 -; CHECK-NEXT: .long 4294949648 @ 0xffffbb10 -; CHECK-NEXT: .long 4294949648 @ 0xffffbb10 -; CHECK-NEXT: .LCPI3_8: -; CHECK-NEXT: .long 7471 @ 0x1d2f -; CHECK-NEXT: .long 7471 @ 0x1d2f -; CHECK-NEXT: .long 7471 @ 0x1d2f -; CHECK-NEXT: .long 7471 @ 0x1d2f -; CHECK-NEXT: .LCPI3_9: -; CHECK-NEXT: .long 19595 @ 0x4c8b -; CHECK-NEXT: .long 19595 @ 0x4c8b -; CHECK-NEXT: .long 19595 @ 0x4c8b -; CHECK-NEXT: .long 19595 @ 0x4c8b -; CHECK-NEXT: .LCPI3_10: -; CHECK-NEXT: .long 38470 @ 0x9646 -; CHECK-NEXT: .long 38470 @ 0x9646 -; CHECK-NEXT: .long 38470 @ 0x9646 -; CHECK-NEXT: .long 38470 @ 0x9646 entry: %cmp47.not = icmp eq i32 %N, 0 br i1 %cmp47.not, label %for.cond.cleanup, label %vector.ph diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll index 56f4acd..406d2d1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -4,14 +4,15 @@ define void 
@arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture %pResult, i32* nocapture %pIndex) { ; CHECK-LABEL: arm_min_helium_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r6, r7, lr} -; CHECK-NEXT: push {r4, r6, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: vidup.u32 q2, r6, #1 -; CHECK-NEXT: adr r4, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vidup.u32 q2, r4, #1 +; CHECK-NEXT: movw r4, #54437 +; CHECK-NEXT: movt r4, #21352 +; CHECK-NEXT: vdup.32 q1, r4 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: dlstp.32 lr, r1 @@ -24,7 +25,7 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end -; CHECK-NEXT: vldr s8, .LCPI0_1 +; CHECK-NEXT: vldr s8, .LCPI0_0 ; CHECK-NEXT: vdup.32 q3, r1 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vminnmv.f32 r0, q1 @@ -35,16 +36,11 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: str r1, [r3] ; CHECK-NEXT: vstr s8, [r2] ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r6, r7, pc} -; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 entry: %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 1) %1 = extractvalue { <4 x i32>, i32 } %0, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll index 19d5193..dbd03ff 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -470,29 +470,11 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shl_qiv_int8_t(<16 x i8> %src1) { ; CHECK-LABEL: shl_qiv_int8_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adr r0, .LCPI36_0 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: movw r0, #513 +; CHECK-NEXT: movt r0, #1027 +; CHECK-NEXT: vdup.32 q1, r0 ; CHECK-NEXT: vshl.u8 q0, q0, q1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI36_0: -; CHECK-NEXT: .byte 1 @ 0x1 -; CHECK-NEXT: .byte 2 @ 0x2 -; CHECK-NEXT: .byte 3 @ 0x3 -; CHECK-NEXT: .byte 4 @ 0x4 -; CHECK-NEXT: .byte 1 @ 0x1 -; CHECK-NEXT: .byte 2 @ 0x2 -; CHECK-NEXT: .byte 3 @ 0x3 -; CHECK-NEXT: .byte 4 @ 0x4 -; CHECK-NEXT: .byte 1 @ 0x1 -; CHECK-NEXT: .byte 2 @ 0x2 -; CHECK-NEXT: .byte 3 @ 0x3 -; CHECK-NEXT: .byte 4 @ 0x4 -; CHECK-NEXT: .byte 1 @ 0x1 -; CHECK-NEXT: .byte 2 @ 0x2 -; CHECK-NEXT: .byte 3 @ 0x3 -; CHECK-NEXT: .byte 4 @ 0x4 entry: %0 = shl <16 x i8> %src1, ret <16 x i8> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll index 4226dab..ce4756b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovimm.ll @@ -21,33 +21,23 @@ entry: ret <16 x i8> } +; This has 0x01020304 or 0x04030201 vdup.32'd to q reg depending on endianness. +; The big endian is different as there is an implicit vrev64.8 out of the +; function, which gets constant folded away. 
define arm_aapcs_vfpcc <16 x i8> @mov_int8_1234() { ; CHECKLE-LABEL: mov_int8_1234: ; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI2_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] +; CHECKLE-NEXT: movw r0, #513 +; CHECKLE-NEXT: movt r0, #1027 +; CHECKLE-NEXT: vdup.32 q0, r0 ; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI2_0: -; CHECKLE-NEXT: .long 67305985 @ double 2.4380727978175888E-289 -; CHECKLE-NEXT: .long 67305985 -; CHECKLE-NEXT: .long 67305985 @ double 2.4380727978175888E-289 -; CHECKLE-NEXT: .long 67305985 ; ; CHECKBE-LABEL: mov_int8_1234: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI2_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 +; CHECKBE-NEXT: movw r0, #772 +; CHECKBE-NEXT: movt r0, #258 +; CHECKBE-NEXT: vdup.32 q0, r0 ; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI2_0: -; CHECKBE-NEXT: .long 16909060 @ double 8.2078802900595913E-304 -; CHECKBE-NEXT: .long 16909060 -; CHECKBE-NEXT: .long 16909060 @ double 8.2078802900595913E-304 -; CHECKBE-NEXT: .long 16909060 entry: ret <16 x i8> } @@ -89,32 +79,11 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @mov_int16_258() { -; CHECKLE-LABEL: mov_int16_258: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI7_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI7_0: -; CHECKLE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKLE-NEXT: .long 16908546 -; CHECKLE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKLE-NEXT: .long 16908546 -; -; CHECKBE-LABEL: mov_int16_258: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI7_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI7_0: -; CHECKBE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKBE-NEXT: .long 16908546 -; CHECKBE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKBE-NEXT: .long 16908546 +; CHECK-LABEL: mov_int16_258: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #258 +; CHECK-NEXT: vdup.16 q0, r0 +; CHECK-NEXT: bx lr entry: ret <8 x i16> } @@ -156,32 +125,12 @@ entry: } define arm_aapcs_vfpcc <4 x i32> @mov_int32_16777217() { -; CHECKLE-LABEL: mov_int32_16777217: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI12_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI12_0: -; CHECKLE-NEXT: .long 16777217 @ double 7.2911290000737531E-304 -; CHECKLE-NEXT: .long 16777217 -; CHECKLE-NEXT: .long 16777217 @ double 7.2911290000737531E-304 -; CHECKLE-NEXT: .long 16777217 -; -; CHECKBE-LABEL: mov_int32_16777217: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI12_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI12_0: -; CHECKBE-NEXT: .long 16777217 @ double 7.2911290000737531E-304 -; CHECKBE-NEXT: .long 16777217 -; CHECKBE-NEXT: .long 16777217 @ double 7.2911290000737531E-304 -; CHECKBE-NEXT: .long 16777217 +; CHECK-LABEL: mov_int32_16777217: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movt r0, #256 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: bx lr entry: ret <4 x i32> } @@ -223,32 +172,12 @@ entry: } define arm_aapcs_vfpcc <4 x i32> 
@mov_int32_4278190335() { -; CHECKLE-LABEL: mov_int32_4278190335: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI17_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI17_0: -; CHECKLE-NEXT: .long 4278190335 @ double -5.4874634341155774E+303 -; CHECKLE-NEXT: .long 4278190335 -; CHECKLE-NEXT: .long 4278190335 @ double -5.4874634341155774E+303 -; CHECKLE-NEXT: .long 4278190335 -; -; CHECKBE-LABEL: mov_int32_4278190335: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI17_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI17_0: -; CHECKBE-NEXT: .long 4278190335 @ double -5.4874634341155774E+303 -; CHECKBE-NEXT: .long 4278190335 -; CHECKBE-NEXT: .long 4278190335 @ double -5.4874634341155774E+303 -; CHECKBE-NEXT: .long 4278190335 +; CHECK-LABEL: mov_int32_4278190335: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #255 +; CHECK-NEXT: movt r0, #65280 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: bx lr entry: ret <4 x i32> } @@ -263,32 +192,11 @@ entry: } define arm_aapcs_vfpcc <4 x i32> @mov_int32_16908546() { -; CHECKLE-LABEL: mov_int32_16908546: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI19_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI19_0: -; CHECKLE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKLE-NEXT: .long 16908546 -; CHECKLE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKLE-NEXT: .long 16908546 -; -; CHECKBE-LABEL: mov_int32_16908546: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI19_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI19_0: -; CHECKBE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKBE-NEXT: .long 16908546 -; CHECKBE-NEXT: .long 16908546 @ double 8.204306265173532E-304 -; CHECKBE-NEXT: .long 16908546 +; CHECK-LABEL: mov_int32_16908546: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #258 +; CHECK-NEXT: vdup.16 q0, r0 +; CHECK-NEXT: bx lr entry: ret <4 x i32> } @@ -425,63 +333,22 @@ entry: } define arm_aapcs_vfpcc <4 x float> @mov_float_1() { -; CHECKLE-LABEL: mov_float_1: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI28_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI28_0: -; CHECKLE-NEXT: .long 1065353216 @ double 0.007812501848093234 -; CHECKLE-NEXT: .long 1065353216 -; CHECKLE-NEXT: .long 1065353216 @ double 0.007812501848093234 -; CHECKLE-NEXT: .long 1065353216 -; -; CHECKBE-LABEL: mov_float_1: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI28_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI28_0: -; CHECKBE-NEXT: .long 1065353216 @ double 0.007812501848093234 -; CHECKBE-NEXT: .long 1065353216 -; CHECKBE-NEXT: .long 1065353216 @ double 0.007812501848093234 -; CHECKBE-NEXT: .long 1065353216 +; CHECK-LABEL: mov_float_1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov.w r0, #1065353216 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: bx lr entry: ret <4 x float> } define arm_aapcs_vfpcc <4 x float> @mov_float_m3() { -; CHECKLE-LABEL: 
mov_float_m3: -; CHECKLE: @ %bb.0: @ %entry -; CHECKLE-NEXT: adr r0, .LCPI29_0 -; CHECKLE-NEXT: vldrw.u32 q0, [r0] -; CHECKLE-NEXT: bx lr -; CHECKLE-NEXT: .p2align 4 -; CHECKLE-NEXT: @ %bb.1: -; CHECKLE-NEXT: .LCPI29_0: -; CHECKLE-NEXT: .long 3225419776 @ double -32.000022917985916 -; CHECKLE-NEXT: .long 3225419776 -; CHECKLE-NEXT: .long 3225419776 @ double -32.000022917985916 -; CHECKLE-NEXT: .long 3225419776 -; -; CHECKBE-LABEL: mov_float_m3: -; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: adr r0, .LCPI29_0 -; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: bx lr -; CHECKBE-NEXT: .p2align 4 -; CHECKBE-NEXT: @ %bb.1: -; CHECKBE-NEXT: .LCPI29_0: -; CHECKBE-NEXT: .long 3225419776 @ double -32.000022917985916 -; CHECKBE-NEXT: .long 3225419776 -; CHECKBE-NEXT: .long 3225419776 @ double -32.000022917985916 -; CHECKBE-NEXT: .long 3225419776 +; CHECK-LABEL: mov_float_m3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movt r0, #49216 +; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: bx lr entry: ret <4 x float> } diff --git a/llvm/test/CodeGen/Thumb2/mve-vmvnimm.ll b/llvm/test/CodeGen/Thumb2/mve-vmvnimm.ll index a54d005..8988643 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmvnimm.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmvnimm.ll @@ -68,16 +68,10 @@ entry: define arm_aapcs_vfpcc <4 x i32> @mov_int32_4278386688() { ; CHECK-LABEL: mov_int32_4278386688: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: adr r0, .LCPI7_0 -; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movt r0, #65283 +; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 4278386688 @ double -6.5147775434702224E+303 -; CHECK-NEXT: .long 4278386688 -; CHECK-NEXT: .long 4278386688 @ double -6.5147775434702224E+303 -; CHECK-NEXT: .long 4278386688 entry: ret <4 x i32> }
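
For reference, a minimal standalone sketch of the new lowering, mirroring the mov_int8_1234 test from mve-vmovimm.ll above. The function name and the llc invocation in the comment are assumptions (the test RUN lines are not shown in this diff); the expected instruction sequence is taken from the CHECKLE lines of the updated test.

; Illustrative only -- not part of the patch.
; Assumed driver (RUN lines are not included in this diff):
;   llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -o -
define arm_aapcs_vfpcc <16 x i8> @splat_i8_1234() {
entry:
  ; On a little-endian target the bytes 1,2,3,4 pack into the 32-bit splat
  ; value 0x04030201, which has no VMOVimm/VMOVFPimm/VMVNimm encoding.
  ; Before this patch it was materialised with adr + vldrw.u32 from a
  ; literal pool; with this patch it lowers to:
  ;   movw    r0, #513        @ 0x0201
  ;   movt    r0, #1027       @ 0x0403
  ;   vdup.32 q0, r0
  ret <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4,
                 i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>
}

As the commit message notes, the splat now costs two scalar moves plus a vdup.32 instead of a 16-byte literal-pool entry plus an adr/vldrw.u32 pair, which is smaller and leaves the VDUP available for folding into later instructions.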