From: David Green Date: Sun, 8 Dec 2019 09:58:03 +0000 (+0000) Subject: [ARM] Disable VLD4 under MVE X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3a6eb5f16054e8c0f41a37542a5fc806016502a0;p=platform%2Fupstream%2Fllvm.git [ARM] Disable VLD4 under MVE Alas, using half the available vector registers in a single instruction is just too much for the register allocator to handle. The mve-vldst4.ll test here fails when these instructions are enabled at present. This patch disables the generation of VLD4 and VST4 by adding a mve-max-interleave-factor option, which we currently default to 2. Differential Revision: https://reviews.llvm.org/D71109 --- diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3dcddd7..6f17d18 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -142,6 +142,11 @@ static cl::opt ConstpoolPromotionMaxTotal( cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); +static cl::opt +MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, + cl::desc("Maximum interleave factor for MVE VLDn to generate."), + cl::init(2)); + // The APCS parameter registers. static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 @@ -16786,7 +16791,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { if (Subtarget->hasNEON()) return 4; if (Subtarget->hasMVEIntegerOps()) - return 4; + return MVEMaxSupportedInterleaveFactor; return TargetLoweringBase::getMaxSupportedInterleaveFactor(); } diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index 537d2da..4bffc10 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s ; i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll new file mode 100644 index 0000000..db0dc36 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -0,0 +1,262 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { +; CHECK-LABEL: vldst4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: cmp.w r3, r2, lsr #2 +; CHECK-NEXT: beq.w .LBB0_3 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: mvn r3, #7 +; CHECK-NEXT: and.w r2, r3, r2, lsr #2 +; CHECK-NEXT: vldr.16 s0, [sp, #112] +; CHECK-NEXT: subs r2, #8 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w r12, r0, #64 +; CHECK-NEXT: add.w lr, r3, r2, lsr #3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: subs r1, #64 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q0, [r12, #64]! +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrh.u16 q7, [r12, #16] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vldrh.u16 q6, [r12, #32] +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vldrh.u16 q5, [r12, #48] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmul.f16 q1, q1, q2 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s12, s28 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmovx.f16 s12, s30 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s24 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s26 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmovx.f16 s12, s22 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmul.f16 q2, q2, q4 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmul.f16 q3, q3, q1 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s31 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s27 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s21 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s23 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vmul.f16 q5, q0, q1 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmovx.f16 s0, s24 +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s4, s26 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vstrb.8 q4, [r1, #64]! +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s14 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s22 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vstrh.16 q0, [r1, #32] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s11 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s15 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s23 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vstrh.16 q0, [r1, #48] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s13 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s21 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrh.16 q0, [r1, #16] +; CHECK-NEXT: le lr, .LBB0_2 +; CHECK-NEXT: .LBB0_3: @ %while.end +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} +entry: + %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 + %l0 = bitcast i16 %tmp.0.extract.trunc to half + %mul = mul i32 %numCols, %numRows + %shr = lshr i32 %mul, 2 + %cmp38 = icmp eq i32 %shr, 0 + br i1 %cmp38, label %while.end, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %n.vec = and i32 %shr, 1073741816 + %l2 = shl nuw i32 %n.vec, 2 + %ind.end = getelementptr half, half* %pIn, i32 %l2 + %l3 = shl nuw i32 %n.vec, 2 + %ind.end48 = getelementptr half, half* %pOut, i32 %l3 + %ind.end50 = sub nsw i32 %shr, %n.vec + %broadcast.splatinsert55 = insertelement <8 x half> undef, half %l0, i32 0 + %broadcast.splat56 = shufflevector <8 x half> %broadcast.splatinsert55, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %l4 = shl i32 %index, 2 + %next.gep = getelementptr half, half* %pIn, i32 %l4 + %l5 = shl i32 %index, 2 + %l6 = bitcast half* %next.gep to <32 x half>* + %wide.vec = load <32 x half>, <32 x half>* %l6, align 2 + %strided.vec = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> + %strided.vec52 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> + %strided.vec53 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> + %strided.vec54 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> + %l7 = fmul <8 x half> %strided.vec, %broadcast.splat56 + %l8 = fmul <8 x half> %strided.vec52, %broadcast.splat56 + %l9 = fmul <8 x half> %strided.vec53, %broadcast.splat56 + %l10 = fmul <8 x half> %strided.vec54, %broadcast.splat56 + %l11 = getelementptr inbounds half, half* %pOut, i32 %l5 + %l12 = bitcast half* %l11 to <32 x half>* + %l13 = shufflevector <8 x half> %l7, <8 x half> %l8, <16 x i32> + %l14 = shufflevector <8 x half> %l9, <8 x half> %l10, <16 x i32> + %interleaved.vec = shufflevector <16 x half> %l13, <16 x half> %l14, <32 x i32> + store <32 x half> %interleaved.vec, <32 x half>* %l12, align 2 + %index.next = add i32 %index, 8 + %l15 = icmp eq i32 %index.next, %n.vec + br i1 %l15, label %while.end, label %vector.body + +while.end: ; preds = %while.body, %middle.block, %entry + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 5f836da..4ac204a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s ; i32 diff --git a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll index c60bb71..2491b72 100644 --- a/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll @@ -73,12 +73,11 @@ define void @load_factor4(<16 x i32>* %ptr) { ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor4( -; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) -; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 -; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 -; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 -; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor4( @@ -167,15 +166,8 @@ define void @store_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x ; CHECK-MVE-LABEL: @store_factor4( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3) +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> +; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor4( @@ -448,12 +440,11 @@ define void @load_undef_mask_factor4(<16 x i32>* %ptr) { ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_undef_mask_factor4( -; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) -; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 -; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 -; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 -; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> +; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <16 x i32> [[INTERLEAVED_VEC]], <16 x i32> undef, <4 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_undef_mask_factor4( @@ -545,15 +536,8 @@ define void @store_undef_mask_factor4(<16 x i32>* %ptr, <4 x i32> %v0, <4 x i32> ; CHECK-MVE-LABEL: @store_undef_mask_factor4( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; CHECK-MVE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP5:%.*]] = bitcast <16 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 0) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 1) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 2) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP5]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 3) +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <16 x i32> +; CHECK-MVE-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[PTR:%.*]], align 4 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_undef_mask_factor4( @@ -1205,22 +1189,11 @@ define void @load_factor4_wide(<32 x i32>* %ptr) { ; CHECK-NEON-NEXT: ret void ; ; CHECK-MVE-LABEL: @load_factor4_wide( -; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP1]]) -; CHECK-MVE-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3 -; CHECK-MVE-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2 -; CHECK-MVE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1 -; CHECK-MVE-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0 -; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 -; CHECK-MVE-NEXT: [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.mve.vld4q.v4i32.p0i32(i32* [[TMP6]]) -; CHECK-MVE-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3 -; CHECK-MVE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2 -; CHECK-MVE-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1 -; CHECK-MVE-NEXT: [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0 -; CHECK-MVE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP7]], <8 x i32> -; CHECK-MVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> -; CHECK-MVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> -; CHECK-MVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, <32 x i32>* [[PTR:%.*]], align 4 +; CHECK-MVE-NEXT: [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[V2:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> +; CHECK-MVE-NEXT: [[V3:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> undef, <8 x i32> ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @load_factor4_wide( @@ -1338,24 +1311,8 @@ define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, ; CHECK-MVE-LABEL: @store_factor4_wide( ; CHECK-MVE-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> ; CHECK-MVE-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> -; CHECK-MVE-NEXT: [[TMP1:%.*]] = bitcast <32 x i32>* [[PTR:%.*]] to i32* -; CHECK-MVE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 0) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 1) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 2) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) -; CHECK-MVE-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16 -; CHECK-MVE-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <4 x i32> -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 0) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 1) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 2) -; CHECK-MVE-NEXT: call void @llvm.arm.mve.vst4q.p0i32.v4i32(i32* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3) +; CHECK-MVE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <32 x i32> +; CHECK-MVE-NEXT: store <32 x i32> [[INTERLEAVED_VEC]], <32 x i32>* [[PTR:%.*]], align 4 ; CHECK-MVE-NEXT: ret void ; ; CHECK-NONE-LABEL: @store_factor4_wide( diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index d94735f..4cd63b4 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -700,14 +700,14 @@ entry: ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1 ; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1 ; VF_16-LABEL: Checking a loop in "i8_factor_4" -; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0 @@ -754,23 +754,23 @@ entry: ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2 ; VF_8-LABEL: Checking a loop in "i16_factor_4" -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "i16_factor_4" -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0 @@ -808,32 +808,32 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "i32_factor_4" -; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "i32_factor_4" -; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4 ; VF_16-LABEL: Checking a loop in "i32_factor_4" -; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0 @@ -943,23 +943,23 @@ entry: ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_8-LABEL: Checking a loop in "f16_factor_4" -; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "f16_factor_4" -; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0 @@ -997,32 +997,32 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "f32_factor_4" -; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "f32_factor_4" -; VF_8: Found an estimated cost of 16 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 16 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_16-LABEL: Checking a loop in "f32_factor_4" -; VF_16: Found an estimated cost of 32 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 32 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll new file mode 100644 index 0000000..cb6e100 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll @@ -0,0 +1,87 @@ +; RUN: opt -loop-vectorize < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=1 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=2 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-NO4 +; RUN: opt -loop-vectorize -mve-max-interleave-factor=4 < %s -S -o - | FileCheck %s --check-prefixes=CHECK,CHECK-2,CHECK-4 + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-none-eabi" + +; CHECK-LABEL: vld2 +; CHECK-2: vector.body +; CHECK-NO2-NOT: vector.body +define void @vld2(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { +entry: + %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %mul = mul i32 %numCols, %numRows + %shr = lshr i32 %mul, 2 + %cmp26 = icmp eq i32 %shr, 0 + br i1 %cmp26, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %pIn.addr.029 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ] + %pOut.addr.028 = phi half* [ %add.ptr7, %while.body ], [ %pOut, %entry ] + %blkCnt.027 = phi i32 [ %dec, %while.body ], [ %shr, %entry ] + %1 = load half, half* %pIn.addr.029, align 2 + %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.029, i32 1 + %2 = load half, half* %arrayidx2, align 2 + %mul3 = fmul half %1, %0 + %mul4 = fmul half %2, %0 + store half %mul3, half* %pOut.addr.028, align 2 + %arrayidx6 = getelementptr inbounds half, half* %pOut.addr.028, i32 1 + store half %mul4, half* %arrayidx6, align 2 + %add.ptr = getelementptr inbounds half, half* %pIn.addr.029, i32 2 + %add.ptr7 = getelementptr inbounds half, half* %pOut.addr.028, i32 2 + %dec = add nsw i32 %blkCnt.027, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +; CHECK-LABEL: vld4 +; CHECK-4: vector.body +; CHECK-NO4-NOT: vector.body +define void @vld4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 { +entry: + %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 + %0 = bitcast i16 %tmp.0.extract.trunc to half + %mul = mul i32 %numCols, %numRows + %shr = lshr i32 %mul, 2 + %cmp38 = icmp eq i32 %shr, 0 + br i1 %cmp38, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %pIn.addr.041 = phi half* [ %add.ptr, %while.body ], [ %pIn, %entry ] + %pOut.addr.040 = phi half* [ %add.ptr13, %while.body ], [ %pOut, %entry ] + %blkCnt.039 = phi i32 [ %dec, %while.body ], [ %shr, %entry ] + %1 = load half, half* %pIn.addr.041, align 2 + %arrayidx2 = getelementptr inbounds half, half* %pIn.addr.041, i32 1 + %2 = load half, half* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds half, half* %pIn.addr.041, i32 2 + %3 = load half, half* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds half, half* %pIn.addr.041, i32 3 + %4 = load half, half* %arrayidx4, align 2 + %mul5 = fmul half %1, %0 + %mul6 = fmul half %2, %0 + %mul7 = fmul half %3, %0 + %mul8 = fmul half %4, %0 + store half %mul5, half* %pOut.addr.040, align 2 + %arrayidx10 = getelementptr inbounds half, half* %pOut.addr.040, i32 1 + store half %mul6, half* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds half, half* %pOut.addr.040, i32 2 + store half %mul7, half* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds half, half* %pOut.addr.040, i32 3 + store half %mul8, half* %arrayidx12, align 2 + %add.ptr = getelementptr inbounds half, half* %pIn.addr.041, i32 4 + %add.ptr13 = getelementptr inbounds half, half* %pOut.addr.040, i32 4 + %dec = add nsw i32 %blkCnt.039, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + ret void +} + +attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" }