From a71ad6a3c80d2a8526976c03d11bcb97f736ba52 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 11 Jun 2022 15:29:18 +0100 Subject: [PATCH] [DAG] visitINSERT_VECTOR_ELT - fold insert_vector_elt(scalar_to_vector(x),v,i) -> build_vector() Allow scalar_to_vector nodes to be used for the start of a build_vector creation --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++ .../CodeGen/PowerPC/aix_scalar_vector_permuted.ll | 33 ++++++-------- .../CodeGen/PowerPC/canonical-merge-shuffles.ll | 48 +++++++++----------- llvm/test/CodeGen/PowerPC/load-and-splat.ll | 49 ++++++++++---------- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll | 20 ++++----- llvm/test/CodeGen/PowerPC/reduce_scalarization.ll | 46 +++++++++---------- llvm/test/CodeGen/Thumb2/mve-vld3.ll | 3 +- llvm/test/CodeGen/Thumb2/mve-vst3.ll | 47 +++++++++---------- llvm/test/CodeGen/Thumb2/mve-vst4.ll | 52 ++++++++++------------ 9 files changed, 142 insertions(+), 162 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 35c3903..6ba9a07 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19406,6 +19406,12 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return UpdateBuildVector(Ops); } + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.hasOneUse()) { + Ops.push_back(InVec.getOperand(0)); + Ops.append(NumElts - 1, DAG.getUNDEF(InVec.getOperand(0).getValueType())); + return UpdateBuildVector(Ops); + } + if (InVec.isUndef()) { Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType())); return UpdateBuildVector(Ops); diff --git a/llvm/test/CodeGen/PowerPC/aix_scalar_vector_permuted.ll b/llvm/test/CodeGen/PowerPC/aix_scalar_vector_permuted.ll index b0716a5..9b0b1e2 100644 --- a/llvm/test/CodeGen/PowerPC/aix_scalar_vector_permuted.ll +++ b/llvm/test/CodeGen/PowerPC/aix_scalar_vector_permuted.ll @@ -30,15 +30,13 @@ define void @test_f2(%f2* %P, %f2* %Q, %f2* %S) { ; ; AIX-P8-32-LABEL: test_f2: ; AIX-P8-32: # %bb.0: -; AIX-P8-32-NEXT: lwz r6, L..C0(r2) # %const.0 -; AIX-P8-32-NEXT: li r7, 4 +; AIX-P8-32-NEXT: li r6, 4 ; AIX-P8-32-NEXT: lxsiwzx v3, 0, r3 -; AIX-P8-32-NEXT: lxsiwzx v0, 0, r4 -; AIX-P8-32-NEXT: lxsiwzx v2, r3, r7 -; AIX-P8-32-NEXT: lxsiwzx v5, r4, r7 -; AIX-P8-32-NEXT: lxvw4x v4, 0, r6 -; AIX-P8-32-NEXT: vperm v2, v3, v2, v4 -; AIX-P8-32-NEXT: vperm v3, v0, v5, v4 +; AIX-P8-32-NEXT: lxsiwzx v5, 0, r4 +; AIX-P8-32-NEXT: lxsiwzx v2, r3, r6 +; AIX-P8-32-NEXT: lxsiwzx v4, r4, r6 +; AIX-P8-32-NEXT: vmrgow v2, v3, v2 +; AIX-P8-32-NEXT: vmrgow v3, v5, v4 ; AIX-P8-32-NEXT: xvaddsp vs0, v2, v3 ; AIX-P8-32-NEXT: xxsldwi vs1, vs0, vs0, 1 ; AIX-P8-32-NEXT: xscvspdpn f0, vs0 @@ -57,17 +55,14 @@ define void @test_f2(%f2* %P, %f2* %Q, %f2* %S) { ; ; AIX-P9-32-LABEL: test_f2: ; AIX-P9-32: # %bb.0: -; AIX-P9-32-NEXT: lfiwzx f0, 0, r3 -; AIX-P9-32-NEXT: lwz r3, 4(r3) -; AIX-P9-32-NEXT: xxsldwi vs0, f0, f0, 1 -; AIX-P9-32-NEXT: mtfprwz f1, r3 -; AIX-P9-32-NEXT: lwz r3, 4(r4) -; AIX-P9-32-NEXT: xxinsertw vs0, vs1, 4 -; AIX-P9-32-NEXT: lfiwzx f1, 0, r4 -; AIX-P9-32-NEXT: mtfprwz f2, r3 -; AIX-P9-32-NEXT: xxsldwi vs1, f1, f1, 1 -; AIX-P9-32-NEXT: xxinsertw vs1, vs2, 4 -; AIX-P9-32-NEXT: xvaddsp vs0, vs0, vs1 +; AIX-P9-32-NEXT: li r6, 4 +; AIX-P9-32-NEXT: lxsiwzx v3, 0, r3 +; AIX-P9-32-NEXT: lxsiwzx v4, 0, r4 +; AIX-P9-32-NEXT: lxsiwzx v2, r3, r6 +; AIX-P9-32-NEXT: vmrgow v2, v3, v2 +; AIX-P9-32-NEXT: lxsiwzx v3, r4, r6 +; AIX-P9-32-NEXT: vmrgow v3, v4, v3 +; AIX-P9-32-NEXT: xvaddsp vs0, v2, v3 ; AIX-P9-32-NEXT: xscvspdpn f1, vs0 ; AIX-P9-32-NEXT: xxsldwi vs0, vs0, vs0, 1 ; AIX-P9-32-NEXT: xscvspdpn f0, vs0 diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 43400d4..633befe 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -529,18 +529,16 @@ define dso_local <8 x i16> @testmrglb3(<8 x i8>* nocapture readonly %a) local_un ; ; P8-AIX-32-LABEL: testmrglb3: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r5, 4(r3) -; P8-AIX-32-NEXT: lwz r4, L..C0(r2) # %const.0 -; P8-AIX-32-NEXT: stw r5, -32(r1) -; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX-32-NEXT: addi r4, r1, -16 -; P8-AIX-32-NEXT: stw r3, -16(r1) -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX-32-NEXT: vperm v2, v4, v3, v2 +; P8-AIX-32-NEXT: lwz r4, 4(r3) ; P8-AIX-32-NEXT: xxlxor v3, v3, v3 +; P8-AIX-32-NEXT: stw r4, -16(r1) +; P8-AIX-32-NEXT: addi r4, r1, -32 +; P8-AIX-32-NEXT: lwz r3, 0(r3) +; P8-AIX-32-NEXT: stw r3, -32(r1) +; P8-AIX-32-NEXT: addi r3, r1, -16 +; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX-32-NEXT: lxvw4x vs1, 0, r4 +; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0 ; P8-AIX-32-NEXT: vmrghb v2, v3, v2 ; P8-AIX-32-NEXT: blr entry: @@ -706,7 +704,7 @@ define dso_local <16 x i8> @no_crash_bitcast(i32 %a) { ; ; P8-AIX-32-LABEL: no_crash_bitcast: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, L..C1(r2) # %const.0 +; P8-AIX-32-NEXT: lwz r4, L..C0(r2) # %const.0 ; P8-AIX-32-NEXT: stw r3, -16(r1) ; P8-AIX-32-NEXT: addi r3, r1, -16 ; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 @@ -780,8 +778,8 @@ define dso_local <4 x i32> @replace_undefs_in_splat(<4 x i32> %a) local_unnamed_ ; ; P8-AIX-32-LABEL: replace_undefs_in_splat: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r3, L..C2(r2) # %const.0 -; P8-AIX-32-NEXT: lwz r4, L..C3(r2) # %const.1 +; P8-AIX-32-NEXT: lwz r3, L..C1(r2) # %const.0 +; P8-AIX-32-NEXT: lwz r4, L..C2(r2) # %const.1 ; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 ; P8-AIX-32-NEXT: lxvw4x v4, 0, r4 ; P8-AIX-32-NEXT: vperm v2, v2, v4, v3 @@ -1025,18 +1023,16 @@ define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_ ; ; P8-AIX-32-LABEL: testSplat8: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r5, 4(r3) -; P8-AIX-32-NEXT: lwz r4, L..C4(r2) # %const.0 -; P8-AIX-32-NEXT: stw r5, -32(r1) +; P8-AIX-32-NEXT: lwz r4, 4(r3) +; P8-AIX-32-NEXT: stw r4, -16(r1) +; P8-AIX-32-NEXT: addi r4, r1, -32 ; P8-AIX-32-NEXT: lwz r3, 0(r3) -; P8-AIX-32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX-32-NEXT: addi r4, r1, -16 -; P8-AIX-32-NEXT: stw r3, -16(r1) -; P8-AIX-32-NEXT: addi r3, r1, -32 -; P8-AIX-32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX-32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX-32-NEXT: vperm v2, v4, v3, v2 -; P8-AIX-32-NEXT: xxmrghd v2, v2, v2 +; P8-AIX-32-NEXT: stw r3, -32(r1) +; P8-AIX-32-NEXT: addi r3, r1, -16 +; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX-32-NEXT: lxvw4x vs1, 0, r4 +; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0 +; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX-32-NEXT: blr entry: %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 @@ -1082,7 +1078,7 @@ define <2 x i64> @testSplati64_0(<1 x i64>* nocapture readonly %ptr) #0 { ; ; P8-AIX-32-LABEL: testSplati64_0: ; P8-AIX-32: # %bb.0: # %entry -; P8-AIX-32-NEXT: lwz r4, L..C5(r2) # %const.0 +; P8-AIX-32-NEXT: lwz r4, L..C3(r2) # %const.0 ; P8-AIX-32-NEXT: lwz r5, 4(r3) ; P8-AIX-32-NEXT: lwz r3, 0(r3) ; P8-AIX-32-NEXT: stw r5, -16(r1) diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index 699f5a8..5eb1810 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -811,45 +811,42 @@ define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) { ; ; P9-AIX32-LABEL: unadjusted_lxvdsx: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r4, 0(r3) +; P9-AIX32-NEXT: lwz r4, 4(r3) ; P9-AIX32-NEXT: stw r4, -16(r1) -; P9-AIX32-NEXT: lwz r3, 4(r3) -; P9-AIX32-NEXT: lxv vs1, -16(r1) -; P9-AIX32-NEXT: mtfprwz f0, r3 -; P9-AIX32-NEXT: xxinsertw vs1, vs0, 4 -; P9-AIX32-NEXT: xxmrghd v2, vs1, vs1 +; P9-AIX32-NEXT: lwz r3, 0(r3) +; P9-AIX32-NEXT: lxv vs0, -16(r1) +; P9-AIX32-NEXT: stw r3, -32(r1) +; P9-AIX32-NEXT: lxv vs1, -32(r1) +; P9-AIX32-NEXT: xxmrghw vs0, vs1, vs0 +; P9-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: unadjusted_lxvdsx: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r5, 4(r3) -; P8-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0 -; P8-AIX32-NEXT: stw r5, -32(r1) +; P8-AIX32-NEXT: lwz r4, 4(r3) +; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: addi r4, r1, -32 ; P8-AIX32-NEXT: lwz r3, 0(r3) -; P8-AIX32-NEXT: lxvw4x v2, 0, r4 -; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: stw r3, -16(r1) -; P8-AIX32-NEXT: addi r3, r1, -32 -; P8-AIX32-NEXT: lxvw4x v3, 0, r3 -; P8-AIX32-NEXT: lxvw4x v4, 0, r4 -; P8-AIX32-NEXT: vperm v2, v4, v3, v2 -; P8-AIX32-NEXT: xxmrghd v2, v2, v2 +; P8-AIX32-NEXT: stw r3, -32(r1) +; P8-AIX32-NEXT: addi r3, r1, -16 +; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: lxvw4x vs1, 0, r4 +; P8-AIX32-NEXT: xxmrghw vs0, vs1, vs0 +; P8-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: unadjusted_lxvdsx: ; P7-AIX32: # %bb.0: # %entry ; P7-AIX32-NEXT: lwz r5, 4(r3) -; P7-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0 -; P7-AIX32-NEXT: stw r5, -32(r1) -; P7-AIX32-NEXT: lwz r3, 0(r3) -; P7-AIX32-NEXT: lxvw4x v2, 0, r4 ; P7-AIX32-NEXT: addi r4, r1, -16 -; P7-AIX32-NEXT: stw r3, -16(r1) +; P7-AIX32-NEXT: stw r5, -16(r1) +; P7-AIX32-NEXT: lwz r3, 0(r3) +; P7-AIX32-NEXT: stw r3, -32(r1) ; P7-AIX32-NEXT: addi r3, r1, -32 -; P7-AIX32-NEXT: lxvw4x v3, 0, r3 -; P7-AIX32-NEXT: lxvw4x v4, 0, r4 -; P7-AIX32-NEXT: vperm v2, v4, v3, v2 -; P7-AIX32-NEXT: xxmrghd v2, v2, v2 +; P7-AIX32-NEXT: lxvw4x vs0, 0, r4 +; P7-AIX32-NEXT: lxvw4x vs1, 0, r3 +; P7-AIX32-NEXT: xxmrghw vs0, vs1, vs0 +; P7-AIX32-NEXT: xxmrghd v2, vs0, vs0 ; P7-AIX32-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index d0db193..ad7891c 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -86,22 +86,22 @@ define void @test64(i8* nocapture readonly %pix2, i32 signext %i_pix2) { ; P9BE-AIX32-LABEL: test64: ; P9BE-AIX32: # %bb.0: # %entry ; P9BE-AIX32-NEXT: lwzux 4, 3, 4 -; P9BE-AIX32-NEXT: lwz 5, L..C0(2) # %const.0 ; P9BE-AIX32-NEXT: xxlxor 4, 4, 4 -; P9BE-AIX32-NEXT: lxv 3, 0(5) -; P9BE-AIX32-NEXT: stw 4, -32(1) +; P9BE-AIX32-NEXT: stw 4, -48(1) ; P9BE-AIX32-NEXT: lwz 4, 4(3) -; P9BE-AIX32-NEXT: lxv 2, -32(1) -; P9BE-AIX32-NEXT: stw 4, -16(1) -; P9BE-AIX32-NEXT: mtfprwz 0, 4 +; P9BE-AIX32-NEXT: lxv 0, -48(1) +; P9BE-AIX32-NEXT: stw 4, -32(1) +; P9BE-AIX32-NEXT: lwz 4, L..C0(2) # %const.0 ; P9BE-AIX32-NEXT: lwz 3, 8(3) -; P9BE-AIX32-NEXT: xxinsertw 2, 0, 4 -; P9BE-AIX32-NEXT: mtfprwz 0, 3 +; P9BE-AIX32-NEXT: lxv 1, -32(1) +; P9BE-AIX32-NEXT: lxv 3, 0(4) +; P9BE-AIX32-NEXT: stw 3, -16(1) ; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1 +; P9BE-AIX32-NEXT: xxmrghw 2, 0, 1 +; P9BE-AIX32-NEXT: lxv 0, -16(1) ; P9BE-AIX32-NEXT: vperm 2, 4, 2, 3 -; P9BE-AIX32-NEXT: lxv 3, -16(1) ; P9BE-AIX32-NEXT: lxv 4, 0(3) -; P9BE-AIX32-NEXT: xxinsertw 3, 0, 4 +; P9BE-AIX32-NEXT: xxmrghw 3, 1, 0 ; P9BE-AIX32-NEXT: vperm 3, 3, 3, 4 ; P9BE-AIX32-NEXT: vspltisw 4, 8 ; P9BE-AIX32-NEXT: vnegw 3, 3 diff --git a/llvm/test/CodeGen/PowerPC/reduce_scalarization.ll b/llvm/test/CodeGen/PowerPC/reduce_scalarization.ll index 5034778..2c5bc80 100644 --- a/llvm/test/CodeGen/PowerPC/reduce_scalarization.ll +++ b/llvm/test/CodeGen/PowerPC/reduce_scalarization.ll @@ -68,15 +68,13 @@ define dso_local <2 x double> @test2(<2 x float>* nocapture readonly %a, <2 x fl ; ; AIX-32-LABEL: test2: ; AIX-32: # %bb.0: # %entry -; AIX-32-NEXT: lwz r5, L..C0(r2) # %const.0 -; AIX-32-NEXT: li r6, 4 +; AIX-32-NEXT: li r5, 4 ; AIX-32-NEXT: lxsiwzx v3, 0, r3 -; AIX-32-NEXT: lxsiwzx v0, 0, r4 -; AIX-32-NEXT: lxsiwzx v2, r3, r6 -; AIX-32-NEXT: lxsiwzx v5, r4, r6 -; AIX-32-NEXT: lxvw4x v4, 0, r5 -; AIX-32-NEXT: vperm v2, v3, v2, v4 -; AIX-32-NEXT: vperm v3, v0, v5, v4 +; AIX-32-NEXT: lxsiwzx v5, 0, r4 +; AIX-32-NEXT: lxsiwzx v2, r3, r5 +; AIX-32-NEXT: lxsiwzx v4, r4, r5 +; AIX-32-NEXT: vmrgow v2, v3, v2 +; AIX-32-NEXT: vmrgow v3, v5, v4 ; AIX-32-NEXT: xvsubsp vs0, v2, v3 ; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1 ; AIX-32-NEXT: xscvspdpn f0, vs0 @@ -114,15 +112,13 @@ define dso_local <2 x double> @test3(<2 x float>* nocapture readonly %a, <2 x fl ; ; AIX-32-LABEL: test3: ; AIX-32: # %bb.0: # %entry -; AIX-32-NEXT: lwz r5, L..C1(r2) # %const.0 -; AIX-32-NEXT: li r6, 4 +; AIX-32-NEXT: li r5, 4 ; AIX-32-NEXT: lxsiwzx v3, 0, r3 -; AIX-32-NEXT: lxsiwzx v0, 0, r4 -; AIX-32-NEXT: lxsiwzx v2, r3, r6 -; AIX-32-NEXT: lxsiwzx v5, r4, r6 -; AIX-32-NEXT: lxvw4x v4, 0, r5 -; AIX-32-NEXT: vperm v2, v3, v2, v4 -; AIX-32-NEXT: vperm v3, v0, v5, v4 +; AIX-32-NEXT: lxsiwzx v5, 0, r4 +; AIX-32-NEXT: lxsiwzx v2, r3, r5 +; AIX-32-NEXT: lxsiwzx v4, r4, r5 +; AIX-32-NEXT: vmrgow v2, v3, v2 +; AIX-32-NEXT: vmrgow v3, v5, v4 ; AIX-32-NEXT: xvaddsp vs0, v2, v3 ; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1 ; AIX-32-NEXT: xscvspdpn f0, vs0 @@ -160,15 +156,13 @@ define dso_local <2 x double> @test4(<2 x float>* nocapture readonly %a, <2 x fl ; ; AIX-32-LABEL: test4: ; AIX-32: # %bb.0: # %entry -; AIX-32-NEXT: lwz r5, L..C2(r2) # %const.0 -; AIX-32-NEXT: li r6, 4 +; AIX-32-NEXT: li r5, 4 ; AIX-32-NEXT: lxsiwzx v3, 0, r3 -; AIX-32-NEXT: lxsiwzx v0, 0, r4 -; AIX-32-NEXT: lxsiwzx v2, r3, r6 -; AIX-32-NEXT: lxsiwzx v5, r4, r6 -; AIX-32-NEXT: lxvw4x v4, 0, r5 -; AIX-32-NEXT: vperm v2, v3, v2, v4 -; AIX-32-NEXT: vperm v3, v0, v5, v4 +; AIX-32-NEXT: lxsiwzx v5, 0, r4 +; AIX-32-NEXT: lxsiwzx v2, r3, r5 +; AIX-32-NEXT: lxsiwzx v4, r4, r5 +; AIX-32-NEXT: vmrgow v2, v3, v2 +; AIX-32-NEXT: vmrgow v3, v5, v4 ; AIX-32-NEXT: xvmulsp vs0, v2, v3 ; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1 ; AIX-32-NEXT: xscvspdpn f0, vs0 @@ -215,7 +209,7 @@ define dso_local <2 x double> @test5(<2 x double> %a) { ; ; AIX-32-LABEL: test5: ; AIX-32: # %bb.0: # %entry -; AIX-32-NEXT: lwz r3, L..C3(r2) # @G +; AIX-32-NEXT: lwz r3, L..C0(r2) # @G ; AIX-32-NEXT: lfs f0, 4(r3) ; AIX-32-NEXT: lfs f1, 0(r3) ; AIX-32-NEXT: xxmrghd vs0, vs1, vs0 @@ -284,7 +278,7 @@ define dso_local i32 @test6() #0 { ; ; AIX-32-LABEL: test6: ; AIX-32: # %bb.0: # %bb -; AIX-32-NEXT: lwz r3, L..C4(r2) # @Glob1 +; AIX-32-NEXT: lwz r3, L..C1(r2) # @Glob1 ; AIX-32-NEXT: lis r4, 8 ; AIX-32-NEXT: ori r4, r4, 38248 ; AIX-32-NEXT: lfsux f0, r3, r4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 6d14b70..888053d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -1025,9 +1025,8 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 ; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vins.f16 s8, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 40efd04..4b28c2b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -1235,39 +1235,36 @@ entry: define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-LABEL: vst3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: ldrd r3, lr, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q0[1], r12 -; CHECK-NEXT: vmov.32 q1[1], lr -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vins.f16 s8, s5 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: ldrd lr, r12, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldrd r4, r0, [r0, #16] +; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s13, s3 +; CHECK-NEXT: vmovx.f16 s9, s3 ; CHECK-NEXT: vmovx.f16 s6, s0 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s8, s4 ; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vins.f16 s13, s10 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s3, s8 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, r2, d6 +; CHECK-NEXT: vins.f16 s9, s8 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmov r0, r2, d4 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: strd r0, r2, [r1, #16] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index 8c374e7..da969e0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1087,45 +1087,41 @@ entry: define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: add.w lr, r0, #16 -; CHECK-NEXT: ldr r2, [r0, #28] -; CHECK-NEXT: ldm.w lr, {r3, r12, lr} -; CHECK-NEXT: vmov.32 q1[0], lr -; CHECK-NEXT: vmov.32 q1[1], r2 -; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov.32 q0[1], r12 -; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: ldrd r3, r0, [r0, #8] +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: add.w r6, r0, #16 +; CHECK-NEXT: ldrd lr, r12, [r0] +; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldm r6, {r4, r5, r6} +; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: ldr r0, [r0, #28] +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r0 -; CHECK-NEXT: vmov.32 q1[1], r12 -; CHECK-NEXT: vins.f16 s12, s2 -; CHECK-NEXT: vmovx.f16 s6, s4 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vmovx.f16 s11, s1 +; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s2, s3 -; CHECK-NEXT: vmovx.f16 s10, s5 ; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s7 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vins.f16 s10, s6 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 -- 2.7.4