From: Jingu Kang Date: Thu, 20 Jul 2023 16:12:25 +0000 (+0100) Subject: Revert "[MachineLICM] Handle Subloops" X-Git-Tag: upstream/17.0.6~941 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=351b4c17ddc0158c44fc2d09438a79ecf92a6795;p=platform%2Fupstream%2Fllvm.git Revert "[MachineLICM] Handle Subloops" This reverts commit 50dd383d08670960540fecb4b48c0f0429fbfba3. --- diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index d35cc6b..4e80e9b 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -778,25 +778,8 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { // Process the block SpeculationState = SpeculateUnknown; for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) { - if (!Hoist(&MI, Preheader)) { - // We have failed to hoist MI to outmost loop's preheader. If MI is in - // subloop, try to hoist it to subloop's preheader. - MachineLoop *InnerMostLoop = MLI->getLoopFor(MI.getParent()); - MachineBasicBlock *InnerMostLoopPreheader = - InnerMostLoop->getLoopPreheader(); - if (CurLoop != InnerMostLoop && InnerMostLoopPreheader) { - std::swap(CurLoop, InnerMostLoop); - std::swap(CurPreheader, InnerMostLoopPreheader); - Hoist(&MI, CurPreheader); - std::swap(CurLoop, InnerMostLoop); - std::swap(CurPreheader, InnerMostLoopPreheader); - } - // When MI is hoisted to inner-most loop's preheader, we need to update - // reg pressure because we have already visited inner-most loop's - // preheader. + if (!Hoist(&MI, Preheader)) UpdateRegPressure(&MI); - } - // If we have hoisted an instruction that may store, it can only be a // constant store. } diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll index af5acef..80c29ed 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll @@ -38,20 +38,20 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) { ; CHECK-NEXT: mov x16, x14 ; CHECK-NEXT: mov x17, x12 ; CHECK-NEXT: mov x18, x11 -; CHECK-NEXT: dup v0.8h, w15 ; CHECK-NEXT: .LBB0_6: // %vector.body ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldp q1, q2, [x16, #-16] +; CHECK-NEXT: ldp q0, q1, [x16, #-16] +; CHECK-NEXT: dup v3.8h, w15 ; CHECK-NEXT: subs x18, x18, #16 ; CHECK-NEXT: add x16, x16, #32 -; CHECK-NEXT: ldp q4, q3, [x17, #-32] -; CHECK-NEXT: smlal v4.4s, v0.4h, v1.4h +; CHECK-NEXT: ldp q4, q2, [x17, #-32] +; CHECK-NEXT: smlal v4.4s, v3.4h, v0.4h ; CHECK-NEXT: ldp q6, q5, [x17] -; CHECK-NEXT: smlal2 v3.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h -; CHECK-NEXT: stp q4, q3, [x17, #-32] -; CHECK-NEXT: smlal2 v5.4s, v0.8h, v2.8h +; CHECK-NEXT: smlal2 v2.4s, v3.8h, v0.8h +; CHECK-NEXT: smlal v6.4s, v3.4h, v1.4h +; CHECK-NEXT: stp q4, q2, [x17, #-32] +; CHECK-NEXT: smlal2 v5.4s, v3.8h, v1.8h ; CHECK-NEXT: stp q6, q5, [x17], #64 ; CHECK-NEXT: b.ne .LBB0_6 ; CHECK-NEXT: // %bb.7: // %middle.block diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 8750b4c..e8ceeec 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5 ; GFX908-NEXT: s_mul_i32 s0, s0, s5 ; GFX908-NEXT: s_add_i32 s1, s9, s1 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %Flow20 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -571,17 +571,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX908-NEXT: s_mov_b32 s9, s8 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v8, s8 ; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 ; GFX908-NEXT: v_mov_b32_e32 v7, s9 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -601,9 +599,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX908-NEXT: s_add_u32 s20, s20, s14 +; GFX908-NEXT: s_add_u32 s20, s20, s0 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] -; GFX908-NEXT: s_addc_u32 s21, s21, s15 +; GFX908-NEXT: s_addc_u32 s21, s21, s1 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 @@ -622,7 +620,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v19 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 -; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 @@ -650,7 +648,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17] +; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -665,7 +663,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_mov_b64 s[0:1], -1 +; GFX908-NEXT: s_mov_b64 s[14:15], -1 ; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 @@ -674,7 +672,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_addc_u32 s7, s7, 0 ; GFX908-NEXT: s_add_u32 s10, s10, s12 ; GFX908-NEXT: s_addc_u32 s11, s11, s13 -; GFX908-NEXT: s_mov_b64 s[0:1], 0 +; GFX908-NEXT: s_mov_b64 s[14:15], 0 ; GFX908-NEXT: s_branch .LBB3_1 ; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX908-NEXT: s_endpgm @@ -724,11 +722,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5 ; GFX90A-NEXT: s_mul_i32 s0, s0, s5 ; GFX90A-NEXT: s_add_i32 s1, s9, s1 -; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5 +; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %Flow20 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 @@ -738,14 +736,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1 ; GFX90A-NEXT: s_mov_b32 s9, s8 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0 +; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0 +; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1 ; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -764,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX90A-NEXT: s_add_u32 s20, s20, s14 -; GFX90A-NEXT: s_addc_u32 s21, s21, s15 +; GFX90A-NEXT: s_add_u32 s20, s20, s0 +; GFX90A-NEXT: s_addc_u32 s21, s21, s1 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] @@ -785,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 @@ -806,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: s_branch .LBB3_4 ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 -; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17] +; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 @@ -821,7 +817,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_mov_b64 s[0:1], -1 +; GFX90A-NEXT: s_mov_b64 s[14:15], -1 ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 @@ -830,7 +826,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_addc_u32 s7, s7, 0 ; GFX90A-NEXT: s_add_u32 s10, s10, s12 ; GFX90A-NEXT: s_addc_u32 s11, s11, s13 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_mov_b64 s[14:15], 0 ; GFX90A-NEXT: s_branch .LBB3_1 ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll index f40e48a..52ae259 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll +++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll @@ -1,11 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool llc +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}negated_cond: ; GCN: .LBB0_1: +; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], ; GCN: .LBB0_3: ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp +; GCN: s_andn2_b64 vcc, exec, [[CC]] ; GCN: s_lshl_b32 s12, s12, 5 ; GCN: s_cbranch_vccz .LBB0_6 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) { @@ -36,9 +38,17 @@ bb4: ; GCN-LABEL: {{^}}negated_cond_dominated_blocks: ; GCN: s_cmp_lg_u32 +; GCN: s_cselect_b64 [[CC1:[^,]+]], -1, 0 +; GCN: s_branch [[BB1:.LBB[0-9]+_[0-9]+]] +; GCN: [[BB0:.LBB[0-9]+_[0-9]+]] ; GCN-NOT: v_cndmask_b32 ; GCN-NOT: v_cmp +; GCN: [[BB1]]: +; GCN: s_mov_b64 vcc, [[CC1]] +; GCN: s_cbranch_vccz [[BB2:.LBB[0-9]+_[0-9]+]] ; GCN: s_mov_b64 vcc, exec +; GCN: s_cbranch_execnz [[BB0]] +; GCN: [[BB2]]: define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) { bb: br label %bb2 @@ -71,5 +81,3 @@ bb7: %tmp8 = icmp eq i32 %tmp7, 32 br i1 %tmp8, label %bb3, label %bb4 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 0147b5a..d494123 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -178,10 +178,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_15 ; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -209,7 +207,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57] @@ -238,7 +236,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -467,10 +465,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow16 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 @@ -498,7 +494,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65] ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57] @@ -527,7 +523,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] +; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index c206859..ad63e9e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -447,69 +447,67 @@ end: define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 { ; CHECK-LABEL: arm_mat_mult_q31: ; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: ldrd r9, r12, [sp, #128] +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: ldrd r9, r12, [sp, #120] ; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w r7, r6, r7, lsr #1 +; CHECK-NEXT: vdup.32 q1, r9 ; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: vshl.i32 q3, q1, #3 ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: add.w r10, r6, r7, lsr #2 +; CHECK-NEXT: adr r7, .LCPI9_0 ; CHECK-NEXT: adr r6, .LCPI9_1 +; CHECK-NEXT: vldrw.u32 q2, [r7] ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r7, .LCPI9_0 -; CHECK-NEXT: vldrw.u32 q1, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vdup.32 q0, r9 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vshl.i32 q3, q0, #3 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB9_2 Depth 2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: mul lr, r8, r12 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: mul r6, r8, r9 -; CHECK-NEXT: vdup.32 q4, lr -; CHECK-NEXT: vshl.i32 q4, q4, #2 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q0 +; CHECK-NEXT: mul r11, r8, r9 +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: mul r7, r8, r12 ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov q7, q2 +; CHECK-NEXT: vdup.32 q5, r7 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vshl.i32 q5, q5, #2 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q5, q5, r0 ; CHECK-NEXT: dls lr, r10 -; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: vmlas.i32 q7, q0, r7 -; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlas.i32 q6, q2, r5 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q0, q7, q3 -; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2] -; CHECK-NEXT: vldrw.u32 q7, [q6, #32]! -; CHECK-NEXT: vmul.i32 q1, q1, q7 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vadd.i32 q5, q1, q5 +; CHECK-NEXT: vadd.i32 q7, q6, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q5, #32]! +; CHECK-NEXT: vmul.i32 q0, q0, q6 +; CHECK-NEXT: vmov q6, q7 +; CHECK-NEXT: vadd.i32 q4, q0, q4 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2 -; CHECK-NEXT: adds r5, r7, r6 -; CHECK-NEXT: adds r7, #1 -; CHECK-NEXT: vaddv.u32 r4, q5 -; CHECK-NEXT: cmp r7, r9 -; CHECK-NEXT: str.w r4, [r2, r5, lsl #2] +; CHECK-NEXT: add.w r4, r5, r11 +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: vaddv.u32 r6, q4 +; CHECK-NEXT: cmp r5, r9 +; CHECK-NEXT: str.w r6, [r2, r4, lsl #2] ; CHECK-NEXT: bne .LBB9_2 ; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1 @@ -517,9 +515,10 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* ; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.6: @ %for.end25 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.7: ; CHECK-NEXT: .LCPI9_0: @@ -860,17 +859,17 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: ldrd r2, r7, [sp, #136] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldrd r2, r7, [sp, #104] ; CHECK-NEXT: add.w r8, r7, #10 ; CHECK-NEXT: adr r7, .LCPI11_0 -; CHECK-NEXT: ldr r1, [sp, #128] +; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: mov.w r9, #6 ; CHECK-NEXT: movs r6, #11 ; CHECK-NEXT: vshl.i32 q0, q0, #2 @@ -881,7 +880,7 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly ; CHECK-NEXT: @ Child Loop BB11_3 Depth 3 ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 @@ -897,41 +896,38 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly ; CHECK-NEXT: @ Child Loop BB11_4 Depth 4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 ; CHECK-NEXT: dls lr, r9 -; CHECK-NEXT: vdup.32 q2, r10 ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: mov.w r11, #4 -; CHECK-NEXT: vdup.32 q3, r5 -; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ => This Loop Header: Depth=4 ; CHECK-NEXT: @ Child Loop BB11_5 Depth 5 -; CHECK-NEXT: mul r7, r11, r6 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vadd.i32 q5, q1, r7 -; CHECK-NEXT: vmla.i32 q4, q5, r2 -; CHECK-NEXT: vldrw.u32 q5, [sp, #8] @ 16-byte Reload -; CHECK-NEXT: adds r7, #113 -; CHECK-NEXT: vadd.i32 q6, q1, r7 -; CHECK-NEXT: mov r7, r8 -; CHECK-NEXT: vmla.i32 q5, q6, r2 +; CHECK-NEXT: mul r4, r11, r6 +; CHECK-NEXT: vdup.32 q3, r5 +; CHECK-NEXT: vdup.32 q2, r7 +; CHECK-NEXT: vadd.i32 q4, q1, r4 +; CHECK-NEXT: vmla.i32 q3, q4, r2 +; CHECK-NEXT: adds r4, #113 +; CHECK-NEXT: vadd.i32 q4, q1, r4 +; CHECK-NEXT: mov r4, r8 +; CHECK-NEXT: vmla.i32 q2, q4, r2 ; CHECK-NEXT: .LBB11_5: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=2 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5 -; CHECK-NEXT: vldrb.s32 q2, [r0, q5] -; CHECK-NEXT: vadd.i32 q7, q5, q0 -; CHECK-NEXT: vldrb.s32 q5, [r1, q4] -; CHECK-NEXT: vadd.i32 q6, q4, q0 -; CHECK-NEXT: vadd.i32 q2, q2, r2 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: vmlava.u32 r12, q2, q5 -; CHECK-NEXT: vmov q5, q7 -; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vldrb.s32 q6, [r0, q2] +; CHECK-NEXT: vadd.i32 q5, q2, q0 +; CHECK-NEXT: vadd.i32 q4, q3, q0 +; CHECK-NEXT: subs r4, #4 +; CHECK-NEXT: vadd.i32 q2, q6, r2 +; CHECK-NEXT: vldrb.s32 q6, [r1, q3] +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vmlava.u32 r12, q2, q6 +; CHECK-NEXT: vmov q2, q5 ; CHECK-NEXT: bne .LBB11_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4 @@ -940,18 +936,18 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly ; CHECK-NEXT: @ %bb.7: @ %for.cond.cleanup77.i ; CHECK-NEXT: @ in Loop: Header=BB11_3 Depth=3 ; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: adds r4, #1 +; CHECK-NEXT: add.w r10, r10, #1 ; CHECK-NEXT: cmp r5, r2 ; CHECK-NEXT: bne .LBB11_3 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2 -; CHECK-NEXT: add.w r10, r10, #1 -; CHECK-NEXT: cmp r10, r3 +; CHECK-NEXT: adds r7, #1 +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i ; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1 ; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r7, [sp, #180] +; CHECK-NEXT: ldr r7, [sp, #148] ; CHECK-NEXT: adds r5, #1 ; CHECK-NEXT: cmp r5, r7 ; CHECK-NEXT: it eq diff --git a/llvm/test/CodeGen/WebAssembly/reg-stackify.ll b/llvm/test/CodeGen/WebAssembly/reg-stackify.ll index f9845d4..28f167e 100644 --- a/llvm/test/CodeGen/WebAssembly/reg-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/reg-stackify.ll @@ -471,7 +471,8 @@ define i32 @commute_to_fix_ordering(i32 %arg) { ; CHECK-LABEL: multiple_defs: ; CHECK: f64.add $push[[NUM0:[0-9]+]]=, ${{[0-9]+}}, $pop{{[0-9]+}}{{$}} ; CHECK-NEXT: local.tee $push[[NUM1:[0-9]+]]=, $[[NUM2:[0-9]+]]=, $pop[[NUM0]]{{$}} -; CHECK-NEXT: f64.select ${{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}} +; CHECK-NEXT: f64.select $push{{[0-9]+}}=, $pop{{[0-9]+}}, $pop[[NUM1]], ${{[0-9]+}}{{$}} +; CHECK: $[[NUM2]]=, ; NOREGS-LABEL: multiple_defs: ; NOREGS: f64.add ; NOREGS: local.tee