From 051112a3c55a45686942c643ee7ed1449fffbf99 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 6 Mar 2023 10:40:25 -0400 Subject: [PATCH] AMDGPU: Add baseline test for SWDEV-380865 This demonstrates really bad rematerialization support for 64-bit constants which need to be split into 32-bit pieces. --- llvm/test/CodeGen/AMDGPU/swdev380865.ll | 135 ++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/swdev380865.ll diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll new file mode 100644 index 0000000..f61481f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stress-regalloc=4 -o - %s | FileCheck %s + +; Make sure we can rematerialize split 64-bit constants (which +; MachineLICM hoisted out of the loop) and avoid spilling inside the +; loop. +; +; MachineLICM originally believed the constant materializations to be +; rematerializable, but the lowered REG_SEQUENCE uses they coalesce +; into were not. The InlineSpiller also did not recognize redundant +; spills inside the loop, so we would repeatedly reload the same +; values. 
+ +define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) { +; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[0:1], 0 +; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 +; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1 +; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 +; CHECK-NEXT: s_mov_b32 s5, 0x40280000 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s2, 0 +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s1, 0x40180000 +; CHECK-NEXT: v_writelane_b32 v0, s0, 1 +; CHECK-NEXT: v_writelane_b32 v0, s1, 2 +; CHECK-NEXT: s_mov_b32 s1, 0x40220000 +; CHECK-NEXT: v_writelane_b32 v0, s0, 3 +; CHECK-NEXT: v_writelane_b32 v0, s1, 4 +; CHECK-NEXT: s_mov_b32 s1, 0x40240000 +; CHECK-NEXT: v_writelane_b32 v0, s0, 5 +; CHECK-NEXT: v_writelane_b32 v0, s1, 6 +; CHECK-NEXT: s_mov_b32 s1, 0x40260000 +; CHECK-NEXT: v_writelane_b32 v0, s0, 7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], 0 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_mov_b32 s3, 0x40140000 +; CHECK-NEXT: v_writelane_b32 v0, s0, 9 +; CHECK-NEXT: v_writelane_b32 v0, s6, 10 +; CHECK-NEXT: v_writelane_b32 v0, s7, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 1 +; CHECK-NEXT: v_readlane_b32 s7, v0, 2 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: s_mov_b32 s1, s7 +; CHECK-NEXT: s_mov_b32 s0, s2 +; CHECK-NEXT: v_writelane_b32 v0, s6, 1 +; CHECK-NEXT: v_writelane_b32 v0, s7, 2 +; CHECK-NEXT: v_readlane_b32 s6, v0, 10 +; CHECK-NEXT: v_readlane_b32 s7, v0, 11 +; CHECK-NEXT: s_mov_b32 s6, s2 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[0:1] +; CHECK-NEXT: v_readlane_b32 s0, v0, 3 +; CHECK-NEXT: v_readlane_b32 s1, 
v0, 4 +; CHECK-NEXT: s_mov_b32 s3, s1 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[6:7] +; CHECK-NEXT: v_writelane_b32 v0, s0, 3 +; CHECK-NEXT: v_writelane_b32 v0, s1, 4 +; CHECK-NEXT: v_readlane_b32 s0, v0, 5 +; CHECK-NEXT: v_readlane_b32 s1, v0, 6 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: s_mov_b32 s3, s1 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: v_writelane_b32 v0, s0, 5 +; CHECK-NEXT: v_writelane_b32 v0, s1, 6 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_readlane_b32 s0, v0, 7 +; CHECK-NEXT: v_readlane_b32 s1, v0, 8 +; CHECK-NEXT: s_mov_b32 s3, s1 +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0x40140000 +; CHECK-NEXT: s_mov_b32 s2, s0 +; CHECK-NEXT: s_mov_b32 s1, s3 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_writelane_b32 v0, s0, 7 +; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: v_writelane_b32 v0, s1, 8 +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: v_readlane_b32 s2, v0, 9 +; CHECK-NEXT: s_add_i32 s2, s2, s0 +; CHECK-NEXT: v_writelane_b32 v0, s2, 9 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[4:5] +; CHECK-NEXT: v_readlane_b32 s0, v0, 9 +; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: global_store_dwordx2 v[3:4], v[1:2], off +; CHECK-NEXT: s_endpgm +entry: + %0 = load i32, ptr addrspace(4) null, align 4 + %cmp6 = icmp slt i32 0, 2560 + br i1 %cmp6, label %for.cond4.preheader, label %for.cond.cleanup + +for.cond4.preheader: ; preds = %for.cond4.preheader, %entry + %idx.07 = phi i32 [ %add13, %for.cond4.preheader ], [ 0, %entry ] + %arrayidx.promoted = load double, ptr addrspace(1) 
null, align 8 + %add9 = fadd contract double %arrayidx.promoted, 0.000000e+00 + %add9.1 = fadd contract double %add9, 5.000000e+00 + %add9.2 = fadd contract double %add9.1, 6.000000e+00 + %add9.3 = fadd contract double %add9.2, 7.000000e+00 + %add9.4 = fadd contract double %add9.3, 9.000000e+00 + %add9.5 = fadd contract double %add9.4, 1.000000e+01 + %add9.6 = fadd contract double %add9.5, 1.100000e+01 + %add9.7 = fadd contract double %add9.6, 1.200000e+01 + store double %add9.7, ptr addrspace(1) null, align 8 + %add13 = add i32 %idx.07, %0 + %cmp = icmp slt i32 %add13, 2560 + br i1 %cmp, label %for.cond4.preheader, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond4.preheader, %entry + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare align 4 ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -- 2.7.4