From 275b0c5a5a3765373dc0b076e03e5c6cad780ddb Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Mar 2022 15:46:01 -0700 Subject: [PATCH] [AMDGPU] Add 2 gfx940 mfma tests. NFC. --- llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 18 +++++++++++++ .../CodeGen/AMDGPU/mfma-no-register-aliasing.ll | 31 +++++++++++----------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index d2098d9..ba77723 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: @@ -12,6 +13,7 @@ ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -46,6 +48,7 @@ exit: ; Check that we do not use 32 temp sgprs as well. ; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GFX940: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000 ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] @@ -53,6 +56,7 @@ exit: ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -88,6 +92,7 @@ exit: ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -248,10 +253,13 @@ exit: ; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} ; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX940-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}} +; GFX940-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -284,6 +292,7 @@ exit: ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -349,11 +358,13 @@ exit: ; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} ; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] ; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] +; GFX940: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}} ; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -416,6 +427,7 @@ exit: ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0 ; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} +; GFX940-DAG: s_load_dword [[TMP:s[0-9]+]], ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} ; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} @@ -454,6 +466,7 @@ exit: ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -491,11 +504,13 @@ exit: ; GFX90A-NOT: v_accvgpr ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} ; GFX90A-NOT: v_accvgpr +; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} ; GCN-NOT: v_accvgpr ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -530,6 +545,7 @@ exit: ; GFX90A-NOT: v_accvgpr ; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} ; GFX90A-NOT: v_accvgpr +; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} ; Check that we are using only one tmp VGPR. @@ -541,6 +557,7 @@ exit: ; GCN: [[LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[LOOP]] @@ -614,6 +631,7 @@ exit: ; GCN: [[INNER_LOOP:.LBB[0-9_]+]]: ; GCN-NOT: v_accvgpr ; GFX908_A: v_mfma_f32_32x32x1f32 +; GFX940: v_mfma_f32_32x32x1_2b_f32 ; GCN-NOT: v_accvgpr ; GCN: s_cbranch_scc1 [[INNER_LOOP]] ; GCN-NOT: v_accvgpr diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index feea8e6..3c26845 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s @@ -11,11 +12,11 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float> declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: -; GREEDY: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] -; GREEDY: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] -; FAST: v_mfma_f32_32x32x1f32 a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] -; FAST: v_mfma_f32_32x32x1f32 a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] -; GCN: v_mfma_f32_32x32x1f32 a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; GREEDY: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; GREEDY: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] +; FAST: v_mfma_f32_32x32x1{{.*}} a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; FAST: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95] +; GCN: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31] define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) #0 { bb: %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg @@ -28,11 +29,11 @@ bb: } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GREEDY: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; GREEDY: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] -; FAST: v_mfma_f32_16x16x1f32 a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] -; FAST: v_mfma_f32_16x16x1f32 a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] -; GCN: v_mfma_f32_16x16x1f32 a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; GREEDY: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] +; FAST: v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; FAST: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47] +; GCN: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15] define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) #0 { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -47,11 +48,11 @@ bb: ; This instruction allows the overlap since it only read 4 registers. ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: -; GREEDY: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; GREEDY: v_mfma_f32_4x4x1f32 a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1f32 a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] -; FAST: v_mfma_f32_4x4x1f32 a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] -; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GREEDY: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; GREEDY: v_mfma_f32_4x4x1{{.*}} a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1{{.*}} a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] +; FAST: v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11] +; GCN: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) #0 { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg -- 2.7.4