; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.
+; Insert 5.0 (0x40a00000) into lane 0 of a uniform <2 x float>: the constant
+; is materialized into s4 (overwriting the loaded lane-0 value) and the pair
+; is written back with a single buffer_store_dwordx2.
+define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+; SI-LABEL: insertelement_v2f32_0:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: insertelement_v2f32_0:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x40a00000
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
+  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; Same as above but for lane 1: 5.0 overwrites s5 instead of s4, so the
+; loaded lane-0 value in s4 is kept and the pair is stored as one dwordx2.
+define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
+; SI-LABEL: insertelement_v2f32_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s5, 0x40a00000
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: insertelement_v2f32_1:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s5, 0x40a00000
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
+  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; Integer variant: insert 999 (0x3e7, small enough for s_movk_i32) into lane 0
+; of a uniform <2 x i32>; result is stored with a single buffer_store_dwordx2.
+define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+; SI-LABEL: insertelement_v2i32_0:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0x3e7
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: insertelement_v2i32_0:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_movk_i32 s4, 0x3e7
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
+  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+; Integer variant for lane 1: 999 overwrites s5 while the loaded lane-0 value
+; in s4 is preserved; the pair is stored with a single buffer_store_dwordx2.
+define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+; SI-LABEL: insertelement_v2i32_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s5, 0x3e7
+; SI-NEXT: s_mov_b32 s3, 0x100f000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: insertelement_v2i32_1:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_movk_i32 s5, 0x3e7
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
+  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
+  ret void
+}
; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s0, 0
-; SI-NEXT: s_cbranch_scc0 BB26_2
+; SI-NEXT: s_cbranch_scc0 BB30_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s1, s[6:7], 0x1
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz BB26_3
-; SI-NEXT: s_branch BB26_4
-; SI-NEXT: BB26_2:
-; SI-NEXT: BB26_3: ; %if
+; SI-NEXT: s_cbranch_vccz BB30_3
+; SI-NEXT: s_branch BB30_4
+; SI-NEXT: BB30_2:
+; SI-NEXT: BB30_3: ; %if
; SI-NEXT: s_load_dword s1, s[6:7], 0x0
-; SI-NEXT: BB26_4: ; %endif
+; SI-NEXT: BB30_4: ; %endif
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_cbranch_scc0 BB26_2
+; VI-NEXT: s_cbranch_scc0 BB30_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dword s1, s[6:7], 0x4
-; VI-NEXT: s_cbranch_execz BB26_3
-; VI-NEXT: s_branch BB26_4
-; VI-NEXT: BB26_2:
-; VI-NEXT: BB26_3: ; %if
+; VI-NEXT: s_cbranch_execz BB30_3
+; VI-NEXT: s_branch BB30_4
+; VI-NEXT: BB30_2:
+; VI-NEXT: BB30_3: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
-; VI-NEXT: BB26_4: ; %endif
+; VI-NEXT: BB30_4: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_mov_b32 s7, 0x1100f000