From: Matt Arsenault Date: Wed, 3 Apr 2019 00:00:58 +0000 (+0000) Subject: AMDGPU: Don't use the default cpu in a few tests X-Git-Tag: llvmorg-10-init~8627 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2065680b471d0f635d31199531042478748cd10f;p=platform%2Fupstream%2Fllvm.git AMDGPU: Don't use the default cpu in a few tests Avoids unnecessary test changes in a future commit. llvm-svn: 357539 --- diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 4225ab3..4e81381d 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-- -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,SI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,FLAT,TONGA ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=FUNC,FLAT,VI @@ -18,15 +18,15 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { ; SI-LABEL: s_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s2, s2 -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, s0 +; SI-NEXT: s_lshr_b32 s0, s0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i16: @@ -50,19 +50,18 @@ define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i16: @@ -89,14 +88,14 @@ define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrsp define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; SI-LABEL: s_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s4, s2 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i32: @@ -122,9 +121,9 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrsp ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s6, -1 @@ -160,16 +159,16 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrsp define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s5, s5 -; SI-NEXT: s_brev_b32 s4, s4 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_brev_b32 s1, s1 +; SI-NEXT: s_brev_b32 s0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i32: @@ -197,9 +196,9 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s6, -1 @@ -237,74 +236,74 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s17, 0xff0000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0 -; SI-NEXT: s_mov_b32 s13, 0xff00 -; SI-NEXT: s_mov_b32 s22, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s23, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s24, 0x33333333 -; SI-NEXT: s_mov_b32 s25, 0xcccccccc -; SI-NEXT: s_mov_b32 s26, 0x55555555 -; SI-NEXT: s_mov_b32 s27, 0xaaaaaaaa -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: s_mov_b32 s10, s3 -; SI-NEXT: s_mov_b32 s12, s3 -; SI-NEXT: s_mov_b32 s14, s3 -; SI-NEXT: s_mov_b32 s16, s3 +; SI-NEXT: s_mov_b32 s10, 0xff0000 +; SI-NEXT: s_mov_b32 s11, 0xff00 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_alignbit_b32 v1, s1, v0, 24 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, 8 -; SI-NEXT: s_lshr_b32 s2, s1, 24 -; SI-NEXT: s_lshr_b32 s8, s1, 8 -; SI-NEXT: s_lshl_b64 s[18:19], s[0:1], 8 -; SI-NEXT: s_lshl_b64 s[20:21], s[0:1], 24 -; SI-NEXT: s_lshl_b32 s15, s0, 24 -; SI-NEXT: s_lshl_b32 s0, s0, 8 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_alignbit_b32 v1, s5, v0, 24 +; SI-NEXT: v_alignbit_b32 v0, s5, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s5, 8 +; SI-NEXT: v_and_b32_e32 v1, s10, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff000000, v0 -; SI-NEXT: s_and_b32 s8, s8, s13 -; SI-NEXT: s_and_b32 s11, s19, 0xff -; SI-NEXT: s_and_b32 s13, s21, s13 -; SI-NEXT: s_and_b32 s17, s0, s17 +; SI-NEXT: s_lshr_b32 s2, s5, 24 +; SI-NEXT: s_and_b32 s6, s6, s11 +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] -; SI-NEXT: s_or_b64 s[2:3], s[12:13], s[10:11] -; SI-NEXT: s_or_b64 s[8:9], s[14:15], s[16:17] -; SI-NEXT: v_or_b32_e32 v0, s0, v0 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] -; SI-NEXT: v_or_b32_e32 v2, s0, v0 -; SI-NEXT: v_or_b32_e32 v3, s1, v1 -; SI-NEXT: v_and_b32_e32 v1, s22, v3 -; SI-NEXT: v_and_b32_e32 v0, s22, v2 -; SI-NEXT: v_and_b32_e32 v3, s23, v3 -; SI-NEXT: v_and_b32_e32 v2, s23, v2 +; SI-NEXT: s_lshl_b64 s[8:9], s[4:5], 24 +; SI-NEXT: v_or_b32_e32 v0, s6, v0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_lshl_b64 s[6:7], s[4:5], 8 +; SI-NEXT: s_lshl_b32 s2, s4, 8 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_mov_b32 s6, s3 +; SI-NEXT: s_and_b32 s9, s9, s11 +; SI-NEXT: s_mov_b32 s8, s3 +; SI-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; SI-NEXT: s_lshl_b32 s9, s4, 24 +; SI-NEXT: s_and_b32 s5, s2, s10 +; SI-NEXT: s_mov_b32 s4, s3 +; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[4:5] +; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; SI-NEXT: v_or_b32_e32 v2, s2, v0 +; SI-NEXT: v_or_b32_e32 v3, s3, v1 +; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f +; SI-NEXT: v_and_b32_e32 v1, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s2, v2 +; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 +; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v2, s2, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 +; SI-NEXT: s_mov_b32 s2, 0x33333333 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_and_b32_e32 v1, s24, v3 -; SI-NEXT: v_and_b32_e32 v0, s24, v2 -; SI-NEXT: v_and_b32_e32 v3, s25, v3 -; SI-NEXT: v_and_b32_e32 v2, s25, v2 +; SI-NEXT: v_and_b32_e32 v1, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s2, v2 +; SI-NEXT: s_mov_b32 s2, 0xcccccccc +; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v2, s2, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 +; SI-NEXT: s_mov_b32 s2, 0x55555555 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_and_b32_e32 v1, s26, v3 -; SI-NEXT: v_and_b32_e32 v0, s26, v2 -; SI-NEXT: v_and_b32_e32 v3, s27, v3 -; SI-NEXT: v_and_b32_e32 v2, s27, v2 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: v_and_b32_e32 v1, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s2, v2 +; SI-NEXT: s_mov_b32 s2, 0xaaaaaaaa +; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v2, s2, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i64: @@ -388,61 +387,61 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s0, 0xff0000 ; SI-NEXT: s_mov_b32 s1, 0xff00 ; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f ; SI-NEXT: s_mov_b32 s3, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s8, 0x33333333 -; SI-NEXT: s_mov_b32 s9, 0xcccccccc -; SI-NEXT: s_mov_b32 s10, 0x55555555 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s11, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s6, 0x33333333 +; SI-NEXT: s_mov_b32 s8, 0xcccccccc +; SI-NEXT: s_mov_b32 s9, 0x55555555 +; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 ; SI-NEXT: v_alignbit_b32 v4, v1, v0, 24 ; SI-NEXT: v_alignbit_b32 v5, v1, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; SI-NEXT: v_lshl_b64 v[1:2], v[0:1], 24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v4, s0, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff000000, v5 ; SI-NEXT: v_and_b32_e32 v7, s1, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v2, s1, v2 -; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v5, v7, v6 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v2, v0, v2 -; SI-NEXT: v_and_b32_e32 v1, s2, v2 -; SI-NEXT: v_and_b32_e32 v0, s2, v4 -; SI-NEXT: v_and_b32_e32 v3, s3, v2 -; SI-NEXT: v_and_b32_e32 v2, s3, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v4, v5 +; SI-NEXT: v_or_b32_e32 v3, v0, v2 +; SI-NEXT: v_and_b32_e32 v0, s2, v1 +; SI-NEXT: v_and_b32_e32 v2, s3, v1 +; SI-NEXT: v_and_b32_e32 v1, s2, v3 +; SI-NEXT: v_and_b32_e32 v3, s3, v3 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s8, v3 -; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v3 -; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v1, s6, v3 +; SI-NEXT: v_and_b32_e32 v0, s6, v2 +; SI-NEXT: v_and_b32_e32 v3, s8, v3 +; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s10, v3 -; SI-NEXT: v_and_b32_e32 v0, s10, v2 -; SI-NEXT: v_and_b32_e32 v3, s11, v3 -; SI-NEXT: v_and_b32_e32 v2, s11, v2 +; SI-NEXT: v_and_b32_e32 v1, s9, v3 +; SI-NEXT: v_and_b32_e32 v0, s9, v2 +; SI-NEXT: v_and_b32_e32 v3, s10, v3 +; SI-NEXT: v_and_b32_e32 v2, s10, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 @@ -527,126 +526,125 @@ define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s25, 0xff0000 ; SI-NEXT: s_mov_b32 s9, 0 -; SI-NEXT: s_mov_b32 s20, 0xff000000 -; SI-NEXT: s_mov_b32 s29, 0xff00 -; SI-NEXT: s_movk_i32 s27, 0xff -; SI-NEXT: s_mov_b32 s32, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s33, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s34, 0x33333333 -; SI-NEXT: s_mov_b32 s35, 0xcccccccc -; SI-NEXT: s_mov_b32 s36, 0x55555555 -; SI-NEXT: s_mov_b32 s37, 0xaaaaaaaa -; SI-NEXT: s_mov_b32 s11, s9 -; SI-NEXT: s_mov_b32 s12, s9 -; SI-NEXT: s_mov_b32 s14, s9 -; SI-NEXT: s_mov_b32 s16, s9 -; SI-NEXT: s_mov_b32 s18, s9 -; SI-NEXT: s_mov_b32 s21, s9 -; SI-NEXT: s_mov_b32 s22, s9 -; SI-NEXT: s_mov_b32 s24, s9 -; SI-NEXT: s_mov_b32 s26, s9 -; SI-NEXT: s_mov_b32 s28, s9 +; SI-NEXT: s_mov_b32 s12, 0xff0000 +; SI-NEXT: s_mov_b32 s13, 0xff000000 +; SI-NEXT: s_mov_b32 s14, 0xff00 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_alignbit_b32 v1, s3, v0, 24 ; SI-NEXT: v_alignbit_b32 v0, s3, v0, 8 +; SI-NEXT: s_lshr_b32 s6, s3, 8 +; SI-NEXT: v_and_b32_e32 v1, s12, v1 +; SI-NEXT: v_and_b32_e32 v0, s13, v0 ; SI-NEXT: s_lshr_b32 s8, s3, 24 -; SI-NEXT: s_lshr_b32 s10, s3, 8 -; SI-NEXT: s_lshl_b32 s13, s2, 24 -; SI-NEXT: s_lshl_b32 s15, s2, 8 -; SI-NEXT: s_lshl_b64 s[30:31], s[2:3], 8 -; SI-NEXT: s_and_b32 s17, s31, s27 -; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], 24 -; SI-NEXT: v_mov_b32_e32 v2, s0 -; SI-NEXT: v_alignbit_b32 v3, s1, v2, 24 -; SI-NEXT: v_alignbit_b32 v2, s1, v2, 8 -; SI-NEXT: s_and_b32 s10, s10, s29 -; SI-NEXT: s_lshr_b32 s30, s1, 8 -; SI-NEXT: s_lshl_b32 s23, s0, 24 -; SI-NEXT: s_and_b32 s15, s15, s25 -; SI-NEXT: s_lshl_b32 s38, s0, 8 -; SI-NEXT: s_and_b32 s19, s3, s29 -; SI-NEXT: s_lshl_b64 s[2:3], s[0:1], 8 -; SI-NEXT: v_and_b32_e32 v0, s20, v0 -; SI-NEXT: v_and_b32_e32 v2, s20, v2 -; SI-NEXT: s_and_b32 s20, s30, s29 -; SI-NEXT: s_lshl_b64 s[30:31], s[0:1], 24 -; SI-NEXT: v_and_b32_e32 v1, s25, v1 -; SI-NEXT: v_and_b32_e32 v3, s25, v3 -; SI-NEXT: s_and_b32 s25, s38, s25 -; SI-NEXT: s_and_b32 s27, s3, s27 -; SI-NEXT: s_and_b32 s29, s31, s29 +; SI-NEXT: s_and_b32 s6, s6, s14 +; SI-NEXT: s_mov_b32 s7, s9 +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_or_b64 s[2:3], s[10:11], s[8:9] -; SI-NEXT: s_or_b64 s[10:11], s[12:13], s[14:15] -; SI-NEXT: s_or_b64 s[12:13], s[18:19], s[16:17] -; SI-NEXT: v_or_b32_e32 v1, v2, v3 +; SI-NEXT: s_lshl_b32 s8, s2, 8 +; SI-NEXT: v_or_b32_e32 v0, s6, v0 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s11, s8, s12 +; SI-NEXT: s_lshl_b32 s7, s2, 24 +; SI-NEXT: s_mov_b32 s6, s9 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; SI-NEXT: s_lshl_b64 s[10:11], s[2:3], 8 +; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], 24 +; SI-NEXT: s_movk_i32 s15, 0xff +; SI-NEXT: s_and_b32 s11, s11, s15 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s3, s3, s14 +; SI-NEXT: s_mov_b32 s2, s9 +; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; SI-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: v_alignbit_b32 v5, s1, v4, 24 +; SI-NEXT: v_alignbit_b32 v4, s1, v4, 8 +; SI-NEXT: v_or_b32_e32 v2, s2, v0 +; SI-NEXT: s_lshr_b32 s2, s1, 8 +; SI-NEXT: v_or_b32_e32 v3, s3, v1 +; SI-NEXT: v_and_b32_e32 v5, s12, v5 +; SI-NEXT: v_and_b32_e32 v4, s13, v4 ; SI-NEXT: s_lshr_b32 s8, s1, 24 -; SI-NEXT: s_or_b64 s[0:1], s[22:23], s[24:25] -; SI-NEXT: s_or_b64 s[14:15], s[28:29], s[26:27] -; SI-NEXT: v_or_b32_e32 v0, s2, v0 -; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: s_or_b64 s[2:3], s[10:11], s[12:13] -; SI-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] -; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[14:15] -; SI-NEXT: v_or_b32_e32 v3, s2, v0 -; SI-NEXT: v_or_b32_e32 v4, s3, v2 -; SI-NEXT: v_or_b32_e32 v5, s8, v1 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_or_b32_e32 v6, s1, v0 -; SI-NEXT: v_and_b32_e32 v0, s32, v3 -; SI-NEXT: v_and_b32_e32 v1, s32, v4 -; SI-NEXT: v_and_b32_e32 v2, s33, v3 -; SI-NEXT: v_and_b32_e32 v3, s33, v4 -; SI-NEXT: v_or_b32_e32 v5, s0, v5 -; SI-NEXT: v_and_b32_e32 v4, s32, v6 -; SI-NEXT: v_and_b32_e32 v6, s33, v6 +; SI-NEXT: s_and_b32 s2, s2, s14 +; SI-NEXT: s_mov_b32 s3, s9 +; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_lshl_b32 s8, s0, 8 +; SI-NEXT: v_or_b32_e32 v4, s2, v4 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_lshl_b32 s3, s0, 24 +; SI-NEXT: s_mov_b32 s2, s9 +; SI-NEXT: s_and_b32 s11, s8, s12 +; SI-NEXT: s_mov_b32 s16, 0xf0f0f0f +; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; SI-NEXT: s_lshl_b64 s[10:11], s[0:1], 8 +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], 24 +; SI-NEXT: s_mov_b32 s17, 0xf0f0f0f0 +; SI-NEXT: v_and_b32_e32 v0, s16, v2 +; SI-NEXT: v_and_b32_e32 v1, s16, v3 +; SI-NEXT: v_and_b32_e32 v2, s17, v2 +; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: s_and_b32 s11, s11, s15 +; SI-NEXT: s_mov_b32 s10, s9 +; SI-NEXT: s_and_b32 s1, s1, s14 +; SI-NEXT: s_mov_b32 s0, s9 +; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 -; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 4 -; SI-NEXT: v_and_b32_e32 v3, s32, v5 -; SI-NEXT: v_and_b32_e32 v5, s33, v5 -; SI-NEXT: v_or_b32_e32 v7, v7, v0 -; SI-NEXT: v_or_b32_e32 v8, v8, v1 -; SI-NEXT: v_lshl_b64 v[0:1], v[3:4], 4 -; SI-NEXT: v_lshr_b64 v[2:3], v[5:6], 4 -; SI-NEXT: v_and_b32_e32 v4, s34, v7 -; SI-NEXT: v_and_b32_e32 v5, s34, v8 -; SI-NEXT: v_and_b32_e32 v6, s35, v7 -; SI-NEXT: v_and_b32_e32 v7, s35, v8 -; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_or_b32_e32 v9, v3, v1 -; SI-NEXT: v_lshl_b64 v[0:1], v[4:5], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 2 -; SI-NEXT: v_and_b32_e32 v4, s34, v8 -; SI-NEXT: v_and_b32_e32 v5, s34, v9 -; SI-NEXT: v_and_b32_e32 v6, s35, v8 -; SI-NEXT: v_and_b32_e32 v7, s35, v9 -; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_or_b32_e32 v9, v3, v1 -; SI-NEXT: v_lshl_b64 v[0:1], v[4:5], 2 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 2 -; SI-NEXT: v_and_b32_e32 v4, s36, v8 -; SI-NEXT: v_and_b32_e32 v5, s36, v9 -; SI-NEXT: v_and_b32_e32 v6, s37, v8 -; SI-NEXT: v_and_b32_e32 v7, s37, v9 -; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_or_b32_e32 v9, v3, v1 -; SI-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; SI-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; SI-NEXT: v_and_b32_e32 v4, s36, v8 -; SI-NEXT: v_and_b32_e32 v5, s36, v9 -; SI-NEXT: v_and_b32_e32 v6, s37, v8 -; SI-NEXT: v_and_b32_e32 v7, s37, v9 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 +; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: v_or_b32_e32 v6, s0, v4 +; SI-NEXT: v_or_b32_e32 v7, s1, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: s_mov_b32 s18, 0x33333333 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: s_mov_b32 s19, 0xcccccccc +; SI-NEXT: v_and_b32_e32 v0, s18, v2 +; SI-NEXT: v_and_b32_e32 v1, s18, v3 +; SI-NEXT: v_and_b32_e32 v4, s16, v6 +; SI-NEXT: v_and_b32_e32 v5, s16, v7 +; SI-NEXT: v_and_b32_e32 v2, s19, v2 +; SI-NEXT: v_and_b32_e32 v3, s19, v3 +; SI-NEXT: v_and_b32_e32 v6, s17, v6 +; SI-NEXT: v_and_b32_e32 v7, s17, v7 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v6, v6, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v5 +; SI-NEXT: s_mov_b32 s20, 0x55555555 +; SI-NEXT: v_or_b32_e32 v3, v3, v1 +; SI-NEXT: s_mov_b32 s21, 0xaaaaaaaa +; SI-NEXT: v_and_b32_e32 v0, s20, v2 +; SI-NEXT: v_and_b32_e32 v1, s20, v3 +; SI-NEXT: v_and_b32_e32 v4, s18, v6 +; SI-NEXT: v_and_b32_e32 v5, s18, v7 +; SI-NEXT: v_and_b32_e32 v2, s21, v2 +; SI-NEXT: v_and_b32_e32 v3, s21, v3 +; SI-NEXT: v_and_b32_e32 v6, s19, v6 +; SI-NEXT: v_and_b32_e32 v7, s19, v7 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 2 +; SI-NEXT: v_or_b32_e32 v2, v2, v0 +; SI-NEXT: v_or_b32_e32 v0, v6, v4 +; SI-NEXT: v_or_b32_e32 v7, v7, v5 +; SI-NEXT: v_and_b32_e32 v5, s20, v7 +; SI-NEXT: v_and_b32_e32 v4, s20, v0 +; SI-NEXT: v_and_b32_e32 v6, s21, v0 +; SI-NEXT: v_and_b32_e32 v7, s21, v7 +; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 -; SI-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; SI-NEXT: v_lshr_b64 v[4:5], v[6:7], 1 -; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_or_b32_e32 v0, v6, v4 +; SI-NEXT: v_or_b32_e32 v1, v7, v5 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -787,31 +785,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_mov_b32 s0, 0xff0000 ; SI-NEXT: s_mov_b32 s1, 0xff000000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v3, v2, 24 -; SI-NEXT: v_alignbit_b32 v9, v3, v2, 8 -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], 8 -; SI-NEXT: v_lshl_b64 v[6:7], v[2:3], 24 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v2 -; SI-NEXT: v_alignbit_b32 v6, v1, v0, 24 -; SI-NEXT: v_alignbit_b32 v11, v1, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v3 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v0 -; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 -; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 24 ; SI-NEXT: s_mov_b32 s2, 0xff00 ; SI-NEXT: s_movk_i32 s3, 0xff ; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f @@ -820,38 +800,56 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: s_mov_b32 s11, 0xcccccccc ; SI-NEXT: s_mov_b32 s12, 0x55555555 ; SI-NEXT: s_mov_b32 s13, 0xaaaaaaaa -; SI-NEXT: v_and_b32_e32 v0, s0, v8 -; SI-NEXT: v_and_b32_e32 v2, s1, v9 -; SI-NEXT: v_and_b32_e32 v8, s2, v10 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], 8 +; SI-NEXT: v_alignbit_b32 v6, v3, v2, 24 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; SI-NEXT: v_lshl_b64 v[3:4], v[2:3], 24 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 +; SI-NEXT: v_alignbit_b32 v12, v1, v0, 24 +; SI-NEXT: v_alignbit_b32 v13, v1, v0, 8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v0 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 24 +; SI-NEXT: v_and_b32_e32 v0, s0, v6 +; SI-NEXT: v_and_b32_e32 v2, s1, v7 +; SI-NEXT: v_and_b32_e32 v6, s2, v9 +; SI-NEXT: v_and_b32_e32 v7, s0, v11 +; SI-NEXT: v_and_b32_e32 v9, s0, v12 +; SI-NEXT: v_and_b32_e32 v11, s1, v13 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v2, v6, v8 +; SI-NEXT: v_and_b32_e32 v12, s2, v15 +; SI-NEXT: v_and_b32_e32 v13, s0, v17 ; SI-NEXT: v_and_b32_e32 v5, s3, v5 -; SI-NEXT: v_and_b32_e32 v7, s2, v7 -; SI-NEXT: v_and_b32_e32 v4, s0, v4 -; SI-NEXT: v_and_b32_e32 v6, s0, v6 -; SI-NEXT: v_and_b32_e32 v9, s1, v11 -; SI-NEXT: v_and_b32_e32 v10, s2, v12 +; SI-NEXT: v_and_b32_e32 v4, s2, v4 ; SI-NEXT: v_and_b32_e32 v3, s3, v3 ; SI-NEXT: v_and_b32_e32 v1, s2, v1 -; SI-NEXT: v_and_b32_e32 v11, s0, v17 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_or_b32_e32 v2, v8, v13 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v4, v14, v4 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v7, v10, v15 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_or_b32_e32 v3, v16, v11 +; SI-NEXT: v_or_b32_e32 v6, v10, v7 +; SI-NEXT: v_or_b32_e32 v7, v11, v9 ; SI-NEXT: v_or_b32_e32 v2, v0, v2 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_or_b32_e32 v7, v3, v1 -; SI-NEXT: v_and_b32_e32 v1, s8, v4 +; SI-NEXT: v_or_b32_e32 v8, v12, v14 +; SI-NEXT: v_or_b32_e32 v0, v4, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v9, v16, v13 +; SI-NEXT: v_or_b32_e32 v5, v7, v8 +; SI-NEXT: v_or_b32_e32 v3, v6, v0 +; SI-NEXT: v_or_b32_e32 v7, v9, v1 ; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v4 +; SI-NEXT: v_and_b32_e32 v1, s8, v3 ; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v3, s9, v3 +; SI-NEXT: v_and_b32_e32 v4, s8, v5 +; SI-NEXT: v_and_b32_e32 v6, s9, v5 ; SI-NEXT: v_and_b32_e32 v5, s8, v7 -; SI-NEXT: v_and_b32_e32 v4, s8, v6 ; SI-NEXT: v_and_b32_e32 v7, s9, v7 -; SI-NEXT: v_and_b32_e32 v6, s9, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 @@ -862,10 +860,10 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 ; SI-NEXT: v_and_b32_e32 v1, s10, v3 ; SI-NEXT: v_and_b32_e32 v0, s10, v2 -; SI-NEXT: v_and_b32_e32 v3, s11, v3 -; SI-NEXT: v_and_b32_e32 v2, s11, v2 ; SI-NEXT: v_and_b32_e32 v5, s10, v7 ; SI-NEXT: v_and_b32_e32 v4, s10, v6 +; SI-NEXT: v_and_b32_e32 v3, s11, v3 +; SI-NEXT: v_and_b32_e32 v2, s11, v2 ; SI-NEXT: v_and_b32_e32 v7, s11, v7 ; SI-NEXT: v_and_b32_e32 v6, s11, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 @@ -878,10 +876,10 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 ; SI-NEXT: v_and_b32_e32 v1, s12, v3 ; SI-NEXT: v_and_b32_e32 v0, s12, v2 -; SI-NEXT: v_and_b32_e32 v3, s13, v3 -; SI-NEXT: v_and_b32_e32 v2, s13, v2 ; SI-NEXT: v_and_b32_e32 v5, s12, v7 ; SI-NEXT: v_and_b32_e32 v4, s12, v6 +; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_and_b32_e32 v2, s13, v2 ; SI-NEXT: v_and_b32_e32 v7, s13, v7 ; SI-NEXT: v_and_b32_e32 v6, s13, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index 5df1d55..8976835 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s ; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty: ; GCN: enable_sgpr_kernarg_segment_ptr = 1 @@ -162,15 +162,16 @@ define void @opencl_func_call_implicitarg_ptr_func() #0 { ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; MESA: v_mov_b32_e32 v0, s6 -; MESA: v_mov_b32_e32 v1, s7 -; MESA: v_mov_b32_e32 v2, s8 -; MESA: v_mov_b32_e32 v3, s9 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; MESA-DAG: v_mov_b32_e32 v0, s6 +; MESA-DAG: v_mov_b32_e32 v1, s7 +; MESA-DAG: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; MESA: v_mov_b32_e32 v0, s8 +; MESA: v_mov_b32_e32 v1, s9 +; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 + ; HSA: v_mov_b32_e32 v0, s6 ; HSA: v_mov_b32_e32 v1, s7 ; HSA: flat_load_dword v0, v[0:1] -; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64 ; HSA: v_mov_b32_e32 v0, s8 ; HSA: v_mov_b32_e32 v1, s9 ; HSA: flat_load_dword v0, v[0:1] @@ -188,15 +189,18 @@ define void @func_kernarg_implicitarg_ptr() #0 { ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; MESA: v_mov_b32_e32 v0, s6 -; MESA: v_mov_b32_e32 v1, s7 -; MESA: v_mov_b32_e32 v2, s8 -; MESA: v_mov_b32_e32 v3, s9 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; MESA-DAG: v_mov_b32_e32 v0, s6 +; MESA-DAG: v_mov_b32_e32 v1, s7 +; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; MESA-DAG: v_mov_b32_e32 v0, s8 +; MESA-DAG: v_mov_b32_e32 v1, s9 +; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 + + ; HSA: v_mov_b32_e32 v0, s6 ; HSA: v_mov_b32_e32 v1, s7 ; HSA: flat_load_dword v0, v[0:1] -; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64 + ; HSA: v_mov_b32_e32 v0, s8 ; HSA: v_mov_b32_e32 v1, s9 ; HSA: flat_load_dword v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll index 7119795..a8f5603 100644 --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=pitcairn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 @@ -24,16 +24,16 @@ define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i3 define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_to_32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0x1fffffff -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 @@ -45,17 +45,17 @@ define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i3 define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: zext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0x7fffffff -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 @@ -67,17 +67,17 @@ define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { ; GCN-LABEL: sext_shl64_overflow: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s2, 0x7fffffff -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 @@ -91,8 +91,8 @@ define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) { ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-NEXT: v_and_b32_e32 v0, 6, v0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 @@ -113,20 +113,20 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] ; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, 0x800000, v1 -; GCN-NEXT: v_mul_i32_i24_e32 v1, 0xfffff9, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 +; GCN-NEXT: v_mul_i32_i24_e32 v0, 0xfffff9, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index b79f477..716caf2 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=pitcairn < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imax_sge_i32: @@ -238,7 +238,10 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspac ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]] +; SI-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]] + +; SI: s_max_i32 [[MAX:s[0-9]+]], [[EXT_A]], [[EXT_B]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index b2d62f5..cc772a6 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -109,9 +109,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 ; GCN: s_load_dword s ; SI: s_ashr_i32 -; SI: s_sext_i32_i16 ; SI: s_ashr_i32 ; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 ; SI: s_min_i32 ; SI: s_min_i32 @@ -381,7 +381,8 @@ define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use ; SI-NOT: v_min ; GCN: v_cmp_lt_u32 -; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: v_cndmask_b32 ; SI-NOT: v_min ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index 3db6fd2..e0971b8 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s ; ; ; Most SALU instructions ignore control flow, so we need to make sure @@ -108,7 +108,7 @@ endif: ; SI: ; %if ; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] +; SI-DAG: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] ; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec ; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec ; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index cd7b2fe..6a0b3e9 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -1,26 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) { ; GCN-LABEL: v_shl_i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4 ; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4 -; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4 -; GCN-NEXT: v_lshl_b64 v[7:8], v[0:1], v4 -; GCN-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v10 -; GCN-NEXT: v_or_b32_e32 v5, v5, v9 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GCN-NEXT: v_cndmask_b32_e32 v6, v1, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v8, vcc +; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 +; GCN-NEXT: v_or_b32_e32 v7, v5, v7 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4 +; GCN-NEXT: v_or_b32_e32 v8, v6, v8 +; GCN-NEXT: v_lshl_b64 v[5:6], v[0:1], v5 +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v2, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 %lhs, %rhs ret i128 %shl @@ -30,22 +30,22 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) { ; GCN-LABEL: v_lshr_i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4 ; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4 -; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4 -; GCN-NEXT: v_lshr_b64 v[7:8], v[2:3], v4 -; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v9 -; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v11 -; GCN-NEXT: v_or_b32_e32 v6, v6, v10 -; GCN-NEXT: v_or_b32_e32 v5, v5, v9 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GCN-NEXT: v_cndmask_b32_e32 v6, v3, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v8, vcc +; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7 ; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 +; GCN-NEXT: v_or_b32_e32 v7, v5, v7 +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4 +; GCN-NEXT: v_or_b32_e32 v8, v6, v8 +; GCN-NEXT: v_lshr_b64 v[5:6], v[2:3], v5 +; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = lshr i128 %lhs, %rhs @@ -56,21 +56,21 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) { ; GCN-LABEL: v_ashr_i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4 +; GCN-NEXT: v_sub_i32_e32 v9, vcc, 64, v4 ; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v4 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 64, v4 -; GCN-NEXT: v_subrev_i32_e32 v11, vcc, 64, v4 -; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v10 -; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v11 +; GCN-NEXT: v_lshl_b64 v[9:10], v[2:3], v9 +; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GCN-NEXT: v_or_b32_e32 v8, v8, v10 +; GCN-NEXT: v_subrev_i32_e32 v10, vcc, 64, v4 +; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4 +; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 +; GCN-NEXT: v_cmp_gt_u32_e64 s[6:7], 64, v4 ; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[6:7] ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v2, v5 @@ -87,8 +87,8 @@ define i128 @v_shl_i128_vk(i128 %lhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 17 ; GCN-NEXT: v_lshrrev_b32_e32 v4, 15, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 17 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 %lhs, 17 ret i128 %shl @@ -113,8 +113,8 @@ define i128 @v_ashr_i128_vk(i128 %lhs) { ; GCN-NEXT: v_lshl_b64 v[4:5], v[2:3], 31 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v1 ; GCN-NEXT: v_or_b32_e32 v4, v0, v4 -; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 ; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33 ; GCN-NEXT: v_mov_b32_e32 v1, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 %lhs, 33 @@ -127,17 +127,17 @@ define i128 @v_shl_i128_kv(i128 %rhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 64, v0 ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, 64, v0 -; GCN-NEXT: v_lshl_b64 v[4:5], 17, v0 ; GCN-NEXT: v_lshr_b64 v[1:2], 17, v1 -; GCN-NEXT: v_lshl_b64 v[6:7], 17, v3 +; GCN-NEXT: v_lshl_b64 v[4:5], 17, v3 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc +; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 17, %rhs ret i128 %shl @@ -149,15 +149,15 @@ define i128 @v_lshr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0 ; GCN-NEXT: s_movk_i32 s6, 0x41 -; GCN-NEXT: v_mov_b32_e32 v3, 0x41 -; GCN-NEXT: v_lshr_b64 v[1:2], s[6:7], v0 +; GCN-NEXT: v_lshr_b64 v[2:3], s[6:7], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = lshr i128 65, %rhs @@ -168,14 +168,14 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { ; GCN-LABEL: v_ashr_i128_kv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 +; GCN-NEXT: v_lshr_b64 v[2:3], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 33, %rhs @@ -184,100 +184,72 @@ define i128 @v_ashr_i128_kv(i128 %rhs) { define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 1 -; GCN-NEXT: granulated_wavefront_sgpr_count = 1 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 32 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 15 -; GCN-NEXT: workitem_vgpr_count = 8 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 4 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s4 -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_sub_i32 s12, s4, 64 -; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s4 -; GCN-NEXT: s_lshr_b64 s[10:11], s[0:1], s5 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s4, 64 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s2, 64, s8 +; GCN-NEXT: s_sub_i32 s9, s8, 64 +; GCN-NEXT: s_lshl_b64 s[0:1], s[6:7], s8 +; GCN-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], s8 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_shl_i128_ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 8 +; GCN-NEXT: .amdhsa_next_free_sgpr 12 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = shl i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -285,100 +257,72 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 1 -; GCN-NEXT: granulated_wavefront_sgpr_count = 1 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 32 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 15 -; GCN-NEXT: workitem_vgpr_count = 8 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 4 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_sub_i32 s12, s4, 64 -; GCN-NEXT: s_lshr_b64 s[8:9], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[10:11], s[2:3], s5 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s12 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NEXT: v_mov_b32_e32 v5, s8 +; GCN-NEXT: s_sub_i32 s2, 64, s8 +; GCN-NEXT: s_sub_i32 s9, s8, 64 +; GCN-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GCN-NEXT: s_lshl_b64 s[2:3], s[6:7], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: s_lshr_b64 s[10:11], s[6:7], s9 +; GCN-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v6, s7 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s4, 64 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc +; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_lshr_i128_ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 8 +; GCN-NEXT: .amdhsa_next_free_sgpr 12 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = lshr i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -386,102 +330,74 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 1 -; GCN-NEXT: granulated_wavefront_sgpr_count = 1 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 32 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 16 -; GCN-NEXT: workitem_vgpr_count = 8 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 4 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_sub_i32 s12, s4, 64 -; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], s4 -; GCN-NEXT: s_ashr_i32 s13, s3, 31 -; GCN-NEXT: s_lshl_b64 s[10:11], s[2:3], s5 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s12 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GCN-NEXT: v_mov_b32_e32 v0, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_mov_b32_e32 v5, s8 +; GCN-NEXT: s_sub_i32 s2, 64, s8 +; GCN-NEXT: s_sub_i32 s9, s8, 64 +; GCN-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GCN-NEXT: s_lshl_b64 s[2:3], s[6:7], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: s_ashr_i64 s[10:11], s[6:7], s9 +; GCN-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s4, 64 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GCN-NEXT: v_mov_b32_e32 v7, s6 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc +; GCN-NEXT: v_cmp_lt_u32_e64 vcc, s8, 64 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8 +; GCN-NEXT: s_ashr_i32 s2, s7, 31 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_ashr_i128_ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 8 +; GCN-NEXT: .amdhsa_next_free_sgpr 12 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = ashr i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -491,46 +407,46 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: v_shl_v2i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 +; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 -; GCN-NEXT: v_or_b32_e32 v20, v17, v19 -; GCN-NEXT: v_or_b32_e32 v21, v16, v18 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12 -; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 -; GCN-NEXT: v_lshl_b64 v[18:19], v[6:7], v12 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_or_b32_e32 v16, v18, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] -; GCN-NEXT: v_or_b32_e32 v15, v13, v15 -; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9] -; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8 -; GCN-NEXT: v_lshl_b64 v[8:9], v[0:1], v8 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v18 -; GCN-NEXT: v_cmp_gt_u64_e64 s[12:13], 64, v[12:13] -; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v12 -; GCN-NEXT: v_lshl_b64 v[12:13], v[4:5], v12 -; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v18 -; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v18, v1, v20, vcc -; GCN-NEXT: v_cndmask_b32_e32 v19, v0, v21, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v8, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v13, s[6:7] +; GCN-NEXT: v_or_b32_e32 v19, v17, v19 +; GCN-NEXT: v_or_b32_e32 v18, v16, v18 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v3, v18, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v19, v2, vcc -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GCN-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v4, v6, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v12, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 +; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v12 +; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v11 +; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_or_b32_e32 v16, v9, v16 +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v11, v10, v17 +; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 +; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc +; GCN-NEXT: v_or_b32_e32 v11, v13, v15 +; GCN-NEXT: v_or_b32_e32 v10, v12, v14 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 +; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12 +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc +; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl <2 x i128> %lhs, %rhs ret <2 x i128> %shl @@ -540,46 +456,46 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: v_lshr_v2i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 +; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18 -; GCN-NEXT: v_or_b32_e32 v20, v17, v19 -; GCN-NEXT: v_or_b32_e32 v21, v16, v18 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12 -; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16 -; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12 -; GCN-NEXT: v_or_b32_e32 v17, v19, v17 -; GCN-NEXT: v_or_b32_e32 v16, v18, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 +; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] -; GCN-NEXT: v_or_b32_e32 v15, v13, v15 -; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9] -; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v8 -; GCN-NEXT: v_lshr_b64 v[8:9], v[2:3], v8 -; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v18 -; GCN-NEXT: v_cmp_gt_u64_e64 s[12:13], 64, v[12:13] -; GCN-NEXT: v_subrev_i32_e32 v18, vcc, 64, v12 -; GCN-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 -; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v18 -; GCN-NEXT: s_and_b64 vcc, s[6:7], s[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v18, v3, v20, vcc -; GCN-NEXT: v_cndmask_b32_e32 v19, v2, v21, vcc -; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[12:13] -; GCN-NEXT: v_cndmask_b32_e64 v17, v7, v17, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v13, s[6:7] +; GCN-NEXT: v_or_b32_e32 v19, v17, v19 +; GCN-NEXT: v_or_b32_e32 v18, v16, v18 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v1, v18, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v19, v0, vcc -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GCN-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v12, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 +; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12 +; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11 +; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_or_b32_e32 v16, v9, v16 +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v11, v10, v17 +; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9 +; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc +; GCN-NEXT: v_or_b32_e32 v11, v13, v15 +; GCN-NEXT: v_or_b32_e32 v10, v12, v14 +; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8 +; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc +; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = lshr <2 x i128> %lhs, %rhs ret <2 x i128> %shl @@ -589,48 +505,48 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: v_ashr_v2i128_vv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 +; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18 -; GCN-NEXT: v_or_b32_e32 v20, v17, v19 -; GCN-NEXT: v_or_b32_e32 v21, v16, v18 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v12 -; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v16 -; GCN-NEXT: v_lshr_b64 v[18:19], v[4:5], v12 -; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] -; GCN-NEXT: v_or_b32_e32 v15, v13, v15 -; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[8:9] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 +; GCN-NEXT: v_or_b32_e32 v10, v8, v10 +; GCN-NEXT: v_or_b32_e32 v19, v17, v19 +; GCN-NEXT: v_or_b32_e32 v18, v16, v18 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 s[6:7], s[6:7], s[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] -; GCN-NEXT: v_cmp_gt_u64_e64 s[10:11], 64, v[12:13] +; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc +; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 +; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12 +; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11 +; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_or_b32_e32 v16, v9, v16 +; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 +; GCN-NEXT: v_or_b32_e32 v11, v10, v17 +; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9 +; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc +; GCN-NEXT: v_or_b32_e32 v11, v13, v15 +; GCN-NEXT: v_or_b32_e32 v10, v12, v14 +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9] ; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8 -; GCN-NEXT: v_ashrrev_i32_e32 v20, 31, v3 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 64, v12 -; GCN-NEXT: v_ashr_i64 v[12:13], v[6:7], v12 -; GCN-NEXT: v_ashrrev_i32_e32 v21, 31, v7 -; GCN-NEXT: v_ashr_i64 v[2:3], v[6:7], v2 -; GCN-NEXT: s_and_b64 vcc, s[8:9], s[10:11] -; GCN-NEXT: v_cndmask_b32_e32 v6, v3, v19, vcc -; GCN-NEXT: v_cndmask_b32_e32 v18, v2, v18, vcc -; GCN-NEXT: v_cndmask_b32_e64 v3, v20, v9, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, v20, v8, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v7, v21, v13, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v4, v18, v4, s[6:7] -; GCN-NEXT: v_cndmask_b32_e32 v6, v21, v12, vcc +; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] +; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12 +; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v7 +; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc +; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9] +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr <2 x i128> %lhs, %rhs ret <2 x i128> %shl @@ -638,133 +554,105 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 3 -; GCN-NEXT: granulated_wavefront_sgpr_count = 4 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 64 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 36 -; GCN-NEXT: workitem_vgpr_count = 16 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 5 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x8 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[14:15], 0 -; GCN-NEXT: s_lshl_b64 s[20:21], s[4:5], s12 -; GCN-NEXT: s_lshl_b64 s[22:23], s[6:7], s12 -; GCN-NEXT: s_sub_i32 s30, 64, s12 -; GCN-NEXT: s_sub_i32 s31, s12, 64 -; GCN-NEXT: s_sub_i32 s32, 64, s16 -; GCN-NEXT: s_sub_i32 s33, s16, 64 -; GCN-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-NEXT: v_cmp_lt_u64_e64 s[14:15], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[24:25], s[18:19], 0 -; GCN-NEXT: s_lshl_b64 s[26:27], s[8:9], s16 -; GCN-NEXT: s_lshl_b64 s[28:29], s[10:11], s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[0:1], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 +; GCN-NEXT: s_lshl_b64 s[20:21], s[8:9], s0 +; GCN-NEXT: s_and_b64 vcc, s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s18, 64, s0 +; GCN-NEXT: s_lshl_b64 s[16:17], s[10:11], s0 +; GCN-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 ; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-NEXT: s_sub_i32 s18, s0, 64 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_sub_i32 s2, 64, s4 +; GCN-NEXT: s_lshl_b64 s[8:9], s[12:13], s4 +; GCN-NEXT: s_lshl_b64 s[0:1], s[14:15], s4 +; GCN-NEXT: s_lshr_b64 s[2:3], s[12:13], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v4, s9 +; GCN-NEXT: s_sub_i32 s0, s4, 64 +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: s_lshl_b64 s[8:9], s[12:13], s0 +; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v6, s9 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GCN-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GCN-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NEXT: s_and_b64 s[0:1], s[24:25], s[14:15] -; GCN-NEXT: s_lshr_b64 s[2:3], s[4:5], s30 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s31 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s26 -; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s32 -; GCN-NEXT: s_lshl_b64 s[8:9], s[8:9], s33 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc -; GCN-NEXT: s_or_b64 s[2:3], s[22:23], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v12, s4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GCN-NEXT: s_or_b64 s[4:5], s[28:29], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NEXT: v_mov_b32_e32 v14, s8 -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s2 -; GCN-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s5 -; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v15, s4 -; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v15, s10 -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[12:13], 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v6, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[16:17], 0 -; GCN-NEXT: v_cndmask_b32_e32 v7, v13, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v14, v15, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_shl_v2i128ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 16 +; GCN-NEXT: .amdhsa_next_free_sgpr 22 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = shl <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -772,133 +660,105 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 4 -; GCN-NEXT: granulated_wavefront_sgpr_count = 4 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 64 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 36 -; GCN-NEXT: workitem_vgpr_count = 17 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 5 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-NEXT: s_lshr_b64 s[20:21], s[2:3], s8 -; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 -; GCN-NEXT: s_sub_i32 s30, 64, s8 -; GCN-NEXT: s_sub_i32 s31, s8, 64 -; GCN-NEXT: s_sub_i32 s32, 64, s12 -; GCN-NEXT: s_sub_i32 s33, s12, 64 -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[24:25], s[14:15], 0 -; GCN-NEXT: s_lshr_b64 s[26:27], s[6:7], s12 -; GCN-NEXT: s_lshr_b64 s[28:29], s[4:5], s12 -; GCN-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NEXT: s_and_b64 vcc, s[18:19], s[16:17] +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[0:1], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 +; GCN-NEXT: s_lshr_b64 s[20:21], s[10:11], s0 +; GCN-NEXT: s_and_b64 vcc, s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s18, 64, s0 +; GCN-NEXT: s_lshr_b64 s[16:17], s[8:9], s0 +; GCN-NEXT: s_lshl_b64 s[18:19], s[10:11], s18 +; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-NEXT: s_sub_i32 s18, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0 +; GCN-NEXT: s_lshr_b64 s[8:9], s[14:15], s4 +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_sub_i32 s2, 64, s4 +; GCN-NEXT: s_lshr_b64 s[0:1], s[12:13], s4 +; GCN-NEXT: s_lshl_b64 s[2:3], s[14:15], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v4, s9 +; GCN-NEXT: s_sub_i32 s0, s4, 64 +; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: s_lshr_b64 s[8:9], s[14:15], s0 +; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s9 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GCN-NEXT: v_mov_b32_e32 v10, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NEXT: s_and_b64 s[0:1], s[24:25], s[10:11] -; GCN-NEXT: s_lshl_b64 s[10:11], s[2:3], s30 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s31 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NEXT: s_lshl_b64 s[14:15], s[6:7], s32 -; GCN-NEXT: s_lshr_b64 s[6:7], s[6:7], s33 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; GCN-NEXT: s_or_b64 s[10:11], s[22:23], s[10:11] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v13, s2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[28:29], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: v_mov_b32_e32 v14, s6 -; GCN-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s10 -; GCN-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v16, s4 -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[8:9], 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v13, v5, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[12:13], 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v15, v12, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v16, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_lshr_v2i128_ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 16 +; GCN-NEXT: .amdhsa_next_free_sgpr 22 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = lshr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -906,137 +766,109 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: -; GCN: .amd_kernel_code_t -; GCN-NEXT: amd_code_version_major = 1 -; GCN-NEXT: amd_code_version_minor = 2 -; GCN-NEXT: amd_machine_kind = 1 -; GCN-NEXT: amd_machine_version_major = 7 -; GCN-NEXT: amd_machine_version_minor = 0 -; GCN-NEXT: amd_machine_version_stepping = 0 -; GCN-NEXT: kernel_code_entry_byte_offset = 256 -; GCN-NEXT: kernel_code_prefetch_byte_size = 0 -; GCN-NEXT: granulated_workitem_vgpr_count = 4 -; GCN-NEXT: granulated_wavefront_sgpr_count = 4 -; GCN-NEXT: priority = 0 -; GCN-NEXT: float_mode = 192 -; GCN-NEXT: priv = 0 -; GCN-NEXT: enable_dx10_clamp = 1 -; GCN-NEXT: debug_mode = 0 -; GCN-NEXT: enable_ieee_mode = 1 -; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GCN-NEXT: user_sgpr_count = 6 -; GCN-NEXT: enable_trap_handler = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 -; GCN-NEXT: enable_sgpr_workgroup_id_y = 0 -; GCN-NEXT: enable_sgpr_workgroup_id_z = 0 -; GCN-NEXT: enable_sgpr_workgroup_info = 0 -; GCN-NEXT: enable_vgpr_workitem_id = 0 -; GCN-NEXT: enable_exception_msb = 0 -; GCN-NEXT: granulated_lds_size = 0 -; GCN-NEXT: enable_exception = 0 -; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 -; GCN-NEXT: enable_sgpr_dispatch_ptr = 0 -; GCN-NEXT: enable_sgpr_queue_ptr = 0 -; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NEXT: enable_sgpr_dispatch_id = 0 -; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 -; GCN-NEXT: enable_sgpr_private_segment_size = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 -; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 -; GCN-NEXT: enable_ordered_append_gds = 0 -; GCN-NEXT: private_element_size = 1 -; GCN-NEXT: is_ptr64 = 1 -; GCN-NEXT: is_dynamic_callstack = 0 -; GCN-NEXT: is_debug_enabled = 0 -; GCN-NEXT: is_xnack_enabled = 0 -; GCN-NEXT: workitem_private_segment_byte_size = 0 -; GCN-NEXT: workgroup_group_segment_byte_size = 0 -; GCN-NEXT: gds_segment_byte_size = 0 -; GCN-NEXT: kernarg_segment_byte_size = 64 -; GCN-NEXT: workgroup_fbarrier_count = 0 -; GCN-NEXT: wavefront_sgpr_count = 37 -; GCN-NEXT: workitem_vgpr_count = 17 -; GCN-NEXT: reserved_vgpr_first = 0 -; GCN-NEXT: reserved_vgpr_count = 0 -; GCN-NEXT: reserved_sgpr_first = 0 -; GCN-NEXT: reserved_sgpr_count = 0 -; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 -; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 -; GCN-NEXT: kernarg_segment_alignment = 5 -; GCN-NEXT: group_segment_alignment = 4 -; GCN-NEXT: private_segment_alignment = 4 -; GCN-NEXT: wavefront_size = 6 -; GCN-NEXT: call_convention = -1 -; GCN-NEXT: runtime_loader_kernel_symbol = 0 -; GCN-NEXT: .end_amd_kernel_code_t -; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-NEXT: s_ashr_i64 s[20:21], s[2:3], s8 -; GCN-NEXT: s_ashr_i32 s30, s3, 31 -; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 -; GCN-NEXT: s_sub_i32 s31, 64, s8 -; GCN-NEXT: s_sub_i32 s32, s8, 64 -; GCN-NEXT: s_sub_i32 s33, 64, s12 -; GCN-NEXT: s_sub_i32 s34, s12, 64 -; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[24:25], s[14:15], 0 -; GCN-NEXT: s_ashr_i64 s[26:27], s[6:7], s12 -; GCN-NEXT: s_lshr_b64 s[28:29], s[4:5], s12 -; GCN-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NEXT: s_ashr_i32 s14, s7, 31 -; GCN-NEXT: s_and_b64 vcc, s[18:19], s[16:17] -; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i32 s22, s11, 31 +; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[0:1], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 +; GCN-NEXT: s_ashr_i64 s[20:21], s[10:11], s0 +; GCN-NEXT: s_and_b64 vcc, s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s18, 64, s0 +; GCN-NEXT: s_lshr_b64 s[16:17], s[8:9], s0 +; GCN-NEXT: s_lshl_b64 s[18:19], s[10:11], s18 +; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-NEXT: s_sub_i32 s18, s0, 64 +; GCN-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: s_ashr_i64 s[10:11], s[10:11], s18 +; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[4:5], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0 +; GCN-NEXT: s_ashr_i64 s[8:9], s[14:15], s4 +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_sub_i32 s2, 64, s4 +; GCN-NEXT: s_ashr_i32 s10, s15, 31 +; GCN-NEXT: s_lshr_b64 s[0:1], s[12:13], s4 +; GCN-NEXT: s_lshl_b64 s[2:3], s[14:15], s2 +; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NEXT: v_mov_b32_e32 v5, s9 +; GCN-NEXT: s_sub_i32 s0, s4, 64 +; GCN-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s8 +; GCN-NEXT: s_ashr_i64 s[8:9], s[14:15], s0 +; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GCN-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s9 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GCN-NEXT: v_mov_b32_e32 v10, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v12, s0 -; GCN-NEXT: v_mov_b32_e32 v13, s5 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NEXT: s_and_b64 s[0:1], s[24:25], s[10:11] -; GCN-NEXT: s_lshl_b64 s[10:11], s[2:3], s31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s32 -; GCN-NEXT: v_cndmask_b32_e64 v7, v4, v0, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NEXT: s_lshl_b64 s[14:15], s[6:7], s33 -; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], s34 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc -; GCN-NEXT: s_or_b64 s[10:11], s[22:23], s[10:11] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v14, s2 -; GCN-NEXT: v_cndmask_b32_e64 v6, v4, v0, s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[28:29], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_mov_b32_e32 v15, s11 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s10 -; GCN-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GCN-NEXT: v_mov_b32_e32 v15, s3 -; GCN-NEXT: v_cndmask_b32_e64 v15, v0, v15, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v16, s4 -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[8:9], 0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v14, v12, vcc -; GCN-NEXT: v_cmp_eq_u64_e64 vcc, s[12:13], 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v15, v13, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v16, vcc ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm +; GCN-NEXT: .section .rodata,#alloc +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .amdhsa_kernel s_ashr_v2i128_ss +; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 +; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 +; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 +; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 +; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 +; GCN-NEXT: .amdhsa_next_free_vgpr 16 +; GCN-NEXT: .amdhsa_next_free_sgpr 23 +; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 +; GCN-NEXT: .amdhsa_float_round_mode_32 0 +; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 +; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 +; GCN-NEXT: .amdhsa_dx10_clamp 1 +; GCN-NEXT: .amdhsa_ieee_mode 1 +; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 +; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 +; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 +; GCN-NEXT: .amdhsa_exception_int_div_zero 0 +; GCN-NEXT: .end_amdhsa_kernel +; GCN-NEXT: .text %shift = ashr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index d52a7ff..70c1ad9 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -1,19 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI +; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i32: @@ -37,17 +37,17 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s2, s4, s5 -; SI-NEXT: s_add_i32 s4, s2, s6 -; SI-NEXT: s_ashr_i32 s5, s4, 31 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mul_i32 s0, s0, s1 +; SI-NEXT: s_add_i32 s0, s0, s2 +; SI-NEXT: s_ashr_i32 s1, s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_s_sext_i32_to_i64: @@ -75,16 +75,16 @@ entry: define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i64: @@ -109,15 +109,15 @@ define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { ; SI-LABEL: s_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s4, 31 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_ashr_i32 s1, s0, 31 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i32_to_i64: @@ -140,20 +140,20 @@ define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nou define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_i32_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_i32_to_i64: @@ -182,15 +182,15 @@ define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspa define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { ; SI-LABEL: s_sext_i16_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i16_to_i64: @@ -213,15 +213,15 @@ define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nou define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16: @@ -249,18 +249,18 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; SI-LABEL: s_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s6, v0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16_with_and: @@ -294,9 +294,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -336,26 +336,26 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; SI-LABEL: s_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ashr_i32 s5, s4, 24 -; SI-NEXT: s_bfe_i32 s6, s4, 0x80010 -; SI-NEXT: s_sext_i32_i8 s7, s4 -; SI-NEXT: s_bfe_i32 s4, s4, 0x80008 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_ashr_i32 s1, s0, 24 +; SI-NEXT: s_bfe_i32 s2, s0, 0x80010 +; SI-NEXT: s_bfe_i32 s3, s0, 0x80008 +; SI-NEXT: s_sext_i32_i8 s0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i8_to_v4i32: @@ -396,26 +396,26 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_v4i8_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0 ; SI-NEXT: v_bfe_i32 v2, v0, 16, 8 ; SI-NEXT: v_bfe_i32 v3, v0, 8, 8 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 8 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i8_to_v4i32: @@ -460,27 +460,27 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { ; SI-LABEL: s_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48 -; SI-NEXT: s_ashr_i32 s1, s2, 16 -; SI-NEXT: s_sext_i32_i16 s2, s2 -; SI-NEXT: s_sext_i32_i16 s3, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48 +; SI-NEXT: s_ashr_i32 s5, s6, 16 +; SI-NEXT: s_sext_i32_i16 s6, s6 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s5 +; SI-NEXT: s_sext_i32_i16 s7, s7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_v4i16_to_v4i32: @@ -520,26 +520,26 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { ; SI-LABEL: v_sext_v4i16_to_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48 -; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_sext_v4i16_to_v4i32: diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index 546a6062..d61c1743 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-- -mattr=-fp64-fp16-denormals -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI define amdgpu_kernel void @madak_f16( @@ -11,20 +11,20 @@ define amdgpu_kernel void @madak_f16( ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s6 ; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000 +; SI-NEXT: v_madak_f32 v0, v1, v0, 0x41200000 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -72,12 +72,12 @@ define amdgpu_kernel void @madak_f16_use_2( ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s18, s14 -; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s16, s10 ; SI-NEXT: s_mov_b32 s17, s11 +; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_mov_b32 s19, s15 ; SI-NEXT: s_mov_b32 s10, s14 ; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index 2f365cb..a49ff33 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index d04df7d..01bcd6f 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after expand-isel-pseudos -o %t.mir %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after expand-isel-pseudos -o %t.mir %s ; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s ; Test that SIMachineFunctionInfo can be round trip serialized through