From 01a8c4958e2aac66a9376565377e0eb6b5a81de0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 14 Nov 2022 15:22:45 -0800 Subject: [PATCH] AMDGPU/GlobalISel: Add some end-to-end g_sext_inreg tests --- llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll | 1817 +++++++++++++++++++++ 1 file changed, 1817 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll new file mode 100644 index 0000000..193490d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -0,0 +1,1817 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s + +define i8 @v_sext_inreg_i8_4(i8 %value) { +; GCN-LABEL: v_sext_inreg_i8_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i8_4: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i8 %value, 4 + %ashr = ashr i8 %shl, 4 + ret i8 %ashr +} + +define i8 @v_sext_inreg_i8_7(i8 %value) { +; GCN-LABEL: v_sext_inreg_i8_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i8 %value, 7 + %ashr = ashr i8 %shl, 7 + ret i8 %ashr +} + +define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) { +; GFX6-LABEL: s_sext_inreg_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x50000 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s1, 3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s1, 3, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 3, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i8 %value, 3 + %ashr = ashr i8 %shl, 3 + ret i8 %ashr +} + +define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) { +; GFX6-LABEL: s_sext_inreg_i8_6: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_i8_6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s1, 6, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 6 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_i8_6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s1, 6, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; 
GFX9-NEXT: s_sext_i32_i8 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 6 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i8_6: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 6, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i8 %value, 6 + %ashr = ashr i8 %shl, 6 + ret i8 %ashr +} + +define i24 @v_sext_inreg_i24_12(i24 %value) { +; GCN-LABEL: v_sext_inreg_i24_12: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i24_12: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i24 %value, 12 + %ashr = ashr i24 %value, 12 + ret i24 %ashr +} + +define i24 @v_sext_inreg_i24_7(i24 %value) { +; GCN-LABEL: v_sext_inreg_i24_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 17 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 17 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i24 %value, 7 + %ashr = ashr i24 %shl, 7 + ret i24 %ashr +} + +define amdgpu_ps i24 @s_sext_inreg_i24_8(i24 inreg %value) { +; GCN-LABEL: s_sext_inreg_i24_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_sext_i32_i16 s0, s0 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i24_8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: ; return to 
shader part epilog + %shl = shl i24 %value, 8 + %ashr = ashr i24 %shl, 8 + ret i24 %ashr +} + +define amdgpu_ps i24 @s_sext_inreg_i24_7(i24 inreg %value) { +; GCN-LABEL: s_sext_inreg_i24_7: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x110000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x110000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i24 %value, 7 + %ashr = ashr i24 %shl, 7 + ret i24 %ashr +} + +define i32 @v_sext_inreg_i32_3(i32 %value) { +; GCN-LABEL: v_sext_inreg_i32_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 29 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i32_3: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 29 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i32 %value, 3 + %ashr = ashr i32 %shl, 3 + ret i32 %ashr +} + +define i32 @v_sext_inreg_i32_31(i32 %value) { +; GCN-LABEL: v_sext_inreg_i32_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i32 %value, 31 + %ashr = ashr i32 %shl, 31 + ret i32 %ashr +} + +define amdgpu_ps i32 @s_sext_inreg_i32_2(i32 inreg %value) { +; GCN-LABEL: s_sext_inreg_i32_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x1e0000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i32_2: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x1e0000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = 
shl i32 %value, 2 + %ashr = ashr i32 %shl, 2 + ret i32 %ashr +} + +define amdgpu_ps i32 @s_sext_inreg_i32_31(i32 inreg %value) { +; GCN-LABEL: s_sext_inreg_i32_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i32 %value, 31 + %ashr = ashr i32 %shl, 31 + ret i32 %ashr +} + +define <2 x i32> @v_sext_inreg_v2i32_14(<2 x i32> %value) { +; GCN-LABEL: v_sext_inreg_v2i32_14: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 18 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 18 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i32_14: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 18 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 18 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i32> %value, <i32 14, i32 14> + %ashr = ashr <2 x i32> %shl, <i32 14, i32 14> + ret <2 x i32> %ashr +} + +define <2 x i32> @v_sext_inreg_v2i32_31(<2 x i32> %value) { +; GCN-LABEL: v_sext_inreg_v2i32_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i32> %value, <i32 31, i32 31> + %shr = ashr <2 x i32> %shl, <i32 31, i32 31> + ret <2 x i32> %shr +} + +define amdgpu_ps <2 x i32> @s_sext_inreg_v2i32_22(<2 x i32> inreg %value) { +; GCN-LABEL: s_sext_inreg_v2i32_22: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0xa0000 +; 
GCN-NEXT: s_bfe_i32 s1, s1, 0xa0000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v2i32_22: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0xa0000 +; GFX10PLUS-NEXT: s_bfe_i32 s1, s1, 0xa0000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <2 x i32> %value, + %ashr = ashr <2 x i32> %shl, + ret <2 x i32> %ashr +} + +define <3 x i32> @v_sext_inreg_v3i32_16(<3 x i32> %value, <3 x i32> %amount) { +; GCN-LABEL: v_sext_inreg_v3i32_16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v3i32_16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v2, v2, 0, 16 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <3 x i32> %value, + %ashr = ashr <3 x i32> %shl, + ret <3 x i32> %ashr +} + +define amdgpu_ps <3 x i32> @s_sext_inreg_v3i32_22(<3 x i32> inreg %value) { +; GCN-LABEL: s_sext_inreg_v3i32_22: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v3i32_22: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_mov_b32 s2, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <3 x i32> %value, + %ashr = shl <3 x i32> %shl, + ret <3 x i32> %ashr +} + +define <4 x i32> @v_sext_inreg_v4i32_6(<4 x i32> %value) { +; GCN-LABEL: v_sext_inreg_v4i32_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 26 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 26 +; GCN-NEXT: v_bfe_i32 v2, v2, 0, 26 +; 
GCN-NEXT: v_bfe_i32 v3, v3, 0, 26 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v4i32_6: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 26 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 26 +; GFX10PLUS-NEXT: v_bfe_i32 v2, v2, 0, 26 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 26 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <4 x i32> %value, + %ashr = ashr <4 x i32> %shl, + ret <4 x i32> %ashr +} + +define amdgpu_ps <4 x i32> @s_sext_inreg_v4i32_13(<4 x i32> inreg %value) { +; GCN-LABEL: s_sext_inreg_v4i32_13: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i32 s0, s0, 0x130000 +; GCN-NEXT: s_bfe_i32 s1, s1, 0x130000 +; GCN-NEXT: s_bfe_i32 s2, s2, 0x130000 +; GCN-NEXT: s_bfe_i32 s3, s3, 0x130000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v4i32_13: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x130000 +; GFX10PLUS-NEXT: s_bfe_i32 s1, s1, 0x130000 +; GFX10PLUS-NEXT: s_bfe_i32 s2, s2, 0x130000 +; GFX10PLUS-NEXT: s_bfe_i32 s3, s3, 0x130000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <4 x i32> %value, + %ashr = ashr <4 x i32> %shl, + ret <4 x i32> %ashr +} + +define <5 x i32> @v_sext_inreg_v5i32_30(<5 x i32> %value) { +; GCN-LABEL: v_sext_inreg_v5i32_30: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 2 +; GCN-NEXT: v_bfe_i32 v1, v1, 0, 2 +; GCN-NEXT: v_bfe_i32 v2, v2, 0, 2 +; GCN-NEXT: v_bfe_i32 v3, v3, 0, 2 +; GCN-NEXT: v_bfe_i32 v4, v4, 0, 2 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v5i32_30: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 2 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v1, 0, 2 +; GFX10PLUS-NEXT: v_bfe_i32 v2, v2, 0, 2 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v3, 0, 2 +; 
GFX10PLUS-NEXT: v_bfe_i32 v4, v4, 0, 2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <5 x i32> %value, + %ashr = ashr <5 x i32> %shl, + ret <5 x i32> %ashr +} + +define amdgpu_ps <5 x i32> @s_sext_inreg_v5i32_19(<5 x i32> inreg %value) { +; GCN-LABEL: s_sext_inreg_v5i32_19: +; GCN: ; %bb.0: +; GCN-NEXT: s_ashr_i32 s0, s0, 19 +; GCN-NEXT: s_ashr_i32 s1, s1, 19 +; GCN-NEXT: s_ashr_i32 s2, s2, 19 +; GCN-NEXT: s_ashr_i32 s3, s3, 19 +; GCN-NEXT: s_ashr_i32 s4, s4, 19 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v5i32_19: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 19 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 19 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, 19 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, 19 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, 19 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <5 x i32> %value, + %ashr = ashr <5 x i32> %value, + ret <5 x i32> %ashr +} + +define <16 x i32> @v_sext_inreg_v16i32_27(<16 x i32> %value) { +; GCN-LABEL: v_sext_inreg_v16i32_27: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_mov_b32_e32 v10, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v13, 0 +; GCN-NEXT: v_mov_b32_e32 v14, 0 +; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_sext_inreg_v16i32_27: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 
v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: v_mov_b32_e32 v11, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: v_mov_b32_e32 v13, 0 +; GFX10-NEXT: v_mov_b32_e32 v14, 0 +; GFX10-NEXT: v_mov_b32_e32 v15, 0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sext_inreg_v16i32_27: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0 +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v7, 0 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, 0 +; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v11, 0 +; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v13, 0 +; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v15, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %shl = shl <16 x i32> %value, + %ashr = shl <16 x i32> %shl, + ret <16 x i32> %ashr +} + +define amdgpu_ps <16 x i32> @s_sext_inreg_v16i32_3(<16 x i32> inreg %value) { +; GCN-LABEL: s_sext_inreg_v16i32_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 6 +; GCN-NEXT: s_lshl_b32 s1, s1, 6 +; GCN-NEXT: s_lshl_b32 s2, s2, 6 +; GCN-NEXT: s_lshl_b32 s3, s3, 6 +; GCN-NEXT: s_lshl_b32 s4, s4, 6 +; GCN-NEXT: s_lshl_b32 s5, s5, 6 +; GCN-NEXT: s_lshl_b32 s6, s6, 6 +; GCN-NEXT: s_lshl_b32 s7, s7, 6 +; GCN-NEXT: s_lshl_b32 s8, s8, 6 +; GCN-NEXT: s_lshl_b32 s9, s9, 6 +; GCN-NEXT: s_lshl_b32 s10, s10, 6 +; GCN-NEXT: s_lshl_b32 s11, s11, 6 +; GCN-NEXT: s_lshl_b32 s12, s12, 6 +; GCN-NEXT: s_lshl_b32 s13, s13, 6 +; GCN-NEXT: s_lshl_b32 s14, s14, 6 +; GCN-NEXT: s_lshl_b32 s15, s15, 6 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v16i32_3: +; GFX10PLUS: ; 
%bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s6, s6, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s7, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s8, s8, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s9, s9, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s10, s10, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s11, s11, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s12, s12, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s13, s13, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s14, s14, 6 +; GFX10PLUS-NEXT: s_lshl_b32 s15, s15, 6 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <16 x i32> %value, + %ashr = shl <16 x i32> %shl, + ret <16 x i32> %ashr +} + +define i16 @v_sext_inreg_i16_4(i16 %value) { +; GFX6-LABEL: v_sext_inreg_i16_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 12 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_i16_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 4, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_i16_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 12 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i16_4: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 12 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i16 %value, 4 + %ashr = ashr i16 %shl, 4 + ret i16 %ashr +} + +define i16 @v_sext_inreg_i16_15(i16 %value) { +; GFX6-LABEL: v_sext_inreg_i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: v_sext_inreg_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 15, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i16 %value, 15 + %ashr = ashr i16 %shl, 15 + ret i16 %ashr +} + +define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) { +; GFX6-LABEL: s_sext_inreg_i16_9: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x70000 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_i16_9: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s1, 9, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 9 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_i16_9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s1, 9, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 9 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i16_9: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 9, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i16 %value, 9 + %ashr = ashr i16 %shl, 9 + ret i16 %ashr +} + +define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) { +; GFX6-LABEL: s_sext_inreg_i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 +; GFX6-NEXT: ; return to shader part epilog +; 
+; GFX8-LABEL: s_sext_inreg_i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_ashr_i32 s0, s0, 15 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 15 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i16 %value, 15 + %ashr = ashr i16 %shl, 15 + ret i16 %ashr +} + +define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) { +; GFX6-LABEL: v_sext_inreg_v2i16_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_v2i16_8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_v2i16_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i16_8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i16> %value, + %ashr = ashr <2 x i16> %shl, + ret <2 x i16> %ashr +} + +define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) { +; GFX6-LABEL: v_sext_inreg_v2i16_15: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_v2i16_15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 15, v0 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v1 +; GFX8-NEXT: v_ashrrev_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_v2i16_15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i16> %value, + %ashr = ashr <2 x i16> %shl, + ret <2 x i16> %ashr +} + +define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) { +; GFX6-LABEL: s_sext_inreg_v2i16_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_bfe_i32 s1, s1, 0x50000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 
0x50000 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_v2i16_11: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_sext_i32_i16 s0, s0 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_ashr_i32 s0, s0, 11 +; GFX8-NEXT: s_ashr_i32 s1, s1, 11 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_v2i16_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0xb000b +; GFX9-NEXT: s_lshl_b32 s1, s1, 11 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s0, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, 0xb000b +; GFX9-NEXT: s_ashr_i32 s1, s1, s2 +; GFX9-NEXT: s_ashr_i32 s0, s0, 11 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v2i16_11: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0xb000b +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 11 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i16 s1, 0xb000b +; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s2, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 11 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <2 x i16> %value, + %ashr = ashr <2 x i16> %shl, + %cast = bitcast <2 x i16> %ashr to i32 + ret i32 %cast +} + +; FIXME +; define <3 x i16> @v_sext_inreg_v3i16_4(<3 x i16> %value) { +; %shl = shl <3 x i16> %value, +; %ashr = ashr <3 x 
i16> %shl, +; ret <3 x i16> %ashr +; } + +; define amdgpu_ps <3 x i16> @s_sext_inreg_v3i16_4(<3 x i16> inreg %value) { +; %shl = shl <3 x i16> %value, +; %ashr = ashr <3 x i16> %shl, +; ret <3 x i16> %ashr +; } + +define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) { +; GFX6-LABEL: v_sext_inreg_v4i16_3: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 13 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 13 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 13 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 13 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_v4i16_3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 3, v0 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, 3, v1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 3, v2 +; GFX8-NEXT: v_ashrrev_i16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 3, v4 +; GFX8-NEXT: v_ashrrev_i16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_v4i16_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 3, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 3, v1 op_sel_hi:[0,1] 
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 3, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 3, v1 op_sel_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v4i16_3: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 3, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, 3, v1 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, 3, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v1, 3, v1 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <4 x i16> %value, + %ashr = ashr <4 x i16> %shl, + %cast = bitcast <4 x i16> %ashr to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) { +; GFX6-LABEL: s_sext_inreg_v4i16_14: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, 28 +; GFX6-NEXT: s_lshl_b32 s1, s2, 28 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_v4i16_14: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s0, 0, 0x100000 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_v4i16_14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0xe000e +; GFX9-NEXT: s_lshl_b32 s2, s2, 14 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0xe000e +; GFX9-NEXT: s_lshl_b32 s2, s2, 14 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0xe000e +; GFX9-NEXT: s_lshl_b32 s2, s2, 14 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0xe000e +; GFX9-NEXT: s_lshl_b32 s2, s2, 14 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; 
GFX10PLUS-LABEL: s_sext_inreg_v4i16_14: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0xe000e +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 14 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0xe000e +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 14 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0xe000e +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 14 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0xe000e +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 14 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <4 x i16> %value, + %ashr = shl <4 x i16> %shl, + %cast = bitcast <4 x i16> %ashr to <2 x i32> + ret <2 x i32> %cast +} + +; FIXME +; define <5 x i16> @v_sext_inreg_v5i16(<5 x i16> %value) { +; %shl = shl <5 x i16> %value, %amount +; ret <5 x i16> %result +; } + +; define amdgpu_ps <5 x i16> @s_sext_inreg_v5i16(<5 x i16> inreg %value) { +; %shl = shl <5 x i16> %value, %amount +; ret <5 x i16> %result +; } + +; define <3 x float> @v_sext_inreg_v6i16(<6 x i16> %value) { +; %shl = shl <6 x i16> %value, %amount +; %cast = bitcast <6 x i16> %result to <3 x float> +; ret <3 x float> %cast +; } + +; define amdgpu_ps <3 x i32> @s_sext_inreg_v6i16(<6 x i16> inreg %value) { +; %shl = shl <6 x i16> %value, %amount +; %cast = bitcast <6 x i16> %result to <3 x i32> +; ret <3 x i32> %cast +; } + +define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) { +; GFX6-LABEL: v_sext_inreg_v8i16_11: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 22, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 22, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 22, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 22, v6 +; GFX6-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_v8i16_11: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_bfe_u32 s4, 0, 0x100000 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_v8i16_11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 11, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 11, v3 op_sel_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v8i16_11: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v3, 11, v3 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v3, 11, v3 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <8 x i16> %value, + %ashr = shl <8 x i16> %shl, + %cast = bitcast <8 x i16> %ashr to <4 x float> + ret <4 x float> %cast +} + +define amdgpu_ps <4 x i32> 
@s_sext_inreg_v8i16_5(<8 x i16> inreg %value) { +; GFX6-LABEL: s_sext_inreg_v8i16_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b32 s0, s0, 10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_lshl_b32 s1, s1, 26 +; GFX6-NEXT: s_lshl_b32 s4, s4, 10 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_lshl_b32 s2, s3, 26 +; GFX6-NEXT: s_lshl_b32 s6, s6, 10 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_lshl_b32 s3, s5, 26 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_lshl_b32 s4, s7, 26 +; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_v8i16_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_bfe_u32 s8, 10, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, s8 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshl_b32 s5, s5, s8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_v8i16_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s4, 
s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_lshl_b32 s2, s2, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x50005 +; GFX9-NEXT: s_lshl_b32 s4, s4, 5 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v8i16_5: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10PLUS-NEXT: 
s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 0x50005 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, 5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <8 x i16> %value, + %ashr = shl <8 x i16> %shl, ; FIXME: second shift is shl, not ashr, so no sign extension is exercised; fix and regenerate the assertions + %cast = bitcast <8 x i16> %ashr to <4 x i32> + ret <4 x i32> %cast +} + +define i64 @v_sext_inreg_i64_23(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_23: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_23: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 9 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 23 + %ashr = ashr i64 %shl, 23 + ret i64 %ashr +} + +define i64 @v_sext_inreg_i64_40(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_40: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_40: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; 
GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 40 + %ashr = ashr i64 %shl, 40 + ret i64 %ashr +} + +define i64 @v_sext_inreg_i64_63(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_63: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 63 + %ashr = ashr i64 %shl, 63 + ret i64 %ashr +} + +define i64 @v_sext_inreg_i64_33(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_33: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 33 + %ashr = ashr i64 %shl, 33 + ret i64 %ashr +} + +; Sign-extend the low 32 bits of an i64: shl by 32, then ashr of the shifted value by 32. +; NOTE(review): assertions below were updated by hand with the IR fix; confirm with utils/update_llc_test_checks.py. +define i64 @v_sext_inreg_i64_32(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 32 + %ashr = 
ashr i64 %shl, 32 + ret i64 %ashr +} + +define i64 @v_sext_inreg_i64_31(i64 %value) { +; GCN-LABEL: v_sext_inreg_i64_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i64 %value, 31 + %ashr = ashr i64 %shl, 31 + ret i64 %ashr +} + +define amdgpu_ps i64 @s_sext_inreg_i64_3(i64 inreg %value) { +; GCN-LABEL: s_sext_inreg_i64_3: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x3d0000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i64_3: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x3d0000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i64 %value, 3 + %ashr = ashr i64 %shl, 3 + ret i64 %ashr +} + +; Sign-extend bit 0 of an inreg i64: shl by 63, then ashr by 63. +; NOTE(review): assertions below were updated by hand with the IR fix; confirm with utils/update_llc_test_checks.py. +define amdgpu_ps i64 @s_sext_inreg_i64_63(i64 inreg %value) { +; GCN-LABEL: s_sext_inreg_i64_63: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i64 %value, 63 + %ashr = ashr i64 %shl, 63 + ret i64 %ashr +} + +define amdgpu_ps i64 @s_sext_inreg_i64_33(i64 inreg %value) { +; GCN-LABEL: s_sext_inreg_i64_33: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x1f0000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x1f0000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i64 %value, 33 + %ashr = ashr i64 %shl, 33 + ret i64 %ashr +} + +define amdgpu_ps i64 
@s_sext_inreg_i64_32(i64 inreg %value) { +; GCN-LABEL: s_sext_inreg_i64_32: +; GCN: ; %bb.0: +; GCN-NEXT: s_ashr_i32 s1, s0, 31 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s1, s0, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i64 %value, 32 + %ashr = ashr i64 %shl, 32 + ret i64 %ashr +} + +define amdgpu_ps i64 @s_sext_inreg_i64_31(i64 inreg %value) { +; GCN-LABEL: s_sext_inreg_i64_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x210000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x210000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i64 %value, 31 + %ashr = ashr i64 %shl, 31 + ret i64 %ashr +} + +define <2 x i64> @v_sext_inreg_v2i64_16(<2 x i64> %value) { +; GCN-LABEL: v_sext_inreg_v2i64_16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i64_16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i64> %value, + %ashr = ashr <2 x i64> %shl, + ret <2 x i64> %ashr +} + +define <2 x i64> @v_sext_inreg_v2i64_31(<2 x i64> %value) { +; GCN-LABEL: v_sext_inreg_v2i64_31: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_bfe_i32 v1, v0, 0, 1 +; GCN-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_v2i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: 
v_bfe_i32 v1, v0, 0, 1 +; GFX10PLUS-NEXT: v_bfe_i32 v3, v2, 0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl <2 x i64> %value, + %ashr = ashr <2 x i64> %shl, + ret <2 x i64> %ashr +} + +define amdgpu_ps <2 x i64> @s_sext_inreg_v2i64_30(<2 x i64> inreg %value) { +; GCN-LABEL: s_sext_inreg_v2i64_30: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x220000 +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x220000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_v2i64_30: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x220000 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x220000 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl <2 x i64> %value, + %ashr = ashr <2 x i64> %shl, + ret <2 x i64> %ashr +} + +define i65 @v_sext_inreg_i65_22(i65 %value) { +; GFX6-LABEL: v_sext_inreg_i65_22: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 22 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 0 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 10, v2 +; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 22 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_i65_22: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 10, v2 +; GFX8-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_i65_22: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v2 +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i65_22: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 22, v[2:3] +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 10, v1 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 0, v[0:1] +; GFX10PLUS-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX10PLUS-NEXT: v_bfe_u32 v1, v1, 0, 10 +; GFX10PLUS-NEXT: v_bfe_i32 v2, v2, 0, 1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v4, 10, v2 +; GFX10PLUS-NEXT: v_ashrrev_i64 v[2:3], 22, v[2:3] +; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i65 %value, 22 + %ashr = ashr i65 %shl, 22 + ret i65 %ashr +} + +define i65 @v_sext_inreg_i65_33(i65 %value) { +; GFX6-LABEL: v_sext_inreg_i65_33: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[1:2], 31 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_sext_inreg_i65_33: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX8-NEXT: v_lshlrev_b64 
v[0:1], 31, v[1:2] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sext_inreg_i65_33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10PLUS-LABEL: v_sext_inreg_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] + %shl = shl i65 %value, 33 + %ashr = ashr i65 %value, 33 ; FIXME: shifts %value instead of %shl, so %shl is unused and this is a plain ashr; fix and regenerate the assertions + ret i65 %ashr +} + +define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) { +; GCN-LABEL: s_sext_inreg_i65_18: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GCN-NEXT: s_lshr_b32 s4, s1, 14 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x2e0000 +; GCN-NEXT: s_lshl_b32 s7, s2, 14 +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GCN-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i65_18: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 14 +; GFX10PLUS-NEXT: s_mov_b32 s5, 0 +; GFX10PLUS-NEXT: s_bfe_u64 
s[0:1], s[0:1], 0x2e0000 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s6, s5 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s2, 14 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], 18 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i65 %value, 18 + %ashr = ashr i65 %shl, 18 + ret i65 %ashr +} + +define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) { +; GFX6-LABEL: s_sext_inreg_i65_33: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, 0 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_sext_inreg_i65_33: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s0, 1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, 0, s0 +; GFX8-NEXT: s_lshr_b32 s1, 0, s1 +; GFX8-NEXT: s_bfe_u32 s2, 3, 0x100000 +; GFX8-NEXT: s_and_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s2, 0, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 17 +; GFX8-NEXT: s_lshl_b32 s1, s1, 18 +; GFX8-NEXT: s_bfe_u32 s3, 4, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s3, 0, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 19 +; GFX8-NEXT: s_bfe_u32 s4, 5, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_lshr_b32 s4, 0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 20 +; GFX8-NEXT: s_bfe_u32 s5, 6, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 1 +; GFX8-NEXT: s_lshr_b32 s5, 0, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 21 +; GFX8-NEXT: s_bfe_u32 s6, 7, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s5, 1 +; GFX8-NEXT: s_lshr_b32 s6, 0, s6 +; GFX8-NEXT: s_lshl_b32 s1, s1, 22 +; GFX8-NEXT: s_bfe_u32 s7, 8, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s6, 1 +; GFX8-NEXT: s_lshr_b32 s7, 0, s7 +; GFX8-NEXT: s_lshl_b32 s1, s1, 
23 +; GFX8-NEXT: s_bfe_u32 s8, 9, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s7, 1 +; GFX8-NEXT: s_lshr_b32 s8, 0, s8 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_bfe_u32 s9, 10, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s8, 1 +; GFX8-NEXT: s_lshr_b32 s9, 0, s9 +; GFX8-NEXT: s_lshl_b32 s1, s1, 25 +; GFX8-NEXT: s_bfe_u32 s10, 11, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s9, 1 +; GFX8-NEXT: s_lshr_b32 s10, 0, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, 26 +; GFX8-NEXT: s_bfe_u32 s11, 12, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s10, 1 +; GFX8-NEXT: s_lshr_b32 s11, 0, s11 +; GFX8-NEXT: s_lshl_b32 s1, s1, 27 +; GFX8-NEXT: s_bfe_u32 s12, 13, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s11, 1 +; GFX8-NEXT: s_lshr_b32 s12, 0, s12 +; GFX8-NEXT: s_lshl_b32 s1, s1, 28 +; GFX8-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s12, 1 +; GFX8-NEXT: s_lshr_b32 s13, 0, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 29 +; GFX8-NEXT: s_bfe_u32 s14, 15, 0x100000 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s13, 1 +; GFX8-NEXT: s_lshr_b32 s14, 0, s14 +; GFX8-NEXT: s_lshl_b32 s1, s1, 30 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s14, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 31 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_mov_b32 s1, s0 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_sext_inreg_i65_33: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, 1, 0x100000 +; GFX9-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX9-NEXT: s_lshr_b32 s0, 0, s0 +; GFX9-NEXT: s_lshr_b32 s1, 0, s1 +; GFX9-NEXT: s_bfe_u32 s2, 3, 0x100000 +; GFX9-NEXT: s_and_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s2, 0, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 18 +; GFX9-NEXT: s_bfe_u32 s3, 4, 0x100000 +; 
GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s3, 0, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, 19 +; GFX9-NEXT: s_bfe_u32 s4, 5, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_lshr_b32 s4, 0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 20 +; GFX9-NEXT: s_bfe_u32 s5, 6, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s4, 1 +; GFX9-NEXT: s_lshr_b32 s5, 0, s5 +; GFX9-NEXT: s_lshl_b32 s1, s1, 21 +; GFX9-NEXT: s_bfe_u32 s6, 7, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s5, 1 +; GFX9-NEXT: s_lshr_b32 s6, 0, s6 +; GFX9-NEXT: s_lshl_b32 s1, s1, 22 +; GFX9-NEXT: s_bfe_u32 s7, 8, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s6, 1 +; GFX9-NEXT: s_lshr_b32 s7, 0, s7 +; GFX9-NEXT: s_lshl_b32 s1, s1, 23 +; GFX9-NEXT: s_bfe_u32 s8, 9, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s7, 1 +; GFX9-NEXT: s_lshr_b32 s8, 0, s8 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_bfe_u32 s9, 10, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s8, 1 +; GFX9-NEXT: s_lshr_b32 s9, 0, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 25 +; GFX9-NEXT: s_bfe_u32 s10, 11, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s9, 1 +; GFX9-NEXT: s_lshr_b32 s10, 0, s10 +; GFX9-NEXT: s_lshl_b32 s1, s1, 26 +; GFX9-NEXT: s_bfe_u32 s11, 12, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s10, 1 +; GFX9-NEXT: s_lshr_b32 s11, 0, s11 +; GFX9-NEXT: s_lshl_b32 s1, s1, 27 +; GFX9-NEXT: s_bfe_u32 s12, 13, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s11, 1 +; GFX9-NEXT: s_lshr_b32 s12, 0, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, 28 +; GFX9-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s12, 1 +; GFX9-NEXT: s_lshr_b32 s13, 0, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 29 +; GFX9-NEXT: s_bfe_u32 s14, 15, 0x100000 +; 
GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s13, 1 +; GFX9-NEXT: s_lshr_b32 s14, 0, s14 +; GFX9-NEXT: s_lshl_b32 s1, s1, 30 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s14, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 31 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10PLUS-LABEL: s_sext_inreg_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s0, 1, 0x100000 +; GFX10PLUS-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s0, 0, s0 +; GFX10PLUS-NEXT: s_bfe_u32 s2, 3, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s1, 0, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s2, 0, s2 +; GFX10PLUS-NEXT: s_bfe_u32 s3, 4, 0x100000 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 1 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s4, 5, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s3, 0, s3 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 17 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 18 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s4, 0, s4 +; GFX10PLUS-NEXT: s_bfe_u32 s5, 6, 0x100000 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 19 +; GFX10PLUS-NEXT: s_and_b32 s2, s3, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s6, 7, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s5, 0, s5 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 20 +; GFX10PLUS-NEXT: s_and_b32 s2, s4, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s6, 0, s6 +; GFX10PLUS-NEXT: s_bfe_u32 s7, 8, 0x100000 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 21 +; GFX10PLUS-NEXT: s_and_b32 s2, s5, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s8, 9, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s7, 0, s7 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 22 +; GFX10PLUS-NEXT: s_and_b32 s2, s6, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s8, 0, s8 +; GFX10PLUS-NEXT: s_bfe_u32 s9, 10, 0x100000 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: 
s_lshl_b32 s1, s2, 23 +; GFX10PLUS-NEXT: s_and_b32 s2, s7, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s10, 11, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s9, 0, s9 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 24 +; GFX10PLUS-NEXT: s_and_b32 s2, s8, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s10, 0, s10 +; GFX10PLUS-NEXT: s_bfe_u32 s11, 12, 0x100000 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 25 +; GFX10PLUS-NEXT: s_and_b32 s2, s9, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s12, 13, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s11, 0, s11 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 26 +; GFX10PLUS-NEXT: s_and_b32 s2, s10, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s12, 0, s12 +; GFX10PLUS-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 27 +; GFX10PLUS-NEXT: s_and_b32 s2, s11, 1 +; GFX10PLUS-NEXT: s_bfe_u32 s14, 15, 0x100000 +; GFX10PLUS-NEXT: s_lshr_b32 s13, 0, s13 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 28 +; GFX10PLUS-NEXT: s_and_b32 s2, s12, 1 +; GFX10PLUS-NEXT: s_lshr_b32 s14, 0, s14 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 29 +; GFX10PLUS-NEXT: s_and_b32 s2, s13, 1 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 30 +; GFX10PLUS-NEXT: s_and_b32 s2, s14, 1 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 31 +; GFX10PLUS-NEXT: s_mov_b32 s2, 0 +; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog + %shl = shl i65 %value, 33 + %ashr = shl i65 %shl, 33 ; FIXME: second shift is shl, not ashr, so no sign extension is exercised; fix and regenerate the assertions + ret i65 %ashr +} + +; FIXME: Argument lowering asserts +; define <2 x i65> @v_sext_inreg_v2i65_36(<2 x i65> %value) { +; %shl = shl <2 x i65> %value, +; %ashr = ashr <2 x i65> %shl, +; ret <2 x i65> %ashr +; } + +; define amdgpu_ps <2 x i65> @s_sext_inreg_v2i65_36(<2 x i65> inreg %value) { +; %shl = shl <2 x i65> 
%value, +; %ashr = ashr <2 x i65> %shl, +; ret <2 x i65> %ashr +; } -- 2.7.4