These tests had been commented out, but they no longer appear to crash.
I'm not sure the codegen is perfect in each of them, but even if it isn't, it's better to add a TODO to fix the codegen than to remove the test outright, unless the codegen is plain wrong (and even then I'd rather XFAIL the test than hide it).
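For reference, a lit test can be marked as an expected failure instead of being deleted by adding an XFAIL directive next to its RUN line; a minimal sketch (the RUN line below is illustrative, not taken from this patch):

    ; XFAIL: *
    ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s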
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D136341
ret <2 x i16> %and
}
-; FIXME:
-; define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %and = and <3 x i16> %src0, %not.src1
-; %cast = bitcast <3 x i16> %and to i48
-; ret i48 %cast
-; }
-
-; define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %and = and <3 x i16> %not.src1, %src0
-; %cast = bitcast <3 x i16> %and to i48
-; ret i48 %cast
-; }
-
-; define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %and = and <3 x i16> %src0, %not.src1
-
-; %cast.0 = bitcast <3 x i16> %and to i48
-; %cast.1 = bitcast <3 x i16> %not.src1 to i48
-; %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
-; %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
-; ret { i48, i48 } %insert.1
-; }
-
-; define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
-; %and = and <3 x i16> %src0, %not.src1
-; ret <3 x i16> %and
-; }
+
+define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s4, s5, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s4
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %and = and <3 x i16> %src0, %not.src1
+ %cast = bitcast <3 x i16> %and to i48
+ ret i48 %cast
+}
+
+define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16_commute:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s5
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16_commute:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %and = and <3 x i16> %not.src1, %src0
+ %cast = bitcast <3 x i16> %and to i48
+ ret i48 %cast
+}
+
+define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_andn2_v3i16_multi_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_and_b32 s7, s4, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_or_b32 s6, s2, s3
+; GFX6-NEXT: s_or_b32 s2, s4, s5
+; GFX6-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX6-NEXT: s_and_b64 s[0:1], s[6:7], s[2:3]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_or_b32 s2, s4, s5
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_andn2_v3i16_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_lshr_b32 s6, s4, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: s_lshl_b32 s3, s6, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_andn2_v3i16_multi_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, 16
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %and = and <3 x i16> %src0, %not.src1
+ %cast.0 = bitcast <3 x i16> %and to i48
+ %cast.1 = bitcast <3 x i16> %not.src1 to i48
+ %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
+ %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
+ ret { i48, i48 } %insert.1
+}
+
+define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
+; GFX6-LABEL: v_andn2_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_xor_b32_e32 v4, 0xfff5, v4
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, v1, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_andn2_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, -11, v3
+; GFX9-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_andn2_v3i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -11, v3
+; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
+ %and = and <3 x i16> %src0, %not.src1
+ ret <3 x i16> %and
+}
define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v4i16:
ret <2 x i16> %bswap
}
-; FIXME
-; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
-; %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %ext.src)
-; ret <3 x i16> %bswap
-; }
+define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
+; GFX7-LABEL: v_bswap_v3i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v1
+; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_bswap_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_mov_b32 s4, 0x2030001
+; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_bswap_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x2030001
+; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_bswap_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x2030001
+; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x2030001
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
+ ret <3 x i16> %bswap
+}
define i64 @v_bswap_i48(i64 %src) {
; GFX7-LABEL: v_bswap_i48:
ret void
}
-; FIXME:
-; define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid = call i32 @llvm.amdgcn.workitem.id.x()
-; %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
-; %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
-; %cvt = uitofp <2 x i8> %load to <2 x float>
-; store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v2i8_to_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v2i8_to_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v1, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
+ %cvt = uitofp <2 x i8> %load to <2 x float>
+ store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
+ ret void
+}
-; FIXME:
-; define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid = call i32 @llvm.amdgcn.workitem.id.x()
-; %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
-; %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
-; %cvt = uitofp <3 x i8> %load to <3 x float>
-; store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v3i8_to_v3f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT: v_bfe_u32 v3, v0, 16, 8
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v3i8_to_v3f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
+; VI-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
+ %cvt = uitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
+ ret void
+}
-; define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid = call i32 @llvm.amdgcn.workitem.id.x()
-; %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-; %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
-; %cvt = uitofp <4 x i8> %load to <4 x float>
-; store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v0
+; SI-NEXT: v_bfe_u32 v2, v0, 8, 8
+; SI-NEXT: v_bfe_u32 v4, v0, 16, 8
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ ret void
+}
; This should not be adding instructions to shift into the correct
; position in the word for the component.
ret void
}
-; FIXME: Need to handle non-uniform case for function below (load without gep).
-; Instructions still emitted to repack bytes for add use.
-; define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
-; %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
-; %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
-; %cvt = uitofp <4 x i8> %load to <4 x float>
-; store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-; %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
-; store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
-; ret void
-; }
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v4i8_to_v4f32_2_uses:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v0
+; SI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_add_i32_e32 v6, vcc, 9, v0
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v1
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v2
+; SI-NEXT: v_add_i32_e32 v8, vcc, 9, v1
+; SI-NEXT: v_add_i32_e32 v9, vcc, 9, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v7
+; SI-NEXT: v_and_b32_e32 v5, 0xff, v8
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT: v_and_b32_e32 v7, 0xff, v9
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7
+; SI-NEXT: v_or_b32_e32 v0, v6, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v4
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v4i8_to_v4f32_2_uses:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v6, 8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v1, v[0:1]
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 9
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
+; VI-NEXT: v_add_u16_e32 v9, 9, v1
+; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v7, 9, v7
+; VI-NEXT: v_add_u16_e32 v8, 9, v8
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v2, v0, v2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+ %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
+ %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
+ %cvt = uitofp <4 x i8> %load to <4 x float>
+ store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+ %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
+ store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
+ ret void
+}
-; Make sure this doesn't crash.
-; FIXME:
-; define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid = call i32 @llvm.amdgcn.workitem.id.x()
-; %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
-; %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
-; %cvt = uitofp <7 x i8> %load to <7 x float>
-; store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v7i8_to_v7f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
+; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
+; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
+; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v7i8_to_v7f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[2:3]
+; VI-NEXT: flat_load_ubyte v2, v[4:5]
+; VI-NEXT: flat_load_ubyte v3, v[6:7]
+; VI-NEXT: flat_load_ubyte v4, v[8:9]
+; VI-NEXT: flat_load_ubyte v5, v[10:11]
+; VI-NEXT: flat_load_ubyte v6, v[12:13]
+; VI-NEXT: v_mov_b32_e32 v8, s1
+; VI-NEXT: v_mov_b32_e32 v7, s0
+; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v10, s1
+; VI-NEXT: v_mov_b32_e32 v9, s0
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
+; VI-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; VI-NEXT: flat_store_dwordx3 v[9:10], v[4:6]
+; VI-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
+ %cvt = uitofp <7 x i8> %load to <7 x float>
+ store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
+ ret void
+}
-; FIXME
-; define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
-; %tid = call i32 @llvm.amdgcn.workitem.id.x()
-; %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
-; %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
-; %cvt = uitofp <8 x i8> %load to <8 x float>
-; store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
-; ret void
-; }
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+; SI-LABEL: load_v8i8_to_v8f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_and_b32_e32 v2, 0xff, v0
+; SI-NEXT: v_bfe_u32 v4, v0, 8, 8
+; SI-NEXT: v_bfe_u32 v5, v0, 16, 8
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
+; SI-NEXT: v_and_b32_e32 v6, 0xff, v1
+; SI-NEXT: v_bfe_u32 v8, v1, 8, 8
+; SI-NEXT: v_bfe_u32 v9, v1, 16, 8
+; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v1
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v8
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: load_v8i8_to_v8f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v11, s1
+; VI-NEXT: v_mov_b32_e32 v10, s0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v6
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v7
+; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; VI-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
+ %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
+ %cvt = uitofp <8 x i8> %load to <8 x float>
+ store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
+ ret void
+}
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
ret <2 x half> %fma
}
-; FIXME:
-; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
-; %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z)
-; ret <3 x half> %fma
-; }
+define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
+; GFX6-LABEL: v_fma_v3f16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_fma_f32 v0, v0, v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v8
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_fma_f32 v1, v1, v3, v4
+; GFX6-NEXT: v_fma_f32 v2, v2, v5, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v3f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
+; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8
+; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fma_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v3f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fma_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z)
+ ret <3 x half> %fma
+}
define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
; GFX6-LABEL: v_fma_v4f16:
ret <2 x half> %mul
}
-; FIXME
-; define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
-; %mul = fmul <3 x half> %a, %b
-; ret <3 x half> %mul
-; }
+define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
+; GFX9-LABEL: v_fmul_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %mul = fmul <3 x half> %a, %b
+ ret <3 x half> %mul
+}
-; define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
-; %neg.a = fneg <3 x half> %a
-; %mul = fmul <3 x half> %neg.a, %b
-; ret <3 x half> %mul
-; }
+define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
+; GFX9-LABEL: v_fmul_v3f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3f16_fneg_lhs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <3 x half> %a
+ %mul = fmul <3 x half> %neg.a, %b
+ ret <3 x half> %mul
+}
-; define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
-; %neg.b = fneg <3 x half> %b
-; %mul = fmul <3 x half> %a, %neg.b
-; ret <3 x half> %mul
-; }
+define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
+; GFX9-LABEL: v_fmul_v3f16_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3f16_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
+; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3f16_fneg_rhs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %neg.b = fneg <3 x half> %b
+ %mul = fmul <3 x half> %a, %neg.b
+ ret <3 x half> %mul
+}
-; define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
-; %neg.a = fneg <3 x half> %a
-; %neg.b = fneg <3 x half> %b
-; %mul = fmul <3 x half> %neg.a, %neg.b
-; ret <3 x half> %mul
-; }
+define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
+; GFX9-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %neg.a = fneg <3 x half> %a
+ %neg.b = fneg <3 x half> %b
+ %mul = fmul <3 x half> %neg.a, %neg.b
+ ret <3 x half> %mul
+}
define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
; GFX9-LABEL: v_fmul_v4f16:
ret float %cast
}
-; ; FIXME
-; define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
-; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
-; %cast = bitcast <3 x i16> %result to i48
-; ret i48 %cast
-; }
-; ; FIXME
-; define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
-; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
-; %cast.result = bitcast <3 x i16> %result to <3 x half>
-; ret <3 x half> %cast.result
-; }
+define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshl_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s9, s6, 15
+; GFX6-NEXT: s_andn2_b32 s6, 15, s6
+; GFX6-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s9
+; GFX6-NEXT: s_lshr_b32 s3, s3, s6
+; GFX6-NEXT: s_or_b32 s0, s0, s3
+; GFX6-NEXT: s_and_b32 s3, s7, 15
+; GFX6-NEXT: s_andn2_b32 s6, 15, s7
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT: s_lshl_b32 s1, s1, s3
+; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000
+; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_or_b32 s1, s1, s3
+; GFX6-NEXT: s_and_b32 s3, s8, 15
+; GFX6-NEXT: s_andn2_b32 s4, 15, s8
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT: s_lshl_b32 s2, s2, s3
+; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s9, s4, 15
+; GFX8-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
+; GFX8-NEXT: s_lshr_b32 s7, s2, 16
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_lshl_b32 s0, s0, s9
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_bfe_u32 s9, 1, 0x100000
+; GFX8-NEXT: s_lshr_b32 s2, s2, s9
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshr_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s8, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s8
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_lshl_b32 s2, s6, s2
+; GFX8-NEXT: s_lshr_b32 s6, s7, s9
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_and_b32 s4, s5, 15
+; GFX8-NEXT: s_andn2_b32 s5, 15, s5
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT: s_lshl_b32 s1, s1, s4
+; GFX8-NEXT: s_lshr_b32 s3, s3, s9
+; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000
+; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshl_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f
+; GFX9-NEXT: s_lshr_b32 s7, s0, 16
+; GFX9-NEXT: s_lshr_b32 s8, s6, 16
+; GFX9-NEXT: s_lshl_b32 s0, s0, s6
+; GFX9-NEXT: s_lshl_b32 s6, s7, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
+; GFX9-NEXT: s_lshr_b32 s6, s2, 16
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: s_lshr_b32 s2, s2, 0x10001
+; GFX9-NEXT: s_lshr_b32 s6, s6, 1
+; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
+; GFX9-NEXT: s_lshr_b32 s6, s2, 16
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: s_lshr_b32 s7, s4, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, s4
+; GFX9-NEXT: s_lshr_b32 s4, s6, s7
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f
+; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5
+; GFX9-NEXT: s_lshr_b32 s5, s1, 16
+; GFX9-NEXT: s_lshr_b32 s6, s2, 16
+; GFX9-NEXT: s_lshl_b32 s1, s1, s2
+; GFX9-NEXT: s_lshl_b32 s2, s5, s6
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT: s_lshr_b32 s2, s3, 16
+; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT: s_lshr_b32 s3, s3, 0x10001
+; GFX9-NEXT: s_lshr_b32 s2, s2, 1
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s3, s2, 16
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: s_lshr_b32 s5, s4, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, s4
+; GFX9-NEXT: s_lshr_b32 s3, s3, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshl_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: s_and_b32 s6, s4, 0xf000f
+; GFX10-NEXT: s_lshr_b32 s9, s9, 0x10001
+; GFX10-NEXT: s_lshr_b32 s2, s2, 1
+; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4
+; GFX10-NEXT: s_lshr_b32 s7, s0, 16
+; GFX10-NEXT: s_lshr_b32 s8, s6, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s9, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s6
+; GFX10-NEXT: s_lshl_b32 s6, s7, s8
+; GFX10-NEXT: s_lshr_b32 s7, s2, 16
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: s_lshr_b32 s8, s4, 16
+; GFX10-NEXT: s_lshr_b32 s2, s2, s4
+; GFX10-NEXT: s_lshr_b32 s4, s7, s8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT: s_and_b32 s7, s3, 0xffff
+; GFX10-NEXT: s_lshr_b32 s3, s3, 16
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_and_b32 s2, s5, 0xf000f
+; GFX10-NEXT: s_lshr_b32 s7, s7, 0x10001
+; GFX10-NEXT: s_lshr_b32 s3, s3, 1
+; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s5
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
+; GFX10-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-NEXT: s_lshl_b32 s1, s1, s2
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s7, s3
+; GFX10-NEXT: s_lshl_b32 s3, s5, s6
+; GFX10-NEXT: s_lshr_b32 s5, s2, 16
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: s_lshr_b32 s6, s4, 16
+; GFX10-NEXT: s_lshr_b32 s2, s2, s4
+; GFX10-NEXT: s_lshr_b32 s4, s5, s6
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: s_or_b32 s1, s1, s2
+; GFX10-NEXT: s_or_b32 s0, s0, s3
+; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fshl_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_and_b32 s6, s4, 0xf000f
+; GFX11-NEXT: s_lshr_b32 s9, s9, 0x10001
+; GFX11-NEXT: s_lshr_b32 s2, s2, 1
+; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4
+; GFX11-NEXT: s_lshr_b32 s7, s0, 16
+; GFX11-NEXT: s_lshr_b32 s8, s6, 16
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s9, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshl_b32 s6, s7, s8
+; GFX11-NEXT: s_lshr_b32 s7, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshr_b32 s8, s4, 16
+; GFX11-NEXT: s_lshr_b32 s2, s2, s4
+; GFX11-NEXT: s_lshr_b32 s4, s7, s8
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-NEXT: s_and_b32 s7, s3, 0xffff
+; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_and_b32 s2, s5, 0xf000f
+; GFX11-NEXT: s_lshr_b32 s7, s7, 0x10001
+; GFX11-NEXT: s_lshr_b32 s3, s3, 1
+; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s5
+; GFX11-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-NEXT: s_lshr_b32 s6, s2, 16
+; GFX11-NEXT: s_lshl_b32 s1, s1, s2
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s7, s3
+; GFX11-NEXT: s_lshl_b32 s3, s5, s6
+; GFX11-NEXT: s_lshr_b32 s5, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshr_b32 s6, s4, 16
+; GFX11-NEXT: s_lshr_b32 s2, s2, s4
+; GFX11-NEXT: s_lshr_b32 s4, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s0, s0, s3
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+ %cast = bitcast <3 x i16> %result to i48
+ ret i48 %cast
+}
+
+define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
+; GFX6-LABEL: v_fshl_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v9, 15, v6
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
+; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v7
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
+; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15
+; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshl_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v4
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8
+; GFX8-NEXT: v_or_b32_e32 v4, v7, v4
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v6
+; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v7, 1
+; GFX8-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshl_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5
+; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5
+; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshl_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
+; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5
+; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6
+; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
+; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1
+; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2
+; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fshl_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX11-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6
+; GFX11-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7
+; GFX11-NEXT: v_pk_lshlrev_b16 v0, v4, v0
+; GFX11-NEXT: v_pk_lshlrev_b16 v1, v5, v1
+; GFX11-NEXT: v_pk_lshrrev_b16 v2, v6, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_lshrrev_b16 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+ %cast.result = bitcast <3 x i16> %result to <3 x half>
+ ret <3 x half> %cast.result
+}
define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
; GFX6-LABEL: s_fshl_v4i16:
ret float %cast
}
-; ; FIXME
-; define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
-; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
-; %cast = bitcast <3 x i16> %result to i48
-; ret i48 %cast
-; }
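+; Scalar funnel shift right on <3 x i16>; the result is bitcast to i48 so
+; all three lanes are verified in the returned value.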
+define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshr_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_lshl_b32 s7, s7, 16
+; GFX6-NEXT: s_or_b32 s6, s6, s7
+; GFX6-NEXT: s_and_b32 s7, s8, 0xffff
+; GFX6-NEXT: s_bfe_u32 s8, 1, 0x100000
+; GFX6-NEXT: s_bfe_u32 s9, s3, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s10, 14, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s8
+; GFX6-NEXT: s_lshr_b32 s9, s9, s10
+; GFX6-NEXT: s_or_b32 s0, s0, s9
+; GFX6-NEXT: s_bfe_u32 s9, s4, 0xf0001
+; GFX6-NEXT: s_lshl_b32 s1, s1, s8
+; GFX6-NEXT: s_lshr_b32 s9, s9, s10
+; GFX6-NEXT: s_xor_b32 s6, s6, -1
+; GFX6-NEXT: s_or_b32 s1, s1, s9
+; GFX6-NEXT: s_lshl_b32 s3, s3, 1
+; GFX6-NEXT: s_lshr_b32 s9, s6, 16
+; GFX6-NEXT: s_and_b32 s11, s6, 15
+; GFX6-NEXT: s_andn2_b32 s6, 15, s6
+; GFX6-NEXT: s_bfe_u32 s11, s11, 0x100000
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s11
+; GFX6-NEXT: s_lshr_b32 s3, s3, s6
+; GFX6-NEXT: s_or_b32 s0, s0, s3
+; GFX6-NEXT: s_and_b32 s3, s9, 15
+; GFX6-NEXT: s_lshl_b32 s4, s4, 1
+; GFX6-NEXT: s_andn2_b32 s6, 15, s9
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT: s_lshl_b32 s1, s1, s3
+; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s4, s6, 0x100000
+; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_or_b32 s1, s1, s3
+; GFX6-NEXT: s_bfe_u32 s3, s5, 0xf0001
+; GFX6-NEXT: s_lshl_b32 s2, s2, s8
+; GFX6-NEXT: s_lshr_b32 s3, s3, s10
+; GFX6-NEXT: s_xor_b32 s4, s7, -1
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_lshl_b32 s3, s5, 1
+; GFX6-NEXT: s_and_b32 s5, s4, 15
+; GFX6-NEXT: s_andn2_b32 s4, 15, s4
+; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: s_lshl_b32 s2, s2, s5
+; GFX6-NEXT: s_lshr_b32 s3, s3, s4
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s8, s4, 16
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: s_or_b32 s4, s4, s8
+; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000
+; GFX8-NEXT: s_bfe_u32 s10, 15, 0x100000
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
+; GFX8-NEXT: s_lshr_b32 s7, s2, 16
+; GFX8-NEXT: s_lshl_b32 s0, s0, s8
+; GFX8-NEXT: s_lshr_b32 s9, s9, s10
+; GFX8-NEXT: s_or_b32 s0, s0, s9
+; GFX8-NEXT: s_lshl_b32 s6, s6, s8
+; GFX8-NEXT: s_lshr_b32 s9, s7, s10
+; GFX8-NEXT: s_lshl_b32 s2, s2, s8
+; GFX8-NEXT: s_xor_b32 s4, s4, -1
+; GFX8-NEXT: s_or_b32 s6, s6, s9
+; GFX8-NEXT: s_lshr_b32 s9, s4, 16
+; GFX8-NEXT: s_and_b32 s11, s4, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000
+; GFX8-NEXT: s_lshr_b32 s2, s2, s8
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s11
+; GFX8-NEXT: s_lshr_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, s9, 15
+; GFX8-NEXT: s_lshl_b32 s7, s7, s8
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_andn2_b32 s4, 15, s9
+; GFX8-NEXT: s_lshl_b32 s2, s6, s2
+; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000
+; GFX8-NEXT: s_lshr_b32 s6, s6, s8
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshr_b32 s4, s6, s4
+; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_bfe_u32 s4, s3, 0x100000
+; GFX8-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX8-NEXT: s_lshl_b32 s1, s1, s8
+; GFX8-NEXT: s_lshr_b32 s4, s4, s10
+; GFX8-NEXT: s_or_b32 s1, s1, s4
+; GFX8-NEXT: s_lshl_b32 s3, s3, s8
+; GFX8-NEXT: s_xor_b32 s4, s5, -1
+; GFX8-NEXT: s_and_b32 s5, s4, 15
+; GFX8-NEXT: s_andn2_b32 s4, 15, s4
+; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX8-NEXT: s_lshr_b32 s3, s3, s8
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshl_b32 s1, s1, s5
+; GFX8-NEXT: s_lshr_b32 s3, s3, s4
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshr_b32 s7, s0, 16
+; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT: s_lshl_b32 s7, s7, 1
+; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f
+; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7
+; GFX9-NEXT: s_lshr_b32 s7, s0, 16
+; GFX9-NEXT: s_lshr_b32 s8, s4, 16
+; GFX9-NEXT: s_lshl_b32 s0, s0, s4
+; GFX9-NEXT: s_lshl_b32 s4, s7, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT: s_lshr_b32 s4, s2, 16
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: s_lshr_b32 s7, s6, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, s6
+; GFX9-NEXT: s_lshr_b32 s4, s4, s7
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f
+; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5
+; GFX9-NEXT: s_lshr_b32 s5, s1, 16
+; GFX9-NEXT: s_lshl_b32 s1, s1, 0x10001
+; GFX9-NEXT: s_lshl_b32 s5, s5, 1
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT: s_lshr_b32 s5, s1, 16
+; GFX9-NEXT: s_lshr_b32 s6, s4, 16
+; GFX9-NEXT: s_lshl_b32 s1, s1, s4
+; GFX9-NEXT: s_lshl_b32 s4, s5, s6
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT: s_lshr_b32 s4, s3, 16
+; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_lshr_b32 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s3, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX9-NEXT: s_or_b32 s1, s1, s2
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s6, s0, 16
+; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT: s_lshl_b32 s6, s6, 1
+; GFX10-NEXT: s_and_b32 s7, s4, 0xf000f
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6
+; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4
+; GFX10-NEXT: s_lshr_b32 s6, s0, 16
+; GFX10-NEXT: s_lshr_b32 s8, s4, 16
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_lshl_b32 s4, s6, s8
+; GFX10-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: s_lshr_b32 s8, s7, 16
+; GFX10-NEXT: s_lshr_b32 s2, s2, s7
+; GFX10-NEXT: s_lshr_b32 s6, s6, s8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6
+; GFX10-NEXT: s_and_b32 s4, s5, 0xf000f
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s2, s1, 16
+; GFX10-NEXT: s_lshl_b32 s1, s1, 0x10001
+; GFX10-NEXT: s_lshl_b32 s2, s2, 1
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s5
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
+; GFX10-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-NEXT: s_lshl_b32 s1, s1, s2
+; GFX10-NEXT: s_lshl_b32 s2, s5, s6
+; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-NEXT: s_lshr_b32 s6, s4, 16
+; GFX10-NEXT: s_lshr_b32 s3, s3, s4
+; GFX10-NEXT: s_lshr_b32 s4, s5, s6
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: s_or_b32 s1, s1, s2
+; GFX10-NEXT: s_or_b32 s0, s0, s3
+; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fshr_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s6, s0, 16
+; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001
+; GFX11-NEXT: s_lshl_b32 s6, s6, 1
+; GFX11-NEXT: s_and_b32 s7, s4, 0xf000f
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6
+; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4
+; GFX11-NEXT: s_lshr_b32 s6, s0, 16
+; GFX11-NEXT: s_lshr_b32 s8, s4, 16
+; GFX11-NEXT: s_lshl_b32 s0, s0, s4
+; GFX11-NEXT: s_lshl_b32 s4, s6, s8
+; GFX11-NEXT: s_lshr_b32 s6, s2, 16
+; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_lshr_b32 s8, s7, 16
+; GFX11-NEXT: s_lshr_b32 s2, s2, s7
+; GFX11-NEXT: s_lshr_b32 s6, s6, s8
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6
+; GFX11-NEXT: s_and_b32 s4, s5, 0xf000f
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-NEXT: s_lshl_b32 s1, s1, 0x10001
+; GFX11-NEXT: s_lshl_b32 s2, s2, 1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-NEXT: s_and_not1_b32 s2, 0xf000f, s5
+; GFX11-NEXT: s_lshr_b32 s5, s1, 16
+; GFX11-NEXT: s_lshr_b32 s6, s2, 16
+; GFX11-NEXT: s_lshl_b32 s1, s1, s2
+; GFX11-NEXT: s_lshl_b32 s2, s5, s6
+; GFX11-NEXT: s_lshr_b32 s5, s3, 16
+; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT: s_lshr_b32 s6, s4, 16
+; GFX11-NEXT: s_lshr_b32 s3, s3, s4
+; GFX11-NEXT: s_lshr_b32 s4, s5, s6
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX11-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: s_or_b32 s1, s1, s2
+; GFX11-NEXT: s_or_b32 s0, s0, s3
+; GFX11-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+ %cast = bitcast <3 x i16> %result to i48
+ ret i48 %cast
+}
-; ; FIXME
-; define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
-; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
-; %cast.result = bitcast <3 x i16> %result to <3 x half>
-; ret <3 x half> %cast.result
-; }
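+; VALU version of the <3 x i16> funnel shift right; the result is bitcast
+; to <3 x half> for the return.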
+define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
+; GFX6-LABEL: v_fshr_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT: v_bfe_u32 v8, v3, 1, 15
+; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX6-NEXT: v_bfe_u32 v8, v4, 1, 15
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, s5, v8
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v8
+; GFX6-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX6-NEXT: v_and_b32_e32 v9, 15, v6
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT: v_bfe_u32 v9, v9, 0, 16
+; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
+; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v8
+; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v6
+; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v3, v1
+; GFX6-NEXT: v_bfe_u32 v3, v4, 1, 15
+; GFX6-NEXT: v_bfe_u32 v4, v6, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_bfe_u32 v3, v5, 1, 15
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, s5, v3
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 15, v4
+; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_u32 v3, v3, 1, 15
+; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v8, 1
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 15, v4
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v7, v9, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v8
+; GFX8-NEXT: v_xor_b32_e32 v7, -1, v8
+; GFX8-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, -1
+; GFX8-NEXT: v_xor_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v4
+; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1
+; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v6, 0xf000f, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2
+; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v5
+; GFX9-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, v4, v1
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, v2, v3
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5
+; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7
+; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2
+; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fshr_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5
+; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4
+; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6
+; GFX11-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7
+; GFX11-NEXT: v_pk_lshrrev_b16 v2, v4, v2
+; GFX11-NEXT: v_pk_lshrrev_b16 v3, v5, v3
+; GFX11-NEXT: v_pk_lshlrev_b16 v0, v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_lshlrev_b16 v1, v7, v1
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
+ %cast.result = bitcast <3 x i16> %result to <3 x half>
+ ret <3 x half> %cast.result
+}
define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) {
; GFX6-LABEL: s_fshr_v4i16:
$vgpr0 = COPY %5
...
-# FIXME
-# ---
-# name: test_add_s33
-# body: |
-# bb.0:
-# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-
-# %0:_(s64) = COPY $vgpr0_vgpr1
-# %1:_(s64) = COPY $vgpr2_vgpr3
-# %2:_(s33) = G_TRUNC %0
-# %3:_(s33) = G_TRUNC %1
-# %4:_(s33) = G_ADD %2, %3
-# %5:_(s64) = G_ANYEXT %4
-# $vgpr0_vgpr1 = COPY %5
-# ...
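+# The s33 add is widened: the operands are extended to s64 and added as two
+# 32-bit halves through a carry chain (G_UADDO/G_UADDE), as the checks below show.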
+---
+name: test_add_s33
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX6-LABEL: name: test_add_s33
+ ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
+ ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
+ ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; GFX8-LABEL: name: test_add_s33
+ ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
+ ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
+ ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; GFX9-LABEL: name: test_add_s33
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
+ ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s33) = G_TRUNC %0
+ %3:_(s33) = G_TRUNC %1
+ %4:_(s33) = G_ADD %2, %3
+ %5:_(s64) = G_ANYEXT %4
+ $vgpr0_vgpr1 = COPY %5
+...
---
name: test_add_s96
$vgpr0 = COPY %5
...
-# FIXME
-# ---
-# name: test_sub_s33
-# body: |
-# bb.0:
-# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-
-# %0:_(s64) = COPY $vgpr0_vgpr1
-# %1:_(s64) = COPY $vgpr2_vgpr3
-# %2:_(s33) = G_TRUNC %0
-# %3:_(s33) = G_TRUNC %1
-# %4:_(s33) = G_SUB %2, %3
-# %5:_(s64) = G_ANYEXT %4
-# $vgpr0_vgpr1 = COPY %5
-# ...
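+# As in test_add_s33, the s33 subtract is widened to s64 and lowered to a
+# 32-bit borrow chain (G_USUBO/G_USUBE).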
+---
+name: test_sub_s33
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX6-LABEL: name: test_sub_s33
+ ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX6-NEXT: {{ $}}
+ ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX6-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX6-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX6-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
+ ; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
+ ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; GFX8-LABEL: name: test_sub_s33
+ ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX8-NEXT: {{ $}}
+ ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX8-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX8-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
+ ; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
+ ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; GFX9-LABEL: name: test_sub_s33
+ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; GFX9-NEXT: {{ $}}
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
+ ; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
+ ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
+ ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = COPY $vgpr2_vgpr3
+ %2:_(s33) = G_TRUNC %0
+ %3:_(s33) = G_TRUNC %1
+ %4:_(s33) = G_SUB %2, %3
+ %5:_(s64) = G_ANYEXT %4
+ $vgpr0_vgpr1 = COPY %5
+...
---
name: test_sub_s96
ret <2 x i16> %or
}
-; FIXME:
-; define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %or = or <3 x i16> %src0, %not.src1
-; %cast = bitcast <3 x i16> %or to i48
-; ret i48 %cast
-; }
-
-; define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %or = or <3 x i16> %not.src1, %src0
-; %cast = bitcast <3 x i16> %or to i48
-; ret i48 %cast
-; }
-
-; define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
-; %or = or <3 x i16> %src0, %not.src1
-
-; %cast.0 = bitcast <3 x i16> %or to i48
-; %cast.1 = bitcast <3 x i16> %not.src1 to i48
-; %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
-; %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
-; ret { i48, i48 } %insert.1
-; }
-
-; define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
-; %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
-; %or = or <3 x i16> %src0, %not.src1
-; ret <3 x i16> %or
-; }
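+; The <3 x i16> or-not is expanded as a 64-bit s_xor with -1 followed by
+; s_or on the SGPR pair; the not is not folded into s_orn2 here.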
+define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_orn2_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s4, s5, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s4
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_orn2_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_orn2_v3i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %or = or <3 x i16> %src0, %not.src1
+ %cast = bitcast <3 x i16> %or to i48
+ ret i48 %cast
+}
+
+define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_orn2_v3i16_commute:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s5
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_or_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s3, s4, 0xffff
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_orn2_v3i16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_orn2_v3i16_commute:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %or = or <3 x i16> %not.src1, %src0
+ %cast = bitcast <3 x i16> %or to i48
+ ret i48 %cast
+}
+
+define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
+; GFX6-LABEL: s_orn2_v3i16_multi_use:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX6-NEXT: s_mov_b32 s0, -1
+; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX6-NEXT: s_lshl_b32 s6, s6, 16
+; GFX6-NEXT: s_mov_b32 s1, 0xffff
+; GFX6-NEXT: s_or_b32 s6, s5, s6
+; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
+; GFX6-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; GFX6-NEXT: s_lshr_b32 s5, s0, 16
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_and_b32 s7, s4, 0xffff
+; GFX6-NEXT: s_and_b32 s4, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s5, s5, 16
+; GFX6-NEXT: s_or_b32 s6, s2, s3
+; GFX6-NEXT: s_or_b32 s2, s4, s5
+; GFX6-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX6-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3]
+; GFX6-NEXT: s_lshr_b32 s2, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s2
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_or_b32 s2, s4, s5
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_orn2_v3i16_multi_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b64 s[0:1], -1
+; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: s_lshr_b32 s6, s4, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: s_lshl_b32 s3, s6, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_orn2_v3i16_multi_use:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b64 s[0:1], -1
+; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, 16
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10PLUS-NEXT: s_or_b32 s2, s2, s3
+; GFX10PLUS-NEXT: s_and_b32 s3, s5, 0xffff
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
+ %or = or <3 x i16> %src0, %not.src1
+ %cast.0 = bitcast <3 x i16> %or to i48
+ %cast.1 = bitcast <3 x i16> %not.src1 to i48
+ %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
+ %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
+ ret { i48, i48 } %insert.1
+}
+
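+; Note: the last lane of the xor mask is -11 rather than -1, carried over
+; from the original commented-out test (visible as v_xor_b32_e32 ..., -11 in
+; the checks below).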
+define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
+; GFX6-LABEL: v_orn2_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_xor_b32_e32 v3, -1, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_xor_b32_e32 v4, 0xfff5, v4
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v1, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_orn2_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, -11, v3
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_orn2_v3i16:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -11, v3
+; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
+ %or = or <3 x i16> %src0, %not.src1
+ ret <3 x i16> %or
+}
define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v4i16:
ret <4 x i32> %cast
}
-; FIXME: i48 broken because i48 add broken
-; define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
-; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
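+; i48 saturating add: GFX9+ shift the operands left by 16 and saturate as
+; i64, then arithmetic-shift-right by 16 to restore the i48 result; GFX6/GFX8
+; sign-extend the 48-bit halves with v_bfe_i32 instead.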
+define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
+; GFX6-LABEL: v_saddsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
+; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5
+; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_saddsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
+; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5
+; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_saddsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_saddsat_i48:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_saddsat_i48:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
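+; Scalar i48 saturating add; the saturation select is still done in VGPRs
+; and the result copied back with v_readfirstlane_b32.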
+define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: s_saddsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_add_u32 s4, s0, s2
+; GFX6-NEXT: s_addc_u32 s5, s1, s3
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_ashr_i32 s3, s7, 31
+; GFX6-NEXT: s_ashr_i32 s2, s7, 15
+; GFX6-NEXT: s_add_u32 s3, s3, 0xffff8000
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_saddsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s4, s0, s2
+; GFX8-NEXT: s_addc_u32 s5, s1, s3
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_ashr_i32 s3, s7, 31
+; GFX8-NEXT: s_ashr_i32 s2, s7, 15
+; GFX8-NEXT: s_add_u32 s3, s3, 0xffff8000
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_saddsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX9-NEXT: s_add_u32 s4, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_addc_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_saddsat_i48:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10-NEXT: s_add_u32 s4, s0, s2
+; GFX10-NEXT: s_addc_u32 s5, s1, s3
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_xor_b32 s2, s2, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_saddsat_i48:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX11-NEXT: s_add_u32 s4, s0, s2
+; GFX11-NEXT: s_addc_u32 s5, s1, s3
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: s_ashr_i32 s0, s5, 31
+; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_xor_b32 s2, s2, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
-; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
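+; Mixed operands: SGPR lhs, VGPR rhs.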
+define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
+; GFX6-LABEL: saddsat_i48_sv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: saddsat_i48_sv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: saddsat_i48_sv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: saddsat_i48_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: saddsat_i48_sv:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
-; define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
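+; Mixed operands with the roles swapped: VGPR lhs, SGPR rhs.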
+define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: saddsat_i48_vs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: saddsat_i48_vs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: saddsat_i48_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: saddsat_i48_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: saddsat_i48_vs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_saddsat_i64:
ret <4 x i32> %cast
}
-; FIXME: i48 broken because i48 add broken
-; define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
-; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
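+; i48 saturating subtract mirrors the add lowering, using a borrow chain
+; (v_subb/v_sub_co_ci) and the same shift-based i64 widening on GFX9+.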
+define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
+; GFX6-LABEL: v_ssubsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
+; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5
+; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_ssubsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
+; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5
+; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_ssubsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_ssubsat_i48:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_ssubsat_i48:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
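+; Scalar i48 saturating subtract; as with s_saddsat_i48, the final select
+; goes through VGPRs and v_readfirstlane_b32.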
+define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: s_ssubsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sub_u32 s4, s0, s2
+; GFX6-NEXT: s_subb_u32 s5, s1, s3
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: s_ashr_i32 s3, s7, 31
+; GFX6-NEXT: s_ashr_i32 s2, s7, 15
+; GFX6-NEXT: s_add_u32 s3, s3, 0xffff8000
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_ssubsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sub_u32 s4, s0, s2
+; GFX8-NEXT: s_subb_u32 s5, s1, s3
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: s_ashr_i32 s3, s7, 31
+; GFX8-NEXT: s_ashr_i32 s2, s7, 15
+; GFX8-NEXT: s_add_u32 s3, s3, 0xffff8000
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_ssubsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX9-NEXT: s_sub_u32 s4, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_subb_u32 s5, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_ssubsat_i48:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10-NEXT: s_sub_u32 s4, s0, s2
+; GFX10-NEXT: s_subb_u32 s5, s1, s3
+; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_xor_b32 s2, s2, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_ssubsat_i48:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX11-NEXT: s_sub_u32 s4, s0, s2
+; GFX11-NEXT: s_subb_u32 s5, s1, s3
+; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[4:5], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: s_ashr_i32 s0, s5, 31
+; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_xor_b32 s2, s2, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
-; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
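+; Mixed-operand variant: scalar (inreg) LHS, vector RHS.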
+define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
+; GFX6-LABEL: ssubsat_i48_sv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: ssubsat_i48_sv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: ssubsat_i48_sv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: ssubsat_i48_sv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: ssubsat_i48_sv:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
-; define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
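+; Mixed-operand variant: vector LHS, scalar (inreg) RHS.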
+define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: ssubsat_i48_vs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
+; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v3
+; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: ssubsat_i48_vs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
+; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff8000
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v3
+; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: ssubsat_i48_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: ssubsat_i48_vs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
+; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: ssubsat_i48_vs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
+; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
+; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_ssubsat_i64:
ret <4 x i32> %cast
}
-; FIXME: i48 broken because i48 add broken
-; define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
-; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
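+; i48 uadd.sat is widened to i64 and clamped to all-ones when the add
+; overflows (sum u< rhs). TODO: verify the GFX6 lowering, which operates on
+; the unshifted inputs and masks the high halves only for the compare.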
+define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
+; GFX6-LABEL: v_uaddsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_uaddsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uaddsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_uaddsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
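+; Scalar version; GFX6 masks the compare operands to 48 bits rather than
+; pre-shifting.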
+define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: s_uaddsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s4, -1
+; GFX6-NEXT: s_add_u32 s0, s0, s2
+; GFX6-NEXT: s_mov_b32 s5, 0xffff
+; GFX6-NEXT: s_addc_u32 s1, s1, s3
+; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_uaddsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX8-NEXT: s_add_u32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_uaddsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX9-NEXT: s_add_u32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_addc_u32 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_uaddsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2
+; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
-; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
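+; Mixed-operand variant: scalar (inreg) LHS, vector RHS.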
+define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
+; GFX6-LABEL: uaddsat_i48_sv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: uaddsat_i48_sv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: uaddsat_i48_sv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: uaddsat_i48_sv:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
-; define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
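+; Mixed-operand variant: vector LHS, scalar (inreg) RHS.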
+define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: uaddsat_i48_vs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v1, v2, vcc
+; GFX6-NEXT: s_mov_b32 s3, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: uaddsat_i48_vs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: uaddsat_i48_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: uaddsat_i48_vs:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_uaddsat_i64:
ret <4 x i32> %cast
}
-; FIXME: i48 broken because i48 add broken
-; define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
-; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
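+; i48 usub.sat is widened to i64 and clamped to zero on underflow
+; (lhs u< rhs). TODO: verify the unshifted GFX6 lowering.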
+define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
+; GFX6-LABEL: v_usubsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_usubsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_usubsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_usubsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
+; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
-; ret i48 %result
-; }
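+; Scalar version; GFX6 masks both operands to 48 bits rather than
+; pre-shifting.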
+define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: s_usubsat_i48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_sub_u32 s4, s0, s2
+; GFX6-NEXT: s_mov_b32 s7, 0xffff
+; GFX6-NEXT: s_subb_u32 s5, s1, s3
+; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_usubsat_i48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_sub_u32 s4, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_subb_u32 s5, s1, s3
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_usubsat_i48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_sub_u32 s4, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_subb_u32 s5, s1, s3
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_usubsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
+ ret i48 %result
+}
-; define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
-; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
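+; Mixed-operand variant: scalar (inreg) LHS, vector RHS.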
+define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
+; GFX6-LABEL: usubsat_i48_sv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0
+; GFX6-NEXT: s_mov_b32 s3, 0xffff
+; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: usubsat_i48_sv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: usubsat_i48_sv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: usubsat_i48_sv:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
-; define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
-; %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
-; %ext.result = zext i48 %result to i64
-; %cast = bitcast i64 %ext.result to <2 x float>
-; ret <2 x float> %cast
-; }
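+; Mixed-operand variant: vector LHS, scalar (inreg) RHS.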
+define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
+; GFX6-LABEL: usubsat_i48_vs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0
+; GFX6-NEXT: s_mov_b32 s3, 0xffff
+; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: usubsat_i48_vs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: usubsat_i48_vs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: usubsat_i48_vs:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
+; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
+ %ext.result = zext i48 %result to i64
+ %cast = bitcast i64 %ext.result to <2 x float>
+ ret <2 x float> %cast
+}
define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64: