; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_cselect_b32 s7, s16, s15
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v4, 16
; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_s:
; GFX10-NEXT: s_cmp_eq_u32 s7, 1
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: v_mov_b32_e32 v10, 0
+; GFX10-NEXT: v_mov_b32_e32 v10, 16
+; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cselect_b32 s0, s9, s8
; GFX10-NEXT: s_cmp_eq_u32 s7, 2
; GFX10-NEXT: v_mov_b32_e32 v5, s5
; GFX10-NEXT: v_mov_b32_e32 v6, s6
; GFX10-NEXT: v_mov_b32_e32 v7, s7
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_s:
; GFX10-NEXT: s_lshl_b32 s8, s8, s3
; GFX10-NEXT: s_lshl_b32 s3, s9, s3
; GFX10-NEXT: s_not_b32 s8, s8
+; GFX10-NEXT: v_mov_b32_e32 v12, 16
; GFX10-NEXT: v_mov_b32_e32 v13, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s6
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
-; GFX10-NEXT: v_and_or_b32 v12, v0, s8, s3
+; GFX10-NEXT: v_and_or_b32 v14, v0, s8, s3
; GFX10-NEXT: v_cmp_eq_u32_e64 s3, s7, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v12, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v12, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v12, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v12, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v12, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v12, s6
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v12, s2
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v14, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v14, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v14, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v7, v14, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v14, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v9, v14, s2
; GFX10-NEXT: global_store_dwordx4 v[10:11], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[12:13], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_s:
; GFX10-NEXT: s_cmp_eq_u32 s0, 1
; GFX10-NEXT: v_and_b32_e32 v8, s3, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v10, 16
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cselect_b32 s1, s9, s8
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: s_lshl_b32 s3, s3, s2
; GFX10-NEXT: s_andn2_b32 s1, s1, s3
-; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1
+; GFX10-NEXT: v_lshl_or_b32 v12, v8, s2, s1
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 4
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 5
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 6
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 7
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_v:
; GFX10-LABEL: insertelement_s_v16i16_s_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_mov_b32 s5, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: s_and_b32 s6, s4, s5
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
-; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8
+; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v5, s13
; GFX10-NEXT: v_mov_b32_e32 v6, s14
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_mov_b32_e32 v10, 16
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT: v_mov_b32_e32 v11, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_v:
; GFX10-LABEL: insertelement_s_v16i16_v_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: s_mov_b32 s4, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v12, 0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10
-; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10
+; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: v_mov_b32_e32 v3, s11
-; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8
+; GFX10-NEXT: v_and_or_b32 v13, v10, v9, v8
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v5, s13
; GFX10-NEXT: v_mov_b32_e32 v6, s14
; GFX10-NEXT: v_mov_b32_e32 v7, s15
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: v_mov_b32_e32 v9, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v11, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v11, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s5
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_mov_b32_e32 v10, 16
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT: v_mov_b32_e32 v11, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5
; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_v:
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: s_mov_b32 s5, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v14, 0
+; GFX10-NEXT: v_mov_b32_e32 v13, 16
; GFX10-NEXT: s_and_b32 s6, s2, s5
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0
+; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5
-; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2
+; GFX10-NEXT: v_and_or_b32 v15, v1, v11, v2
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5
; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_s:
; GFX10-NEXT: s_not_b32 s7, s7
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_mov_b32_e32 v12, 0
+; GFX10-NEXT: v_mov_b32_e32 v13, 16
; GFX10-NEXT: v_mov_b32_e32 v14, 0
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v9, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v10, s5
-; GFX10-NEXT: v_and_or_b32 v13, v0, s7, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v13, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v13, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v13, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v13, s5
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_and_or_b32 v15, v0, s7, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5
; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX9-NEXT: s_mov_b64 s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, 16
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_v:
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v14, 16
; GFX10-NEXT: v_mov_b32_e32 v15, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5
-; GFX10-NEXT: v_and_or_b32 v14, v1, v3, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v14, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v14, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v14, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v14, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v14, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v14, s5
-; GFX10-NEXT: s_mov_b64 s[0:1], 16
+; GFX10-NEXT: v_and_or_b32 v16, v1, v3, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5
; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
-; GFX10-NEXT: global_store_dwordx4 v15, v[4:7], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off
; GFX10-NEXT: s_endpgm
%vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
%insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: global_load_dwordx4 v[7:10], v[0:1], off
+; CHECK-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16
-; CHECK-NEXT: ; kill: def $vgpr7_vgpr8_vgpr9_vgpr10 killed $vgpr7_vgpr8_vgpr9_vgpr10 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 killed $exec
+; CHECK-NEXT: ; kill: def $vgpr6_vgpr7_vgpr8_vgpr9 killed $vgpr6_vgpr7_vgpr8_vgpr9 def $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v14, v3
-; CHECK-NEXT: v_mov_b32_e32 v13, v2
-; CHECK-NEXT: v_mov_b32_e32 v12, v1
-; CHECK-NEXT: v_mov_b32_e32 v11, v0
+; CHECK-NEXT: v_mov_b32_e32 v13, v3
+; CHECK-NEXT: v_mov_b32_e32 v12, v2
+; CHECK-NEXT: v_mov_b32_e32 v11, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v0
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: global_load_dwordx4 v[18:21], v[0:1], off
; CHECK-NEXT: v_mov_b32_e32 v24, v2
; CHECK-NEXT: v_mov_b32_e32 v23, v1
; CHECK-NEXT: v_mov_b32_e32 v22, v0
-; CHECK-NEXT: v_mov_b32_e32 v2, v7
-; CHECK-NEXT: v_mov_b32_e32 v3, v8
-; CHECK-NEXT: v_mov_b32_e32 v0, v9
-; CHECK-NEXT: v_mov_b32_e32 v1, v10
-; CHECK-NEXT: v_mov_b32_e32 v5, v11
-; CHECK-NEXT: v_mov_b32_e32 v6, v12
-; CHECK-NEXT: v_mov_b32_e32 v12, v13
-; CHECK-NEXT: v_mov_b32_e32 v13, v14
-; CHECK-NEXT: v_mov_b32_e32 v8, v18
-; CHECK-NEXT: v_mov_b32_e32 v9, v19
-; CHECK-NEXT: v_mov_b32_e32 v16, v20
-; CHECK-NEXT: v_mov_b32_e32 v17, v21
-; CHECK-NEXT: v_mov_b32_e32 v14, v22
-; CHECK-NEXT: v_mov_b32_e32 v15, v23
+; CHECK-NEXT: v_mov_b32_e32 v4, v6
+; CHECK-NEXT: v_mov_b32_e32 v5, v7
+; CHECK-NEXT: v_mov_b32_e32 v2, v8
+; CHECK-NEXT: v_mov_b32_e32 v3, v9
+; CHECK-NEXT: v_mov_b32_e32 v0, v10
+; CHECK-NEXT: v_mov_b32_e32 v1, v11
+; CHECK-NEXT: v_mov_b32_e32 v8, v12
+; CHECK-NEXT: v_mov_b32_e32 v9, v13
+; CHECK-NEXT: v_mov_b32_e32 v16, v18
+; CHECK-NEXT: v_mov_b32_e32 v17, v19
+; CHECK-NEXT: v_mov_b32_e32 v14, v20
+; CHECK-NEXT: v_mov_b32_e32 v15, v21
+; CHECK-NEXT: v_mov_b32_e32 v12, v22
+; CHECK-NEXT: v_mov_b32_e32 v13, v23
; CHECK-NEXT: v_mov_b32_e32 v10, v24
; CHECK-NEXT: v_mov_b32_e32 v11, v25
+; CHECK-NEXT: v_mov_b32_e32 v6, v4
+; CHECK-NEXT: v_mov_b32_e32 v4, v5
+; CHECK-NEXT: v_mov_b32_e32 v7, v16
+; CHECK-NEXT: v_mov_b32_e32 v5, v17
+; CHECK-NEXT: v_add_co_u32 v6, s6, v6, v7
+; CHECK-NEXT: v_add_co_ci_u32_e64 v4, s6, v4, v5, s6
+; CHECK-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v7, v4
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v3
-; CHECK-NEXT: v_mov_b32_e32 v7, v8
-; CHECK-NEXT: v_mov_b32_e32 v3, v9
-; CHECK-NEXT: v_add_co_u32 v7, s6, v4, v7
+; CHECK-NEXT: v_mov_b32_e32 v5, v14
+; CHECK-NEXT: v_mov_b32_e32 v3, v15
+; CHECK-NEXT: v_add_co_u32 v4, s6, v4, v5
; CHECK-NEXT: v_add_co_ci_u32_e64 v2, s6, v2, v3, s6
-; CHECK-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v8, v2
+; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: v_mov_b32_e32 v0, v1
-; CHECK-NEXT: v_mov_b32_e32 v3, v16
-; CHECK-NEXT: v_mov_b32_e32 v1, v17
-; CHECK-NEXT: v_add_co_u32 v3, s6, v2, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, v12
+; CHECK-NEXT: v_mov_b32_e32 v1, v13
+; CHECK-NEXT: v_add_co_u32 v2, s6, v2, v3
; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v1, s6
-; CHECK-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
-; CHECK-NEXT: v_mov_b32_e32 v1, v5
-; CHECK-NEXT: v_mov_b32_e32 v0, v6
-; CHECK-NEXT: v_mov_b32_e32 v5, v14
-; CHECK-NEXT: v_mov_b32_e32 v2, v15
-; CHECK-NEXT: v_add_co_u32 v1, s6, v1, v5
-; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v2, s6
-; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, v12
-; CHECK-NEXT: v_mov_b32_e32 v0, v13
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v3, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, v8
+; CHECK-NEXT: v_mov_b32_e32 v1, v9
; CHECK-NEXT: v_mov_b32_e32 v9, v10
-; CHECK-NEXT: v_mov_b32_e32 v6, v11
-; CHECK-NEXT: v_add_co_u32 v5, s6, v5, v9
-; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v6, s6
-; CHECK-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v6, v0
-; CHECK-NEXT: ; kill: def $vgpr7_vgpr8 killed $vgpr7_vgpr8 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v10, v4
-; CHECK-NEXT: v_mov_b32_e32 v9, v3
-; CHECK-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v3, v5
-; CHECK-NEXT: v_mov_b32_e32 v4, v6
-; CHECK-NEXT: v_mov_b32_e32 v6, s5
-; CHECK-NEXT: v_mov_b32_e32 v5, s4
-; CHECK-NEXT: global_store_dwordx4 v[5:6], v[7:10], off
+; CHECK-NEXT: v_mov_b32_e32 v8, v11
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, v9
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s6, v1, v8, s6
+; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v1, v8
+; CHECK-NEXT: ; kill: def $vgpr6_vgpr7 killed $vgpr6_vgpr7 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v9, v5
+; CHECK-NEXT: v_mov_b32_e32 v8, v4
+; CHECK-NEXT: ; kill: def $vgpr2_vgpr3 killed $vgpr2_vgpr3 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
; CHECK-NEXT: s_mov_b64 s[4:5], 16
-; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; CHECK-NEXT: s_endpgm
entry:
%load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32