// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand;
- if (TopCand.Reason == BotCand.Reason) {
- Cand = BotCand;
- GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
- TopCand.Reason = NoCand;
- GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
- } else {
- TopCand.Reason = TopReason;
- }
- } else {
- if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = BotCand;
- } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = BotCand;
- } else {
- if (BotCand.Reason > TopCand.Reason) {
- Cand = TopCand;
- } else {
- Cand = BotCand;
- }
- }
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
; GFX7-LABEL: v_bswap_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i16:
; GFX7-LABEL: v_bswap_i16_zext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_bswap_i16_sext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX8-NEXT: v_log_f16_e32 v2, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
-; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16:
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_exp_f16_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT: v_exp_f16_e32 v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v0, v0
-; GFX6-NEXT: v_log_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
+; GFX6-NEXT: v_exp_f32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX8-NEXT: v_log_f16_e32 v2, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
-; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v2, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f16_e32 v1, v2
+; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, 0x80008000
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
-; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_log_f16_e32 v0, v0
+; GFX8-NEXT: v_log_f16_e32 v2, v0
+; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
-; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_exp_f16_e32 v1, v2
+; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v0, v0
-; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_exp_f16_e32 v1, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v16, s7
-; MOVREL-NEXT: v_mov_b32_e32 v8, v0
; MOVREL-NEXT: v_mov_b32_e32 v14, s5
+; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v13, s4
; MOVREL-NEXT: v_mov_b32_e32 v15, s6
-; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v11, s2
; MOVREL-NEXT: v_mov_b32_e32 v10, s1
; MOVREL-NEXT: v_mov_b32_e32 v9, s0
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v8
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
-; MOVREL-NEXT: v_mov_b32_e32 v1, v10
-; MOVREL-NEXT: v_mov_b32_e32 v2, v11
-; MOVREL-NEXT: v_mov_b32_e32 v3, v12
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v0
+; MOVREL-NEXT: v_mov_b32_e32 v1, v9
+; MOVREL-NEXT: v_mov_b32_e32 v2, v10
+; MOVREL-NEXT: v_mov_b32_e32 v3, v11
+; MOVREL-NEXT: v_mov_b32_e32 v4, v12
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v4, v13
-; MOVREL-NEXT: v_mov_b32_e32 v5, v14
-; MOVREL-NEXT: v_mov_b32_e32 v6, v15
-; MOVREL-NEXT: v_mov_b32_e32 v7, v16
-; MOVREL-NEXT: v_movreld_b32_e32 v0, s10
+; MOVREL-NEXT: v_mov_b32_e32 v5, v13
+; MOVREL-NEXT: v_mov_b32_e32 v6, v14
+; MOVREL-NEXT: v_mov_b32_e32 v7, v15
+; MOVREL-NEXT: v_mov_b32_e32 v8, v16
+; MOVREL-NEXT: v_movreld_b32_e32 v1, s10
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB3_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: v_mov_b32_e32 v0, v1
+; MOVREL-NEXT: v_mov_b32_e32 v1, v2
+; MOVREL-NEXT: v_mov_b32_e32 v2, v3
+; MOVREL-NEXT: v_mov_b32_e32 v3, v4
+; MOVREL-NEXT: v_mov_b32_e32 v4, v5
+; MOVREL-NEXT: v_mov_b32_e32 v5, v6
+; MOVREL-NEXT: v_mov_b32_e32 v6, v7
+; MOVREL-NEXT: v_mov_b32_e32 v7, v8
; MOVREL-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <8 x float> %vec, float %val, i32 %idx
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v17, s7
-; MOVREL-NEXT: v_mov_b32_e32 v8, v0
-; MOVREL-NEXT: v_mov_b32_e32 v9, v1
; MOVREL-NEXT: v_mov_b32_e32 v15, s5
-; MOVREL-NEXT: v_mov_b32_e32 v16, s6
-; MOVREL-NEXT: v_mov_b32_e32 v14, s4
; MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; MOVREL-NEXT: v_mov_b32_e32 v14, s4
+; MOVREL-NEXT: v_mov_b32_e32 v16, s6
; MOVREL-NEXT: v_mov_b32_e32 v12, s2
; MOVREL-NEXT: v_mov_b32_e32 v11, s1
; MOVREL-NEXT: v_mov_b32_e32 v10, s0
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
-; MOVREL-NEXT: v_mov_b32_e32 v0, v10
-; MOVREL-NEXT: v_mov_b32_e32 v1, v11
-; MOVREL-NEXT: v_mov_b32_e32 v2, v12
-; MOVREL-NEXT: v_mov_b32_e32 v3, v13
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT: v_mov_b32_e32 v2, v10
+; MOVREL-NEXT: v_mov_b32_e32 v3, v11
+; MOVREL-NEXT: v_mov_b32_e32 v4, v12
+; MOVREL-NEXT: v_mov_b32_e32 v5, v13
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v4, v14
-; MOVREL-NEXT: v_mov_b32_e32 v5, v15
-; MOVREL-NEXT: v_mov_b32_e32 v6, v16
-; MOVREL-NEXT: v_mov_b32_e32 v7, v17
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT: v_mov_b32_e32 v6, v14
+; MOVREL-NEXT: v_mov_b32_e32 v7, v15
+; MOVREL-NEXT: v_mov_b32_e32 v8, v16
+; MOVREL-NEXT: v_mov_b32_e32 v9, v17
+; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB6_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: v_mov_b32_e32 v0, v2
+; MOVREL-NEXT: v_mov_b32_e32 v1, v3
+; MOVREL-NEXT: v_mov_b32_e32 v2, v4
+; MOVREL-NEXT: v_mov_b32_e32 v3, v5
+; MOVREL-NEXT: v_mov_b32_e32 v4, v6
+; MOVREL-NEXT: v_mov_b32_e32 v5, v7
+; MOVREL-NEXT: v_mov_b32_e32 v6, v8
+; MOVREL-NEXT: v_mov_b32_e32 v7, v9
; MOVREL-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <8 x float> %vec, float %val, i32 %idx
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: s_add_u32 s2, 4, 4
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v2, v2
+; CHECK-NEXT: ds_read_b32 v2, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v4
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64:
; GFX9: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64:
; GFX9: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v4
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX9: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX9: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s3
-; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_mov_b32_e32 v5, 0
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s3
-; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_mov_b32_e32 v5, 0
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v4
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_atomic_inc v4, v[2:3], v4, off glc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v4, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v4, off glc
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s3
-; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_mov_b32_e32 v5, 0
+; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[4:5], v[2:3], off glc
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v4
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v4
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: flat_store_dword v[0:1], v4
+; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32* %ptr, i32 %id
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32* %ptr, i32 %id
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s3
-; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_mov_b32_e32 v5, 0
+; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x2c ; encoding: [0x00,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11]
; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT: v_mov_b32_e32 v0, s4
-; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
+; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, s5
+; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT: v_mov_b32_e32 v0, s4
-; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
+; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT: v_mov_b32_e32 v0, s5
+; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT: v_mov_b32_e32 v0, s4
-; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
+; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, s5
+; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT: v_mov_b32_e32 v0, s4
-; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
+; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT: v_mov_b32_e32 v0, s5
+; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906-LABEL: v_udot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT: v_mov_b32_e32 v0, s4
-; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
+; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, s5
+; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT: v_mov_b32_e32 v0, s4
-; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
+; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT: v_mov_b32_e32 v0, s5
+; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b:
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX906-NEXT: v_mov_b32_e32 v0, s4
-; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
+; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX906-NEXT: v_mov_b32_e32 v0, s5
+; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
-; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
-; GFX908-NEXT: v_mov_b32_e32 v0, s4
-; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
+; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
+; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
+; GFX908-NEXT: v_mov_b32_e32 v0, s5
+; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: s_nop 1
+; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT: s_add_u32 s0, s2, 0x3ffc
+; GFX6-NEXT: s_add_u32 s4, s2, 0x3ffc
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; GFX6-NEXT: s_addc_u32 s1, s3, 0
-; GFX6-NEXT: s_mov_b32 s2, 0
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_addc_u32 s5, s3, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT: s_add_u32 s0, s2, 0x3ffc
+; GFX7-NEXT: s_add_u32 s4, s2, 0x3ffc
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
-; GFX7-NEXT: s_addc_u32 s1, s3, 0
-; GFX7-NEXT: s_mov_b32 s2, 0
-; GFX7-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_addc_u32 s5, s3, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, 2
+; GFX6-NEXT: v_mov_b32_e32 v2, 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
+; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v0, 2
+; GFX7-NEXT: v_mov_b32_e32 v2, 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset
%result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst
;
; GFX10-LABEL: add3_uniform_vgpr:
; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
-; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
-; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
%a1 = fadd float %a, 1.0
%b2 = fadd float %b, 2.0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0x392fa
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s5, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s5
+; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
+; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0xa410
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s5, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s5
+; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
+; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; IR-LABEL: @select_mul_rhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
-;
%select = select i1 %cond, i32 5, i32 8
%op = mul i32 %select, 1000
ret i32 %op
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
; IR-NEXT: store i16 [[OP]], i16 addrspace(1)* undef
; IR-NEXT: ret void
-
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
-;
%select = select i1 %cond, i16 5, i16 8
%op = add i16 %select, 123
store i16 %op, i16 addrspace(1)* undef
; IR-LABEL: @select_add_trunc_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
; IR-NEXT: ret i16 [[OP]]
-;
%select = select i1 %cond, i32 5, i32 8
%trunc = trunc i32 %select to i16
%op = add i16 %trunc, 42
; IR-LABEL: @select_add_zext_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
; IR-NEXT: ret i32 [[OP]]
-
; GCN-LABEL: select_add_zext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5
+; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB0_2:
; GFX1064-NEXT: v_nop
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s1, s0, s1
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB1_2:
; GFX1064-NEXT: v_nop
-; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB8_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
-; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5
+; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB8_2:
; GFX1064-NEXT: v_nop
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
+; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB9_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s1, s0, s1
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB9_2:
; GFX1064-NEXT: v_nop
-; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s0, 0xff00ff
-; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s3, 0x33333333
-; SI-NEXT: s_mov_b32 s6, 0xcccccccc
-; SI-NEXT: s_mov_b32 s8, 0x55555555
-; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa
+; SI-NEXT: s_mov_b32 s6, 0xff00ff
+; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f
+; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0
+; SI-NEXT: s_mov_b32 s10, 0x33333333
+; SI-NEXT: s_mov_b32 s11, 0xcccccccc
+; SI-NEXT: s_mov_b32 s0, 0x55555555
+; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_bfi_b32 v2, s0, v0, v2
-; SI-NEXT: v_bfi_b32 v4, s0, v1, v3
-; SI-NEXT: v_and_b32_e32 v1, s1, v2
-; SI-NEXT: v_and_b32_e32 v0, s1, v4
-; SI-NEXT: v_and_b32_e32 v3, s2, v2
-; SI-NEXT: v_and_b32_e32 v2, s2, v4
+; SI-NEXT: v_bfi_b32 v2, s6, v0, v2
+; SI-NEXT: v_bfi_b32 v4, s6, v1, v3
+; SI-NEXT: v_and_b32_e32 v1, s8, v2
+; SI-NEXT: v_and_b32_e32 v0, s8, v4
+; SI-NEXT: v_and_b32_e32 v3, s9, v2
+; SI-NEXT: v_and_b32_e32 v2, s9, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s3, v3
-; SI-NEXT: v_and_b32_e32 v0, s3, v2
-; SI-NEXT: v_and_b32_e32 v3, s6, v3
-; SI-NEXT: v_and_b32_e32 v2, s6, v2
+; SI-NEXT: v_and_b32_e32 v1, s10, v3
+; SI-NEXT: v_and_b32_e32 v0, s10, v2
+; SI-NEXT: v_and_b32_e32 v3, s11, v3
+; SI-NEXT: v_and_b32_e32 v2, s11, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s8, v3
-; SI-NEXT: v_and_b32_e32 v0, s8, v2
-; SI-NEXT: v_and_b32_e32 v3, s9, v3
-; SI-NEXT: v_and_b32_e32 v2, s9, v2
+; SI-NEXT: v_and_b32_e32 v1, s0, v3
+; SI-NEXT: v_and_b32_e32 v0, s0, v2
+; SI-NEXT: v_and_b32_e32 v3, s1, v3
+; SI-NEXT: v_and_b32_e32 v2, s1, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT: s_mov_b32 s2, 0x10203
-; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s6, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s6, 0x10203
+; FLAT-NEXT: s_mov_b32 s2, 0x33333333
+; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0x33333333
-; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc
-; FLAT-NEXT: s_mov_b32 s8, 0x55555555
-; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
+; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
+; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v2, 0, v0, s2
-; FLAT-NEXT: v_perm_b32 v4, 0, v1, s2
-; FLAT-NEXT: v_and_b32_e32 v1, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s3, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s6, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s6, v4
+; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6
+; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6
+; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
+; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
+; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
+; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT: s_mov_b32 s6, -1
+; FLAT-NEXT: s_mov_b32 s0, 0x55555555
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
+; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
+; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
+; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
+; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
+; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_or_b32_e32 v1, v3, v1
; FLAT-NEXT: v_or_b32_e32 v0, v2, v0
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s0, 0xff00ff
-; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s3, 0x33333333
-; SI-NEXT: s_mov_b32 s8, 0xcccccccc
-; SI-NEXT: s_mov_b32 s9, 0x55555555
-; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa
+; SI-NEXT: s_mov_b32 s8, 0xff00ff
+; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f
+; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0
+; SI-NEXT: s_mov_b32 s11, 0x33333333
+; SI-NEXT: s_mov_b32 s12, 0xcccccccc
+; SI-NEXT: s_mov_b32 s13, 0x55555555
+; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT: v_bfi_b32 v2, s0, v2, v4
-; SI-NEXT: v_bfi_b32 v4, s0, v3, v5
-; SI-NEXT: v_bfi_b32 v6, s0, v0, v6
-; SI-NEXT: v_bfi_b32 v8, s0, v1, v7
-; SI-NEXT: v_and_b32_e32 v1, s1, v2
-; SI-NEXT: v_and_b32_e32 v0, s1, v4
-; SI-NEXT: v_and_b32_e32 v3, s2, v2
-; SI-NEXT: v_and_b32_e32 v2, s2, v4
-; SI-NEXT: v_and_b32_e32 v5, s1, v6
-; SI-NEXT: v_and_b32_e32 v4, s1, v8
-; SI-NEXT: v_and_b32_e32 v7, s2, v6
-; SI-NEXT: v_and_b32_e32 v6, s2, v8
+; SI-NEXT: v_bfi_b32 v2, s8, v2, v4
+; SI-NEXT: v_bfi_b32 v4, s8, v3, v5
+; SI-NEXT: v_bfi_b32 v6, s8, v0, v6
+; SI-NEXT: v_bfi_b32 v8, s8, v1, v7
+; SI-NEXT: v_and_b32_e32 v1, s9, v2
+; SI-NEXT: v_and_b32_e32 v0, s9, v4
+; SI-NEXT: v_and_b32_e32 v3, s10, v2
+; SI-NEXT: v_and_b32_e32 v2, s10, v4
+; SI-NEXT: v_and_b32_e32 v5, s9, v6
+; SI-NEXT: v_and_b32_e32 v4, s9, v8
+; SI-NEXT: v_and_b32_e32 v7, s10, v6
+; SI-NEXT: v_and_b32_e32 v6, s10, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s3, v3
-; SI-NEXT: v_and_b32_e32 v0, s3, v2
-; SI-NEXT: v_and_b32_e32 v5, s3, v7
-; SI-NEXT: v_and_b32_e32 v4, s3, v6
-; SI-NEXT: v_and_b32_e32 v3, s8, v3
-; SI-NEXT: v_and_b32_e32 v2, s8, v2
-; SI-NEXT: v_and_b32_e32 v7, s8, v7
-; SI-NEXT: v_and_b32_e32 v6, s8, v6
+; SI-NEXT: v_and_b32_e32 v1, s11, v3
+; SI-NEXT: v_and_b32_e32 v0, s11, v2
+; SI-NEXT: v_and_b32_e32 v5, s11, v7
+; SI-NEXT: v_and_b32_e32 v4, s11, v6
+; SI-NEXT: v_and_b32_e32 v3, s12, v3
+; SI-NEXT: v_and_b32_e32 v2, s12, v2
+; SI-NEXT: v_and_b32_e32 v7, s12, v7
+; SI-NEXT: v_and_b32_e32 v6, s12, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s9, v3
-; SI-NEXT: v_and_b32_e32 v0, s9, v2
-; SI-NEXT: v_and_b32_e32 v5, s9, v7
-; SI-NEXT: v_and_b32_e32 v4, s9, v6
-; SI-NEXT: v_and_b32_e32 v3, s10, v3
-; SI-NEXT: v_and_b32_e32 v2, s10, v2
-; SI-NEXT: v_and_b32_e32 v7, s10, v7
-; SI-NEXT: v_and_b32_e32 v6, s10, v6
+; SI-NEXT: v_and_b32_e32 v1, s13, v3
+; SI-NEXT: v_and_b32_e32 v0, s13, v2
+; SI-NEXT: v_and_b32_e32 v5, s13, v7
+; SI-NEXT: v_and_b32_e32 v4, s13, v6
+; SI-NEXT: v_and_b32_e32 v3, s14, v3
+; SI-NEXT: v_and_b32_e32 v2, s14, v2
+; SI-NEXT: v_and_b32_e32 v7, s14, v7
+; SI-NEXT: v_and_b32_e32 v6, s14, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; FLAT-NEXT: s_mov_b32 s2, 0x10203
-; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s10, 0x10203
+; FLAT-NEXT: s_mov_b32 s2, 0x33333333
+; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0x33333333
-; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc
-; FLAT-NEXT: s_mov_b32 s9, 0x55555555
-; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa
+; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
+; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s8, 0x55555555
+; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v6, 0, v0, s2
-; FLAT-NEXT: v_perm_b32 v4, 0, v3, s2
-; FLAT-NEXT: v_perm_b32 v2, 0, v2, s2
-; FLAT-NEXT: v_perm_b32 v8, 0, v1, s2
-; FLAT-NEXT: v_and_b32_e32 v1, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s3, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s8, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s8, v4
-; FLAT-NEXT: v_and_b32_e32 v5, s3, v6
-; FLAT-NEXT: v_and_b32_e32 v4, s3, v8
-; FLAT-NEXT: v_and_b32_e32 v7, s8, v6
-; FLAT-NEXT: v_and_b32_e32 v6, s8, v8
+; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10
+; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10
+; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10
+; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10
+; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
+; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
+; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
+; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
+; FLAT-NEXT: v_and_b32_e32 v5, s0, v6
+; FLAT-NEXT: v_and_b32_e32 v4, s0, v8
+; FLAT-NEXT: v_and_b32_e32 v7, s1, v6
+; FLAT-NEXT: v_and_b32_e32 v6, s1, v8
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s0, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s0, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s1, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s1, v6
+; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
+; FLAT-NEXT: v_and_b32_e32 v5, s2, v7
+; FLAT-NEXT: v_and_b32_e32 v4, s2, v6
+; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
+; FLAT-NEXT: v_and_b32_e32 v7, s3, v7
+; FLAT-NEXT: v_and_b32_e32 v6, s3, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s9, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s9, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s9, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s9, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s10, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s10, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s10, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s10, v6
+; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
+; FLAT-NEXT: v_and_b32_e32 v5, s8, v7
+; FLAT-NEXT: v_and_b32_e32 v4, s8, v6
+; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
+; FLAT-NEXT: v_and_b32_e32 v7, s9, v7
+; FLAT-NEXT: v_and_b32_e32 v6, s9, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
-; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v3, v4
+; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
+; GFX9-NEXT: v_add_u32_e32 v6, v3, v4
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v3, v3, v0
+; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT: v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_lo_u32 v4, v2, v1
-; GFX9-NEXT: v_mul_hi_u32 v5, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v6, 0, v4
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_mul_hi_u32 v4, v4, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v2, v4
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4
+; GFX9-NEXT: v_mul_lo_u32 v3, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, v2, v1
+; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT: v_mul_hi_u32 v3, v3, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_add_u32_e32 v5, v2, v3
+; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v0
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v1
-; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
+; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; GFX9-NEXT: v_sub_u32_e32 v0, v4, v1
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, v3, v1
; GFX9-NEXT: s_and_b64 vcc, vcc, s[4:5]
-; GFX9-NEXT: v_add_u32_e32 v5, v4, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, v3, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3
-; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
%d = srem i32 %a, %b
ret i32 %d
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; VI-LABEL: test_copy_v4i8_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s14, 0
-; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, 0
+; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
+; SI-NEXT: s_mov_b64 s[16:17], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s8, s4
-; SI-NEXT: s_mov_b32 s9, s5
-; SI-NEXT: s_mov_b32 s4, s2
-; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11
-; SI-NEXT: s_mov_b32 s15, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s15
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s18, s14
-; SI-NEXT: s_mov_b32 s19, s15
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s16, s2
-; SI-NEXT: s_mov_b32 s17, s3
-; SI-NEXT: s_mov_b32 s2, s14
-; SI-NEXT: s_mov_b32 s3, s15
-; SI-NEXT: s_mov_b32 s12, s6
-; SI-NEXT: s_mov_b32 s13, s7
-; SI-NEXT: s_mov_b32 s6, s14
-; SI-NEXT: s_mov_b32 s7, s15
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s16, s6
+; SI-NEXT: s_mov_b32 s17, s7
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b32 s0, s10
+; SI-NEXT: s_mov_b32 s1, s11
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s18, s2
+; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
+; VI-NEXT: s_mov_b32 s0, s10
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: v_add_u32_e32 v0, vcc, s12, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s8, s6
-; VI-NEXT: s_mov_b32 s9, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s1, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI-LABEL: test_copy_v4i8_extra_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s2, 0
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s0, 0xff00
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
-; SI-NEXT: s_movk_i32 s1, 0xff
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_mov_b32 s12, 0xff00
+; SI-NEXT: s_movk_i32 s13, 0xff
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s1, s7
+; SI-NEXT: s_mov_b32 s6, s2
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: v_and_b32_e32 v2, s0, v0
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0
-; SI-NEXT: v_and_b32_e32 v0, s1, v0
-; SI-NEXT: v_and_b32_e32 v3, s0, v1
+; SI-NEXT: v_and_b32_e32 v4, s12, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s1, v1
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
-; SI-NEXT: v_or_b32_e32 v1, v3, v1
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v2, s12, v0
+; SI-NEXT: v_and_b32_e32 v3, s13, v3
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_and_b32_e32 v1, s13, v1
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
+; SI-NEXT: v_or_b32_e32 v1, v4, v1
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_movk_i32 s8, 0xff00
+; VI-NEXT: s_movk_i32 s10, 0x900
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: s_movk_i32 s8, 0xff00
+; VI-NEXT: s_movk_i32 s9, 0xff
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
-; VI-NEXT: s_movk_i32 s9, 0xff
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: s_movk_i32 s10, 0x900
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v3, s8, v1
+; VI-NEXT: v_and_b32_e32 v4, s8, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
+; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s9, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_and_b32_e32 v2, s8, v0
-; VI-NEXT: v_add_u16_e32 v0, 9, v0
-; VI-NEXT: v_and_b32_e32 v0, s9, v0
-; VI-NEXT: v_or_b32_e32 v1, v3, v1
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_and_b32_e32 v3, s9, v3
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s10, v1
-; VI-NEXT: v_add_u16_e32 v0, s10, v0
+; VI-NEXT: v_add_u16_e32 v2, s10, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s14, 0
-; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: s_mov_b32 s18, 0
+; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[12:13], s[10:11]
+; SI-NEXT: s_mov_b64 s[16:17], s[10:11]
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; SI-NEXT: s_mov_b32 s16, 0xff00
-; SI-NEXT: s_movk_i32 s17, 0xff
-; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
-; SI-NEXT: s_mov_b32 s8, s6
-; SI-NEXT: s_mov_b32 s9, s7
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s8, 0xff00
+; SI-NEXT: s_movk_i32 s9, 0xff
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_and_b32_e32 v4, s16, v1
+; SI-NEXT: v_and_b32_e32 v4, s8, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT: v_and_b32_e32 v2, s16, v0
-; SI-NEXT: v_and_b32_e32 v3, s17, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v0
+; SI-NEXT: v_and_b32_e32 v3, s9, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v1, s17, v1
+; SI-NEXT: v_and_b32_e32 v1, s9, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_movk_i32 s12, 0xff00
-; VI-NEXT: s_movk_i32 s13, 0xff
-; VI-NEXT: s_movk_i32 s14, 0x900
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_movk_i32 s4, 0xff00
+; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_movk_i32 s5, 0xff
+; VI-NEXT: s_movk_i32 s6, 0x900
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v4, s12, v1
+; VI-NEXT: v_and_b32_e32 v4, s4, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
-; VI-NEXT: v_and_b32_e32 v1, s13, v1
+; VI-NEXT: v_and_b32_e32 v1, s5, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
-; VI-NEXT: v_and_b32_e32 v2, s12, v0
-; VI-NEXT: v_and_b32_e32 v3, s13, v3
+; VI-NEXT: v_and_b32_e32 v2, s4, v0
+; VI-NEXT: v_and_b32_e32 v3, s5, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_add_u16_e32 v1, s14, v1
-; VI-NEXT: v_add_u16_e32 v2, s14, v2
+; VI-NEXT: v_add_u16_e32 v1, s6, v1
+; VI-NEXT: v_add_u16_e32 v2, s6, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
-; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32:
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v2i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v2, v1
; SI-NEXT: v_ffbh_u32_e32 v3, v0
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v2i32:
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v4i32:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v2
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v4i32:
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8:
; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v2
; SI-NEXT: v_ffbh_u32_e32 v5, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc
; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64:
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64_trunc:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v4
; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
-; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64_trunc:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2]
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v4, v0
-; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_ffbh_u32_e32 v5, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64_trunc:
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s6, 0
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_movk_i32 s12, 0xff
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s12, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: v_mov_b32_e32 v4, 9
-; VI-NEXT: s_movk_i32 s8, 0x900
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: v_mov_b32_e32 v6, s8
+; VI-NEXT: v_mov_b32_e32 v4, 9
+; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_movk_i32 s0, 0x900
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5
+; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5
-; VI-NEXT: v_add_u16_e32 v9, 9, v5
+; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
+; VI-NEXT: v_add_u16_e32 v8, 9, v5
; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
+; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_add_u16_e32 v0, s8, v0
-; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u16_e32 v0, s0, v0
+; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
-; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v9, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
+; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v0
-; SI-NEXT: v_or_b32_e32 v0, v8, v6
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
+; SI-NEXT: v_or_b32_e32 v0, v8, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v6, v0, v2
+; SI-NEXT: v_or_b32_e32 v6, v0, v6
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v10, v[2:3]
-; VI-NEXT: flat_load_ubyte v11, v[4:5]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 5, v0
+; VI-NEXT: v_add_u32_e32 v6, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 1, v0
+; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v5, v[8:9]
+; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v6, v[6:7]
+; VI-NEXT: flat_load_ubyte v7, v[8:9]
+; VI-NEXT: flat_load_ubyte v8, v[10:11]
+; VI-NEXT: flat_load_ubyte v9, v[12:13]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v1, v[2:3]
+; VI-NEXT: flat_load_ubyte v2, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v10
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
+; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
-; VI-NEXT: v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v4
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3
-; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s4, s2
-; SI-NEXT: s_mov_b32 s5, s3
-; SI-NEXT: s_mov_b32 s2, 0
-; SI-NEXT: s_mov_b32 s3, s7
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s1, s7
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
-; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v0
-; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
-; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v0
-; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen
-; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
+; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
+; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen
+; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_uge_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_uge_f64:
define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_oge_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_oge_f64:
define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_ugt_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_ugt_f64:
define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_ogt_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_ogt_f64:
define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_uge_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_uge_f64:
define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ugt_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ugt_f64:
define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ule_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ule_f64:
define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ult_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ult_f64:
define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_oge_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_oge_f64:
define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ogt_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ogt_f64:
define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ole_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ole_f64:
define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_olt_f64:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_olt_f64:
; GFX6-LABEL: v_pow_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_log_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1
+; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
ret <2 x half> %pow
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_log_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_log_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v4, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v2
+; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v3
; GFX6-NEXT: v_exp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v4
+; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v4
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y.fneg = fneg <2 x half> %y
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%y.fneg = fneg <2 x half> %y
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB0_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
-; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
-; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
-; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2
-; GFX9-NEXT: v_add_u32_e32 v7, 1, v3
-; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
-; GFX9-NEXT: v_add_u32_e32 v5, s6, v1
-; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5
+; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
+; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2
+; GFX9-NEXT: v_add_u32_e32 v6, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v7, -1, v3
+; GFX9-NEXT: v_add_u32_e32 v4, s6, v4
+; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT: s_add_u32 s6, s6, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: s_add_u32 s4, s4, 4
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB0_1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB1_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
-; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
-; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
-; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2
-; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3
-; GFX9-NEXT: v_not_b32_e32 v3, v3
-; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5
+; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3
+; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2
+; GFX9-NEXT: v_not_b32_e32 v6, v3
+; GFX9-NEXT: v_sub_u32_e32 v3, 1, v3
; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3
-; GFX9-NEXT: v_add_u32_e32 v4, s6, v4
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6
+; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6
+; GFX9-NEXT: v_add_u32_e32 v5, s6, v5
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5
; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v4, s6, v6
; GFX9-NEXT: v_add_u32_e32 v3, s6, v3
-; GFX9-NEXT: v_add_u32_e32 v5, s6, v5
; GFX9-NEXT: s_add_u32 s6, s6, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: s_add_u32 s4, s4, 4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB1_1
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3
-; GFX9-NEXT: v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT: v_add_u32_e32 v7, -1, v3
-; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4
+; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
+; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
+; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX9-NEXT: s_add_i32 s6, s6, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GFX9-NEXT: s_add_u32 s4, s4, 4
; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3
-; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2
+; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
; GFX9-NEXT: BB4_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1]
-; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1
+; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7
+; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB4_1
; GFX9-NEXT: BB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v8, s5
+; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
-; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1
-; GFX9-NEXT: v_trunc_f32_e32 v8, v8
-; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8
-; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1
+; GFX9-NEXT: v_trunc_f32_e32 v7, v7
+; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7
+; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT: s_and_b32 s6, s2, s4
+; GFX10-DL-NEXT: s_and_b32 s4, s3, s4
+; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s6
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0
+; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s2, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT: v_mad_u32_u24 v0, s2, s2, v0
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s0, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff
+; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s3, s3, s8
-; GFX10-DL-NEXT: s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT: s_and_b32 s3, s3, s7
+; GFX10-DL-NEXT: s_and_b32 s5, s5, s7
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: s_and_b32 s2, s2, s8
-; GFX10-DL-NEXT: s_and_b32 s4, s4, s8
+; GFX10-DL-NEXT: s_and_b32 s2, s2, s7
+; GFX10-DL-NEXT: s_and_b32 s4, s4, s7
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s3, s3, s8
-; GFX10-DL-NEXT: s_and_b32 s5, s5, s8
+; GFX10-DL-NEXT: s_and_b32 s3, s3, s7
+; GFX10-DL-NEXT: s_and_b32 s5, s5, s7
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT: s_and_b32 s7, s4, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT: s_and_b32 s2, s3, s2
-; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
+; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-DL-NEXT: s_and_b32 s7, s3, s5
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_and_b32 s2, s2, s5
+; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16
+; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_and_b32 s2, s2, s5
+; GFX10-DL-NEXT: s_and_b32 s3, s3, s5
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-DL-NEXT: s_and_b32 s6, s2, s5
+; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
-; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT: s_and_b32 s2, s2, s4
+; GFX10-DL-NEXT: s_and_b32 s3, s3, s4
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8
-; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8
-; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
+; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
+; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s1, s2
-; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010
+; GFX8-NEXT: s_sext_i32_i8 s3, s2
+; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s1, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008
-; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2
-; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010
+; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
+; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008
-; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0
-; GFX8-NEXT: s_sext_i32_i8 s1, s0
-; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1
+; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80010
+; GFX8-NEXT: s_ashr_i32 s4, s1, 24
+; GFX8-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NEXT: s_ashr_i32 s2, s0, 24
+; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010
-; GFX8-NEXT: s_ashr_i32 s2, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NEXT: s_ashr_i32 s0, s0, 24
+; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
+; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s0, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v4, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s3, v6, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0
; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1
-; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3
; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 8, s2
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s2
+; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5
-; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
+; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8
-; GFX10-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
;
; GFX10-DL-LABEL: udot2_8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s3, s0, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s1, s2
+; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_movk_i32 s1, 0xff
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_and_b32 s3, s2, s1
+; GFX10-DL-NEXT: s_and_b32 s1, s0, s1
+; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
-; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
;
; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
+; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008
-; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0
-; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
-; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0
-; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24
-; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0
+; GFX10-DL-NEXT: s_and_b32 s6, s2, s5
+; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0
+; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-DL-NEXT: s_and_b32 s8, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
+; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008
+; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: s_and_b32 s8, s2, s5
+; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0
-; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010
-; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010
-; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0
-; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24
-; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s5, v0
+; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010
+; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
+; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010
-; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: s_sext_i32_i8 s4, s0
-; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010
-; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
;
; GFX10-DL-LABEL: notdot4_mixedtypes:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s3, 0xff
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
+; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: s_and_b32 s7, s4, s3
-; GFX10-DL-NEXT: s_and_b32 s3, s5, s3
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010
-; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2
-; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010
-; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_and_b32 s7, s2, s5
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
+; GFX10-DL-NEXT: v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT: v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s5, v2
+; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0
-; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6
-; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
+; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16
+; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
+; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4
+; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16
+; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
+; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5
-; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000
-; GFX8-NEXT: s_lshr_b32 s1, s0, 12
-; GFX8-NEXT: s_lshr_b32 s7, s2, 12
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000
+; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004
+; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: s_lshr_b32 s2, s0, 12
+; GFX8-NEXT: s_lshr_b32 s4, s1, 12
+; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004
-; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1
-; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4
-; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010
-; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v9, s5
-; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014
-; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018
-; GFX8-NEXT: s_ashr_i32 s2, s2, 28
-; GFX8-NEXT: v_mov_b32_e32 v10, s7
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
+; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4
+; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3
+; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010
+; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014
+; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
+; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018
+; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v9, s14
+; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018
+; GFX8-NEXT: s_ashr_i32 s1, s1, 28
+; GFX8-NEXT: v_mov_b32_e32 v10, s16
; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000
-; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004
-; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000
-; GFX9-NEXT: s_lshr_b32 s1, s0, 12
-; GFX9-NEXT: s_lshr_b32 s7, s2, 12
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000
+; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004
+; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: s_lshr_b32 s2, s0, 12
+; GFX9-NEXT: s_lshr_b32 s4, s1, 12
+; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004
-; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s1
-; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010
-; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v9, s5
-; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014
-; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018
-; GFX9-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-NEXT: v_mov_b32_e32 v10, s7
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mov_b32_e32 v7, s8
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010
+; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
+; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014
+; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018
+; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v9, s14
+; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018
+; GFX9-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-NEXT: v_mov_b32_e32 v10, s16
; GFX9-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000
-; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
-; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12
-; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12
+; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12
+; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4
-; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1
-; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5
-; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018
-; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3
+; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14
+; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16
; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12
-; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12
-; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
-; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
+; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12
+; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010
-; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
;
; GFX10-DL-LABEL: idot8_acc8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12
-; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12
-; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
-; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
+; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12
+; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT: s_movk_i32 s4, 0xff
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010
-; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2
+; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2
+; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshl_b32 s5, s7, 28
-; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT: s_lshl_b32 s9, s7, 24
-; GFX8-NEXT: s_lshl_b32 s11, s7, 20
-; GFX8-NEXT: s_lshl_b32 s5, s1, 28
-; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60
-; GFX8-NEXT: s_lshl_b32 s5, s1, 20
-; GFX8-NEXT: s_lshl_b32 s13, s1, 24
-; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
-; GFX8-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NEXT: s_lshl_b32 s29, s7, 28
+; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60
+; GFX8-NEXT: s_lshl_b32 s21, s7, 8
+; GFX8-NEXT: s_lshl_b32 s23, s7, 12
+; GFX8-NEXT: s_lshl_b32 s17, s1, 28
+; GFX8-NEXT: s_lshl_b32 s25, s7, 16
+; GFX8-NEXT: s_lshl_b32 s27, s7, 24
+; GFX8-NEXT: s_lshl_b32 s19, s7, 4
+; GFX8-NEXT: s_lshl_b32 s7, s7, 20
+; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
+; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60
+; GFX8-NEXT: s_lshl_b32 s9, s1, 8
+; GFX8-NEXT: s_lshl_b32 s11, s1, 12
+; GFX8-NEXT: s_lshl_b32 s13, s1, 16
+; GFX8-NEXT: s_lshl_b32 s15, s1, 24
+; GFX8-NEXT: s_lshl_b32 s5, s1, 4
+; GFX8-NEXT: s_lshl_b32 s1, s1, 20
+; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60
+; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
+; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60
+; GFX8-NEXT: v_mov_b32_e32 v4, s28
+; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60
+; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s26
+; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60
+; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3
+; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
-; GFX8-NEXT: s_lshl_b32 s5, s7, 16
-; GFX8-NEXT: v_mov_b32_e32 v5, s8
-; GFX8-NEXT: s_lshl_b32 s9, s1, 16
-; GFX8-NEXT: s_lshl_b32 s11, s7, 12
-; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60
-; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
-; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4
-; GFX8-NEXT: s_lshl_b32 s5, s1, 12
-; GFX8-NEXT: s_lshl_b32 s9, s7, 8
+; GFX8-NEXT: v_mov_b32_e32 v6, s24
+; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
-; GFX8-NEXT: v_mov_b32_e32 v6, s16
-; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60
-; GFX8-NEXT: s_lshl_b32 s13, s1, 8
-; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60
-; GFX8-NEXT: s_lshl_b32 s5, s7, 4
-; GFX8-NEXT: v_mov_b32_e32 v7, s10
-; GFX8-NEXT: s_lshl_b32 s9, s1, 4
-; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60
-; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60
+; GFX8-NEXT: v_mov_b32_e32 v7, s22
+; GFX8-NEXT: s_ashr_i64 s[32:33], s[18:19], 60
+; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v8, s20
-; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
-; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60
-; GFX8-NEXT: v_mov_b32_e32 v9, s24
-; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
+; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60
+; GFX8-NEXT: v_mov_b32_e32 v9, s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s14, v5, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s8, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s30, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s18
+; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s4, s2, 15
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT: s_lshr_b32 s12, s2, 28
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT: s_lshr_b32 s5, s2, 28
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c
+; GFX9-NEXT: s_and_b32 s12, s2, 15
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-NEXT: s_lshr_b32 s13, s6, 28
; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010
; GFX9-NEXT: s_and_b32 s18, s6, 15
; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17
; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s4, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2
+; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018
+; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c
+; GFX9-DL-NEXT: s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5
; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28
; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010
; GFX9-DL-NEXT: s_and_b32 s18, s6, 15
; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6
-; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
;
; GFX10-DL-LABEL: idot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
;
; GFX10-DL-LABEL: idot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_and_b32 s1, s1, 15
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT: s_and_b32 s0, s0, 15
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT: s_lshr_b32 s2, s2, 28
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_and_b32 s1, s1, 15
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_and_b32 s1, s0, 15
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT: s_lshr_b32 s2, s2, 28
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: v_mov_b32_e32 v9, s11
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_and_b32 s1, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s0, s0, 15
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT: s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_and_b32 s1, s1, 15
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT: s_and_b32 s0, s0, 15
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT: s_lshr_b32 s2, s2, 28
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_and_b32 s1, s1, 15
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_and_b32 s1, s0, 15
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT: s_lshr_b32 s2, s2, 28
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: v_mov_b32_e32 v9, s11
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_and_b32 s1, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s0, s0, 15
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT: s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_and_b32 s9, s0, 15
+; GFX8-NEXT: s_and_b32 s16, s1, 15
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
-; GFX8-NEXT: s_lshr_b32 s11, s2, 28
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
+; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT: s_and_b32 s9, s0, 15
+; GFX9-NEXT: s_and_b32 s16, s1, 15
+; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT: s_and_b32 s1, s0, 15
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: v_mov_b32_e32 v9, s11
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT: s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT: s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc4:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_and_b32 s9, s0, 15
+; GFX8-NEXT: s_and_b32 s16, s1, 15
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
-; GFX8-NEXT: s_lshr_b32 s11, s2, 28
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
+; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT: s_and_b32 s9, s0, 15
+; GFX9-NEXT: s_and_b32 s16, s1, 15
+; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT: s_and_b32 s1, s0, 15
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: v_mov_b32_e32 v9, s11
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT: s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT: s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008
+; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_and_b32 s1, s1, 15
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004
+; GFX8-NEXT: s_and_b32 s0, s0, 15
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX8-NEXT: s_lshr_b32 s2, s2, 28
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-NEXT: s_lshr_b32 s13, s2, 28
-; GFX9-NEXT: s_and_b32 s4, s0, 15
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
+; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018
+; GFX9-NEXT: s_lshr_b32 s13, s6, 28
+; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s13
+; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018
+; GFX9-NEXT: s_lshr_b32 s5, s2, 28
+; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v0
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s14, s15
; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s12, s0
-; GFX9-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s0
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s6, s7
-; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: v_pk_mul_lo_u16 v6, s1, v6
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v5
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008
+; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c
+; GFX9-NEXT: s_and_b32 s18, s6, 15
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s5, s8, s9
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s17
+; GFX9-NEXT: v_pk_mul_lo_u16 v3, s5, v0
+; GFX9-NEXT: s_and_b32 s12, s2, 15
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s11
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s6
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, s5, v0
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_e32 v6, v5, v6
+; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28
-; GFX9-DL-NEXT: s_and_b32 s4, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
+; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018
+; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s13
+; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018
+; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s11, s11, s13
-; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v0
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s14, s15
; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s12, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s7
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s1, v6
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c
+; GFX9-DL-NEXT: s_and_b32 s18, s6, 15
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s9
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s16, s17
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s5, v0
+; GFX9-DL-NEXT: s_and_b32 s12, s2, 15
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s10, s11
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s6
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s5, v0
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6
+; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3
+; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
;
; GFX10-DL-LABEL: udot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004
-; GFX10-DL-NEXT: s_and_b32 s6, s0, 15
-; GFX10-DL-NEXT: s_and_b32 s8, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c
-; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s4, s5
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9
+; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
+; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004
+; GFX10-DL-NEXT: s_and_b32 s5, s0, 15
+; GFX10-DL-NEXT: s_and_b32 s7, s1, 15
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4
+; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7
+; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014
-; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010
-; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018
; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3
; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010
; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7
-; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018
+; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7
; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s8
+; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7
; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3
; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s1, s2, 15
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX8-NEXT: s_and_b32 s9, s0, 15
+; GFX8-NEXT: s_and_b32 s16, s1, 15
+; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v4, s16
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX8-NEXT: s_lshr_b32 s10, s1, 28
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: s_lshr_b32 s2, s0, 28
+; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX8-NEXT: s_and_b32 s1, s0, 15
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
-; GFX8-NEXT: s_lshr_b32 s11, s2, 28
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_lshr_b32 s0, s0, 28
+; GFX8-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, s13
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
+; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s1, s2, 15
-; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-NEXT: s_and_b32 s9, s0, 15
+; GFX9-NEXT: s_and_b32 s16, s1, 15
+; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-NEXT: s_and_b32 s1, s0, 15
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: v_mov_b32_e32 v9, s11
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s1, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s9, s0, 15
+; GFX9-DL-NEXT: s_and_b32 s16, s1, 15
+; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28
+; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
+; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008
-; GFX9-DL-NEXT: s_and_b32 s1, s0, 15
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
i32 addrspace(1)* %v2addr,
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s2
+; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s2
+; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s2, 16
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; VI-NEXT: s_or_b32 s0, s0, s2
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s1
+; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_and_b32 s0, s4, 0xffff
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: s_lshl_b32 s2, s1, 16
-; CI-NEXT: s_or_b32 s0, s0, s2
-; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_lshr_b32 s0, s0, 16
+; CI-NEXT: s_lshl_b32 s2, s0, 16
+; CI-NEXT: s_or_b32 s1, s1, s2
+; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s2
+; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_lshr_b32 s0, s4, 16
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s0
+; GFX9-NEXT: ; use s1
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
-; VI-NEXT: s_or_b32 s1, s0, s1
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s0
+; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_lshr_b32 s0, s4, 16
+; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
-; CI-NEXT: s_or_b32 s1, s0, s1
-; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; CI-NEXT: s_or_b32 s0, s1, s0
+; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s0
+; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_lshr_b32 s0, s4, 16
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s0, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s0
+; GFX9-NEXT: ; use s1
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s1
+; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; VI-NEXT: s_or_b32 s2, s0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s0
+; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s1
+; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_lshr_b32 s0, s4, 16
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: s_lshl_b32 s2, s1, 16
-; CI-NEXT: s_or_b32 s2, s0, s2
+; CI-NEXT: s_lshr_b32 s0, s0, 16
+; CI-NEXT: s_lshl_b32 s2, s0, 16
+; CI-NEXT: s_or_b32 s2, s1, s2
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s0
+; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x3e7
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x3e7
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
+; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s1, s2, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s2, 0xffff
+; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s2, 16
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
+; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x4500
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x4500
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
-; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
+; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x3e7
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x3e7
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_reghi:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: s_lshr_b32 s1, s4, 16
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v2, s1, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_reghi:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: s_lshr_b32 s1, s4, 16
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_lshr_b32 s0, s4, 16
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT: v_or_b32_e32 v2, s1, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_or_b32_e32 v0, s0, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v2, 53, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v0, 53, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT: v_or_b32_e32 v2, 53, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_or_b32_e32 v0, 53, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x3e7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x3e7
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x3e70000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0xfff10000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_0:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_or_b32_e32 v2, 53, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v0, 53, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; CI-NEXT: v_or_b32_e32 v2, 53, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_or_b32_e32 v0, 53, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x4500
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x4500
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x45000000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x45000000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_1:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x230000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x230000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
-; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: s_lshl_b32 s2, s4, 4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: s_lshl_b32 s0, s4, 4
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: s_lshl_b32 s2, s4, 4
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshl_b32 s0, 0xffff, s2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_bfi_b32 v2, s0, v3, v4
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_bfi_b32 v0, s0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v4, v[0:1]
-; CI-NEXT: s_lshl_b32 s2, s4, 4
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_lshl_b32 s0, 0xffff, s2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: s_lshl_b32 s0, s4, 4
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
+; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_bfi_b32 v2, s0, v3, v4
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_bfi_b32 v0, s0, v1, v0
+; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s6, 0xffff
-; GFX9-NEXT: s_mov_b32 s7, 0x12341234
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4
-; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
+; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x12341234
+; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s6, 0xffff
-; VI-NEXT: s_mov_b32 s7, 0x12341234
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
-; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_bfi_b32 v2, v2, s7, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; VI-NEXT: s_mov_b32 s0, 0x12341234
+; VI-NEXT: v_bfi_b32 v0, v1, s0, v0
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
-; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b32 s6, 0x12341234
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: v_mov_b32_e32 v1, s5
-; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v4, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; CI-NEXT: flat_load_dword v3, v[0:1]
-; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: flat_load_dword v2, v[2:3]
+; CI-NEXT: flat_load_dword v0, v[0:1]
+; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: s_mov_b32 s0, 0x12341234
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
-; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
+; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2
+; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_bfi_b32 v2, v2, s6, v3
-; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_bfi_b32 v0, v1, s0, v0
+; CI-NEXT: flat_store_dword v[4:5], v0
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_and_b32 s1, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; VI-NEXT: v_or_b32_e32 v0, s1, v0
+; VI-NEXT: v_or_b32_e32 v0, s0, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_or_b32_e32 v0, s1, v0
+; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_lshl_b32 s2, s4, 16
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_lshl_b32 s1, s4, 16
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; CI-NEXT: v_or_b32_e32 v0, s1, v0
+; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_and_b32 s1, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT: v_or_b32_e32 v1, s1, v1
+; VI-NEXT: v_or_b32_e32 v1, s0, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_lshl_b32 s2, s4, 16
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_lshl_b32 s1, s4, 16
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_and_b32 s1, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; VI-NEXT: v_or_b32_e32 v1, s1, v1
+; VI-NEXT: v_or_b32_e32 v1, s0, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_or_b32_e32 v1, s1, v1
+; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1
-; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0
+; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1
+; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dword s6, s[4:5], 0x10
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_mov_b32 s4, 0xffff
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 s5, 0
-; VI-NEXT: s_and_b32 s1, s6, s4
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_lshl_b32 s0, s1, 16
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s2, s4, s0
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: s_lshl_b32 s3, s2, 16
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5]
+; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
+; VI-NEXT: s_or_b32 s0, s2, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
;
; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: s_load_dword s6, s[4:5], 0x4
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: flat_load_dword v4, v[0:1]
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_mov_b32 s4, 0xffff
+; CI-NEXT: s_mov_b32 s6, 0xffff
+; CI-NEXT: s_mov_b32 s7, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: s_mov_b32 s5, 0
-; CI-NEXT: s_lshl_b32 s2, s6, 16
-; CI-NEXT: s_and_b32 s3, s6, s4
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_lshl_b32 s1, s4, 16
+; CI-NEXT: s_and_b32 s3, s4, s6
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_or_b32 s1, s3, s2
+; CI-NEXT: s_or_b32 s0, s3, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
+; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_bfi_b32 v1, v5, s1, v1
-; CI-NEXT: v_bfi_b32 v0, v4, s1, v0
+; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
+; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_mov_b32 s6, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4
-; GFX9-NEXT: s_lshl_b32 s2, s5, 4
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: s_lshl_b32 s3, s5, 4
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1
-; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0
+; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1
+; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_mov_b32 s6, 0xffff
-; VI-NEXT: s_mov_b32 s7, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_and_b32 s2, s4, s6
-; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_lshl_b32 s4, s5, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4
+; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 s1, 0
+; VI-NEXT: s_lshl_b32 s2, s5, 4
+; VI-NEXT: s_and_b32 s3, s4, s0
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT: s_lshl_b32 s2, s3, 16
+; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_mov_b32 s6, 0xffff
-; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: s_and_b32 s2, s4, s6
-; CI-NEXT: s_lshl_b32 s3, s4, 16
-; CI-NEXT: s_or_b32 s2, s2, s3
-; CI-NEXT: s_lshl_b32 s4, s5, 4
-; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: s_and_b32 s2, s4, s0
+; CI-NEXT: s_lshl_b32 s4, s4, 16
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_or_b32 s2, s2, s4
+; CI-NEXT: s_mov_b32 s1, 0
+; CI-NEXT: s_lshl_b32 s3, s5, 4
+; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; VERDE-LABEL: image_load_mmo:
; VERDE: ; %bb.0:
; VERDE-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm
-; VERDE-NEXT: v_mov_b32_e32 v2, 0
+; VERDE-NEXT: v_mov_b32_e32 v3, 0
; VERDE-NEXT: s_mov_b32 m0, -1
-; VERDE-NEXT: v_add_i32_e32 v3, vcc, 16, v0
-; VERDE-NEXT: ds_write_b32 v0, v2
-; VERDE-NEXT: ds_write_b32 v3, v2
+; VERDE-NEXT: ds_write_b32 v0, v3
+; VERDE-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; VERDE-NEXT: ds_write_b32 v0, v3
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: v_mov_b32_e32 v0, v1
; VERDE-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-LABEL: image_load_mmo:
; GFX10: ; %bb.0:
; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x03,0x03,0x00]
+; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v11, s13 ; encoding: [0x0d,0x02,0x16,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
-; GFX10-NEXT: v_mov_b32_e32 v10, s12 ; encoding: [0x0c,0x02,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[10:11], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x01,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb
-; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_mov_b32 s7, 0xf000
; VARIANT0-NEXT: s_mov_b32 s6, 0
; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
+; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT1: ; %bb.0: ; %entry
; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb
-; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_mov_b32 s7, 0xf000
; VARIANT1-NEXT: s_mov_b32 s6, 0
; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
+; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_barrier
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
-; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT: v_mov_b32_e32 v4, s3
-; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0
-; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT: v_mov_b32_e32 v2, s3
+; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
+; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
+; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
+; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
-; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
-; VARIANT2-NEXT: v_mov_b32_e32 v5, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
-; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier
-; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
+; VARIANT2-NEXT: global_load_dword v0, v[3:4], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
-; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
; VARIANT2-NEXT: s_endpgm
;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
-; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT: v_mov_b32_e32 v4, s3
-; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0
-; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT3-NEXT: v_mov_b32_e32 v2, s3
+; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
+; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
+; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
+; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT: v_mov_b32_e32 v0, s3
; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
-; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
-; VARIANT3-NEXT: v_mov_b32_e32 v5, s3
-; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
-; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
; VARIANT3-NEXT: s_barrier
-; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
+; VARIANT3-NEXT: global_load_dword v0, v[3:4], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
-; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
; VARIANT3-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
; SI-LABEL: simplify_bfe_u32_multi_use_arg:
; SI: ; %bb.0:
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; SI-NEXT: s_movk_i32 s9, 0xfc01
-; SI-NEXT: s_mov_b32 s5, 0xfffff
-; SI-NEXT: s_mov_b32 s4, -1
+; SI-NEXT: s_movk_i32 s11, 0xfc01
+; SI-NEXT: s_mov_b32 s9, 0xfffff
+; SI-NEXT: s_mov_b32 s8, -1
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
-; SI-NEXT: s_brev_b32 s8, -2
+; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s7, 0x80000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v10, vcc, s9, v4
-; SI-NEXT: v_lshr_b64 v[4:5], s[4:5], v10
+; SI-NEXT: v_add_i32_e32 v10, vcc, s11, v4
+; SI-NEXT: v_lshr_b64 v[4:5], s[8:9], v10
; SI-NEXT: v_cmp_eq_u32_e32 vcc, -1, v10
; SI-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
-; SI-NEXT: v_bfi_b32 v11, s8, v8, v3
+; SI-NEXT: v_bfi_b32 v11, s10, v8, v3
; SI-NEXT: v_and_b32_e32 v9, v3, v5
; SI-NEXT: v_and_b32_e32 v8, v2, v4
; SI-NEXT: v_lshr_b64 v[6:7], s[6:7], v10
;
; CI-LABEL: v_round_f64:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT: s_brev_b32 s2, -2
-; CI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
-; CI-NEXT: v_mov_b32_e32 v4, 0
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_brev_b32 s6, -2
+; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_trunc_f64_e32 v[6:7], v[2:3]
-; CI-NEXT: v_add_f64 v[8:9], v[2:3], -v[6:7]
-; CI-NEXT: v_bfi_b32 v2, s2, v5, v3
-; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
-; CI-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc
-; CI-NEXT: v_add_f64 v[2:3], v[6:7], v[4:5]
-; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
+; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
+; CI-NEXT: v_bfi_b32 v2, s6, v8, v3
+; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v2, 0
+; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
+; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
-; GFX900-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: ds_write_b16 v3, v0
-; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1
+; GFX900-NEXT: ds_write_b16 v2, v0
+; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
-; GFX906-NEXT: v_mov_b32_e32 v3, 0
-; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX906-NEXT: v_mov_b32_e32 v2, 0
+; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: ds_write_b16 v3, v0
-; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
+; GFX906-NEXT: ds_write_b16 v2, v0
+; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
+; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX906-NEXT: v_mov_b32_e32 v3, 0
+; GFX906-NEXT: ds_write_b16 v3, v2
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX906-NEXT: v_mov_b32_e32 v4, 0
-; GFX906-NEXT: ds_write_b16 v4, v3
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX803-NEXT: v_mov_b32_e32 v3, 0
-; GFX803-NEXT: ds_write_b16 v3, v1
+; GFX803-NEXT: v_mov_b32_e32 v2, 0
+; GFX803-NEXT: ds_write_b16 v2, v1
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
-; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_write_b16 v2, v0
-; GFX900-NEXT: ds_write_b16 v3, v5
-; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1
+; GFX900-NEXT: ds_write_b16 v3, v4
+; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
-; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff
-; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: ds_write_b16 v2, v0
-; GFX906-NEXT: ds_write_b16 v3, v5
-; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1
+; GFX906-NEXT: ds_write_b16 v3, v4
+; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
-; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
-; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
; GCN-LABEL: muli24_shl64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1
+; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0
; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
+; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5
-; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3
-; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s8, s[0:1], 0xd
+; CI-NEXT: s_load_dword s0, s[0:1], 0xd
+; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_lshr_b32 s9, s0, 16
+; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_lshr_b32 s9, s8, 16
-; CI-NEXT: s_mov_b32 s10, 0xffff
-; CI-NEXT: s_and_b32 s8, s8, s10
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, s10, v2
+; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3
-; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0
-; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s8, s[0:1], 0xd
+; CI-NEXT: s_load_dword s0, s[0:1], 0xd
+; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_lshr_b32 s9, s0, 16
+; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_lshr_b32 s9, s8, 16
-; CI-NEXT: s_mov_b32 s10, 0xffff
-; CI-NEXT: s_and_b32 s8, s8, s10
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, s10, v2
+; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshr_b32_e32 v3, s9, v3
-; CI-NEXT: v_lshr_b32_e32 v2, s8, v2
+; CI-NEXT: v_lshr_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 8
+; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8
-; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT: v_lshr_b32_e32 v2, 8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
-; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
-; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1
+; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
-; VI-NEXT: v_or_b32_e32 v0, v3, v0
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v5, v0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
-; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00ff
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
-; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v5, v[0:1]
-; VI-NEXT: flat_load_ushort v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_ushort v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v2, v5, v2
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_max_i16_e32 v0, v0, v1
+; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_ushort v5, v[0:1], off
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_i16_e32 v2, v5, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v3, v5, v2
-; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_max_i16_e32 v2, v0, v1
+; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v2i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v2, v5, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v7, s5
-; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: flat_load_ushort v8, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v8, v[6:7]
; VI-NEXT: flat_load_dword v9, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_max_i16_e32 v0, v8, v0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v9, v1
; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: flat_store_short v[4:5], v0
-; VI-NEXT: flat_store_dword v[6:7], v1
+; VI-NEXT: flat_store_short v[6:7], v0
+; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v7, v[0:1], off
-; GFX9-NEXT: global_load_short_d16 v5, v[2:3], off offset:4
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_pk_max_i16 v3, v6, v5
+; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v2, v7, v2
-; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v7, v6
+; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v5, v[0:1]
-; VI-NEXT: flat_load_ushort v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_ushort v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v2, v5, v2
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_max_i16_e32 v0, v0, v1
+; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sgt_i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_ushort v5, v[0:1], off
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_i16_e32 v2, v5, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v5, v[0:1]
-; VI-NEXT: flat_load_ushort v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_ushort v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_u16_e32 v2, v5, v2
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_max_u16_e32 v0, v0, v1
+; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_uge_i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_ushort v5, v[0:1], off
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_u16_e32 v2, v5, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_max_u16_e32 v0, v0, v1
+; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v5, v[0:1]
-; VI-NEXT: flat_load_ushort v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_ushort v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_u16_e32 v2, v5, v2
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_max_u16_e32 v0, v0, v1
+; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_ushort v5, v[0:1], off
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_max_u16_e32 v2, v5, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_max_u16_e32 v0, v0, v1
+; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_u16_e32 v3, v5, v2
-; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_max_u16_e32 v2, v0, v1
+; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_v2i16:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v5, v[0:1], off
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_max_u16 v2, v5, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_max_u16 v0, v0, v1
+; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
- ; CHECK: undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
- ; CHECK: %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
+ ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF
- ; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
+ ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
- ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
+ ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
+ ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
- ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec
- ; CHECK: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec
+ ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec
- ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec
- ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec
- ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec
- ; CHECK: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
- ; CHECK: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
+ ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec
+ ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec
+ ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec
+ ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]]
+ ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: S_SETREG_IMM32_B32 0, 1
; CHECK: successors: %bb.1(0x80000000)
; CHECK: INLINEASM &"", 1, 851978, def dead %11
; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+ ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3)
; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %15, 851978, def %16
; CHECK: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
; CHECK: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %21, 851978, def %22
; CHECK: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
- ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
- ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
- ; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
- ; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
- ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: INLINEASM &"", 1, 851978, def dead [[V_MOV_B32_e32_2]], 851978, def dead [[V_MOV_B32_e32_3]], 851977, [[DS_READ_B64_gfx9_]].sub0, 2147483657, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193, [[V_MOV_B32_e32_3]](tied-def 5), 851977, %15, 851977, %16, 851977, [[DS_READ_B32_gfx9_1]], 851977, [[DS_READ_B32_gfx9_]], 851977, [[DS_READ_B32_gfx9_3]], 851977, [[DS_READ_B32_gfx9_2]]
+ ; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3)
; CHECK: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3)
; CHECK: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
; CHECK: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+ ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+ ; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: [[V_MUL_LO_U32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec
+ ; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec
; CHECK: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_]], [[DEF1]], implicit $exec
; CHECK: [[V_MUL_LO_U32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec
; CHECK: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_1]], [[V_MUL_LO_U32_]], implicit $exec
- ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; CHECK: [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec
; CHECK: undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_I32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec
; CHECK: undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec
; GCN-LABEL: name: handleMove_bundle
; GCN: liveins: $sgpr4_sgpr5
; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
- ; GCN: $vcc_hi = IMPLICIT_DEF
- ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4)
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN: $vcc_hi = IMPLICIT_DEF
; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3)
; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN: $m0 = S_MOV_B32 0
;
; GCN-IR-LABEL: s_test_sdiv24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe
-; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb
+; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s7, s2
-; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s3
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24
-; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
-; GCN-IR-NEXT: s_mov_b32 s5, s4
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 24
-; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GCN-IR-NEXT: s_sub_u32 s10, s8, s4
+; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 24
+; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_subb_u32 s11, s9, s4
-; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
-; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
-; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6
-; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
+; GCN-IR-NEXT: s_ashr_i32 s6, s5, 31
+; GCN-IR-NEXT: s_ashr_i64 s[12:13], s[4:5], 24
+; GCN-IR-NEXT: s_xor_b64 s[4:5], s[2:3], s[8:9]
+; GCN-IR-NEXT: s_sub_u32 s10, s4, s2
+; GCN-IR-NEXT: s_mov_b32 s7, s6
+; GCN-IR-NEXT: s_subb_u32 s11, s5, s2
+; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[12:13]
+; GCN-IR-NEXT: s_sub_u32 s8, s4, s6
+; GCN-IR-NEXT: s_subb_u32 s9, s5, s6
+; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8
; GCN-IR-NEXT: s_add_i32 s0, s0, 32
-; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7
+; GCN-IR-NEXT: s_flbit_i32_b32 s1, s9
; GCN-IR-NEXT: v_mov_b32_e32 v1, s0
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s10
; GCN-IR-NEXT: v_mov_b32_e32 v0, s1
-; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GCN-IR-NEXT: s_add_i32 s0, s0, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s1, s11
; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3
; GCN-IR-NEXT: v_subb_u32_e64 v1, s[0:1], 0, 0, vcc
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1]
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
; GCN-IR-NEXT: BB9_4: ; %udiv-preheader
; GCN-IR-NEXT: v_not_b32_e32 v2, v2
; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4
-; GCN-IR-NEXT: s_add_u32 s10, s6, -1
+; GCN-IR-NEXT: s_add_u32 s10, s8, -1
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT: s_addc_u32 s11, s7, -1
+; GCN-IR-NEXT: s_addc_u32 s11, s9, -1
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6
; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc
; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8
; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1
; GCN-IR-NEXT: BB9_7: ; %udiv-end
-; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3]
; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0
; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1
; GCN-IR-NEXT: v_mov_b32_e32 v2, s1
; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT: s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s10, -1
-; GCN-IR-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4
-; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
+; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24
define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16_computed_amount:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
-; GCN-NEXT: s_mov_b32 s15, s7
-; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT: s_mov_b32 s15, s3
+; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
-; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
-; GCN-NEXT: s_mov_b32 s15, s7
-; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT: s_mov_b32 s15, s3
+; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s0, 0xffff
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s4, 0xffff
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
+; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
-; GCN-NEXT: v_and_b32_e32 v0, s0, v0
+; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i16:
define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; GCN-NEXT: s_mov_b32 s8, 0xffff
-; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, s8, v2
; GCN-NEXT: v_or_b32_e32 v3, v3, v5
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_v4i16:
define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: v_shl_32_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
-; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], 3
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
-; GCN-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64
-; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GCN-NEXT: s_mov_b64 s[4:5], s[10:11]
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_32_i64:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
-; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3
-; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s8, s[0:1], 0xd
+; CI-NEXT: s_load_dword s0, s[0:1], 0xd
+; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_lshr_b32 s9, s0, 16
+; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_mov_b32 s9, 0xffff
-; CI-NEXT: s_lshr_b32 s10, s8, 16
-; CI-NEXT: s_and_b32 s8, s8, s9
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3
-; CI-NEXT: v_and_b32_e32 v2, s9, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3
+; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0
-; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: s_lshr_b32 s1, s8, 16
+; CI-NEXT: s_lshr_b32 s9, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
+; CI-NEXT: v_lshl_b32_e32 v2, s9, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 8
+; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8
-; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
-; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
-; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1
+; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
-; VI-NEXT: v_or_b32_e32 v0, v3, v0
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v5, v0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
-; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
-; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_mov_b32 s4, 0xff000000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_mov_b32 s0, 0xff000000
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: v_and_b32_e32 v0, s4, v0
+; VI-NEXT: v_and_b32_e32 v0, s0, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_and_b32_e32 v4, s4, v4
+; VI-NEXT: v_and_b32_e32 v4, s0, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
;
; CI-LABEL: shl_v_imm_v4i16:
; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, 0
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
-; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00
-; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
-; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_64:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_64_sub_x:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_64_sub_x:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_65:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffbf, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_65:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_65_sub_x:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_sub_u32_e32 v0, 0x41, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_65_sub_x:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_neg16:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, 16, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, 16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_neg16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_neg16_sub_x:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_sub_u32_e32 v0, -16, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg16_sub_x:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_neg17:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, 17, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, 17, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_neg17:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_neg17_sub_x:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_sub_u32_e32 v0, 0xffffffef, v0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg17_sub_x:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT: flat_store_short v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_ushort v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
+; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT: flat_load_ushort v0, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_ushort v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0
+; GFX9-NEXT: global_load_ushort v0, v[1:2], off
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
+; GFX9-NEXT: global_store_dword v[3:4], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 64
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 64
+; VI-NEXT: v_mov_b32_e32 v4, 64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, -7, v4
-; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, -7, v0
+; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x400007
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s0, 0x400007
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0xffffff85
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x7b0040
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s0, 0x7b0040
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_add_u16_e32 v3, -7, v3
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_add_u16_e32 v0, -7, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, 7
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, -16
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x3c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 32
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v3, 32, v4
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 32
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, -16
+; VI-NEXT: v_mov_b32_e32 v4, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, -16, v4
-; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, -16, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, -16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, -16
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
-; VI-NEXT: v_add_u16_e32 v3, -16, v3
-; VI-NEXT: v_or_b32_e32 v2, v3, v2
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_add_u16_e32 v0, -16, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400
+; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4
-; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, 0xffffc400, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x4400
+; VI-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, 4.0, v4
-; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, 4.0, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, 2.0, v4
-; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, 2.0, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000
+; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4
-; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_add_u16_e32 v1, 0xffffc000, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, 32
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u16_e32 v2, 32, v3
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
+; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v3, v[0:1], off
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
;
; GCN-IR-LABEL: s_test_srem24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
-; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xc
+; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_sext_i32_i16 s5, s3
-; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT: s_sext_i32_i16 s3, s6
+; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24
+; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24
+; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 24
-; GCN-IR-NEXT: s_mov_b32 s11, s10
+; GCN-IR-NEXT: s_mov_b32 s5, s4
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT: s_sub_u32 s8, s4, s10
-; GCN-IR-NEXT: s_subb_u32 s9, s5, s10
+; GCN-IR-NEXT: s_sub_u32 s8, s8, s4
+; GCN-IR-NEXT: s_subb_u32 s9, s9, s4
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8
; GCN-IR-NEXT: s_add_i32 s0, s0, 32
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
-; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
; CIVI-NEXT: ds_write_b32 v0, v1
-; CIVI-NEXT: ds_write_b8 v0, v3 offset:6
+; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
-; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
+; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8
+; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s0
-; FIJI-NEXT: v_mov_b32_e32 v3, s1
-; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT: s_and_b32 s3, s1, 0xffff
; FIJI-NEXT: s_add_u32 s0, s4, 14
+; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT: v_mov_b32_e32 v1, s2
-; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; FIJI-NEXT: ds_write_b16 v2, v3 offset:4
+; FIJI-NEXT: v_mov_b32_e32 v3, s2
+; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT: ds_write_b16 v2, v1 offset:4
; FIJI-NEXT: ds_write_b8 v2, v0 offset:6
; FIJI-NEXT: ds_write_b32 v2, v3
; FIJI-NEXT: s_endpgm
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
+; HAWAII-NEXT: v_mov_b32_e32 v1, s2
+; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
-; HAWAII-NEXT: v_mov_b32_e32 v2, s2
-; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4
; HAWAII-NEXT: ds_write_b32 v0, v1
; HAWAII-NEXT: s_endpgm
;
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v0, s0
+; FIJI-NEXT: v_mov_b32_e32 v1, s2
+; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
; FIJI-NEXT: v_mov_b32_e32 v1, s1
-; FIJI-NEXT: v_mov_b32_e32 v2, s2
-; FIJI-NEXT: ds_write_b16 v0, v2 offset:4
; FIJI-NEXT: ds_write_b32 v0, v1
; FIJI-NEXT: s_endpgm
;
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
-; GFX9-NEXT: ds_write_b32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT: ds_write_b32 v0, v2
; GFX9-NEXT: s_endpgm
store i48 %arg, i48 addrspace(3)* %ptr, align 8
ret void
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
+; HAWAII-NEXT: s_and_b32 s3, s3, 1
+; HAWAII-NEXT: v_mov_b32_e32 v0, s3
+; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
-; HAWAII-NEXT: s_and_b32 s0, s3, 1
-; HAWAII-NEXT: v_mov_b32_e32 v3, s0
-; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
; HAWAII-NEXT: s_endpgm
;
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s2
+; FIJI-NEXT: s_and_b32 s3, s3, 1
+; FIJI-NEXT: v_mov_b32_e32 v0, s3
+; FIJI-NEXT: ds_write_b8 v2, v0 offset:8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
-; FIJI-NEXT: s_and_b32 s0, s3, 1
-; FIJI-NEXT: v_mov_b32_e32 v3, s0
-; FIJI-NEXT: ds_write_b8 v2, v3 offset:8
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
; FIJI-NEXT: s_endpgm
;
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_and_b32 s3, s3, 1
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_and_b32 s0, s3, 1
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
-; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1
; CIVI-NEXT: ds_write_b16 v0, v1
-; CIVI-NEXT: ds_write_b8 v0, v2 offset:2
+; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1
+; CIVI-NEXT: ds_write_b8 v0, v1 offset:2
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0x3df
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 1
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s8, 1.0
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 1.0
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: global_load_dword v1, v[4:5], off
; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: flat_load_dword v4, v[4:5]
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_e32 v0, v2, v4
-; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_sub_u16_e32 v0, v4, v2
+; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s9
+; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; GCN-NEXT: s_load_dword s3, s[0:1], 0xe
; GCN-NEXT: s_mov_b32 s5, 0xff000000
; GCN-NEXT: s_mov_b32 s4, 0xffff
-; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, s4
+; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s2, s2, s5
; GCN-NEXT: s_and_b32 s3, s3, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s2
-; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24
+; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0
; GCN-NEXT: s_load_dword s6, s[0:1], 0xb
; GCN-NEXT: s_load_dword s7, s[0:1], 0xc
; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24
; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
-; GCN-NEXT: v_rcp_f32_e32 v0, v2
+; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2
+; GCN-NEXT: v_rcp_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s7, s7, s4
; GCN-NEXT: s_and_b32 s6, s6, s5
; GCN-NEXT: s_sub_u32 s8, 0, s2
-; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
-; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
+; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: s_subb_u32 s9, 0, s3
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT: v_mul_hi_u32 v4, s8, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s9, v0
+; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
+; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s8, v0
+; GCN-NEXT: v_mul_lo_u32 v4, s8, v1
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT: v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
-; GCN-NEXT: v_mul_hi_u32 v7, v0, v4
+; GCN-NEXT: v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
+; GCN-NEXT: v_mul_hi_u32 v7, v1, v4
; GCN-NEXT: v_mul_hi_u32 v10, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, v2, v3
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3
+; GCN-NEXT: v_add_i32_e64 v1, s[2:3], v1, v3
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc
; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
; GCN-NEXT: v_mul_lo_u32 v5, s8, v3
-; GCN-NEXT: v_mul_hi_u32 v6, s8, v0
-; GCN-NEXT: v_mul_lo_u32 v7, s9, v0
+; GCN-NEXT: v_mul_hi_u32 v6, s8, v1
+; GCN-NEXT: v_mul_lo_u32 v7, s9, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT: v_mul_lo_u32 v6, s8, v0
+; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT: v_mul_lo_u32 v11, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v13, v0, v5
-; GCN-NEXT: v_mul_hi_u32 v12, v0, v6
+; GCN-NEXT: v_mul_lo_u32 v11, v1, v5
+; GCN-NEXT: v_mul_hi_u32 v13, v1, v5
+; GCN-NEXT: v_mul_hi_u32 v12, v1, v6
; GCN-NEXT: v_mul_hi_u32 v10, v3, v6
; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
; GCN-NEXT: v_mul_hi_u32 v7, v3, v5
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_hi_u32 v5, v3, v0
+; GCN-NEXT: v_mul_hi_u32 v5, v3, v1
; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
; GCN-NEXT: v_mul_hi_u32 v6, v3, v2
-; GCN-NEXT: v_mul_hi_u32 v0, 0, v0
+; GCN-NEXT: v_mul_hi_u32 v1, 0, v1
; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
-; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v4, v1, v2
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v0
-; GCN-NEXT: v_mul_lo_u32 v6, v1, v0
+; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
+; GCN-NEXT: v_mul_hi_u32 v5, v0, v1
+; GCN-NEXT: v_mul_lo_u32 v6, v0, v1
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0
; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
-; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v1
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v1
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT: v_cndmask_b32_e64 v1, -1, v1, s[0:1]
-; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
+; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v5, s[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xd
+; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_mov_b32 s4, 0xffff
-; GCN-IR-NEXT: s_mov_b32 s6, 0xff000000
+; GCN-IR-NEXT: s_mov_b32 s7, 0xff000000
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_and_b32 s3, s3, s4
-; GCN-IR-NEXT: s_and_b32 s2, s2, s6
+; GCN-IR-NEXT: s_and_b32 s2, s2, s7
; GCN-IR-NEXT: s_and_b32 s5, s5, s4
-; GCN-IR-NEXT: s_and_b32 s4, s7, s6
+; GCN-IR-NEXT: s_and_b32 s4, s6, s7
; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4
+; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
%ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
-; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2
-; VI-NEXT: v_or_b32_e32 v2, 4, v2
+; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0
+; VI-NEXT: v_or_b32_e32 v2, 4, v0
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
;
; GFX10-LABEL: xor3_uniform_vgpr:
; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
-; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
-; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
+; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
%a1 = fadd float %a, 1.0
%b2 = fadd float %b, 2.0