// Match the immediate offset first, which canonically is moved as low as
// possible.
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue LHS = Addr.getOperand(0);
- SDValue RHS = Addr.getOperand(1);
+ SDValue LHS, RHS;
+ if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
}
// Match the variable offset.
- if (Addr.getOpcode() != ISD::ADD)
- return false;
+ if (Addr.getOpcode() != ISD::ADD) {
+ if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+ isa<ConstantSDNode>(Addr))
+ return false;
+
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ SAddr = Addr;
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
- SDValue LHS = Addr.getOperand(0);
- SDValue RHS = Addr.getOperand(1);
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
if (!LHS->isDivergent()) {
// add (i64 sgpr), (zero_extend (i32 vgpr))
return None;
// Match the variable offset.
- if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
- return None;
+ if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
+ // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
+ // drop this.
+ if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
+ AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
+ return None;
+
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ const Register SAddr = AddrDef->Reg;
+ if (!isSGPR(SAddr))
+ return None;
+
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
+ VOffset)
+ .addImm(0);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
+ }};
+ }
// Look through the SGPR->VGPR copy.
- Register PtrBaseSrc =
+ Register SAddr =
getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
- if (!PtrBaseSrc)
- return None;
-
- const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI);
- if (BaseRB->getID() != AMDGPU::SGPRRegBankID)
+ if (!SAddr || !isSGPR(SAddr))
return None;
- Register SAddr = PtrBaseSrc;
Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
// It's possible voffset is an SGPR here, but the copy to VGPR will be
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 9
-; GPRIDX-NEXT: workitem_vgpr_count = 4
+; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
; GPRIDX-NEXT: s_cmp_eq_u32 s8, 4
; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s6
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s7
-; GPRIDX-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT: v_mov_b32_e32 v2, 0
+; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v5f64_s_s:
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 6
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
; GPRIDX-NEXT: ; %bb.0: ; %entry
; GPRIDX-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GPRIDX-NEXT: s_load_dword s2, s[4:5], 0x8
+; GPRIDX-NEXT: v_mov_b32_e32 v1, 0
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1
; GPRIDX-NEXT: s_cselect_b32 s3, 2.0, 1.0
; GPRIDX-NEXT: s_cmp_eq_u32 s2, 2
; GPRIDX-NEXT: s_cselect_b32 s3, 0x40400000, s3
; GPRIDX-NEXT: s_cmp_eq_u32 s2, 3
; GPRIDX-NEXT: s_cselect_b32 s2, 4.0, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: global_store_dword v[0:1], v2, off
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
+; GPRIDX-NEXT: global_store_dword v1, v0, s[0:1]
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v4f32_s_s_s:
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 7
-; GPRIDX-NEXT: workitem_vgpr_count = 4
+; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
; GPRIDX-NEXT: s_load_dword s6, s[4:5], 0x8
; GPRIDX-NEXT: s_mov_b32 s0, 0
; GPRIDX-NEXT: s_mov_b32 s1, 0x40080000
+; GPRIDX-NEXT: v_mov_b32_e32 v2, 0
; GPRIDX-NEXT: s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1
; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2
; GPRIDX-NEXT: s_cselect_b64 s[0:1], 4.0, s[0:1]
; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
-; GPRIDX-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GPRIDX-NEXT: s_endpgm
;
; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:
define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 {
; GCN-LABEL: v_insert_v64i32_varidx:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
+; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: v_mov_b32_e32 v16, 0x100
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_add_u32_e32 v31, 64, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0
+; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40
-; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80
+; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80
+; GCN-NEXT: v_add_u32_e32 v31, 64, v16
; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16
-; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: v_mov_b32_e32 v4, s16
-; GCN-NEXT: v_mov_b32_e32 v5, s17
-; GCN-NEXT: v_mov_b32_e32 v6, s18
-; GCN-NEXT: v_mov_b32_e32 v7, s19
-; GCN-NEXT: v_mov_b32_e32 v8, s20
-; GCN-NEXT: v_mov_b32_e32 v9, s21
-; GCN-NEXT: v_mov_b32_e32 v10, s22
-; GCN-NEXT: v_mov_b32_e32 v11, s23
-; GCN-NEXT: v_mov_b32_e32 v12, s24
-; GCN-NEXT: v_mov_b32_e32 v13, s25
-; GCN-NEXT: v_mov_b32_e32 v14, s26
-; GCN-NEXT: v_mov_b32_e32 v15, s27
-; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0
+; GCN-NEXT: v_mov_b32_e32 v0, s36
+; GCN-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NEXT: v_mov_b32_e32 v2, s38
+; GCN-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NEXT: v_mov_b32_e32 v4, s40
+; GCN-NEXT: v_mov_b32_e32 v5, s41
+; GCN-NEXT: v_mov_b32_e32 v6, s42
+; GCN-NEXT: v_mov_b32_e32 v7, s43
+; GCN-NEXT: v_mov_b32_e32 v8, s44
+; GCN-NEXT: v_mov_b32_e32 v9, s45
+; GCN-NEXT: v_mov_b32_e32 v10, s46
+; GCN-NEXT: v_mov_b32_e32 v11, s47
+; GCN-NEXT: v_mov_b32_e32 v12, s48
+; GCN-NEXT: v_mov_b32_e32 v13, s49
+; GCN-NEXT: v_mov_b32_e32 v14, s50
+; GCN-NEXT: v_mov_b32_e32 v15, s51
+; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT: v_add_u32_e32 v0, 4, v16
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s53
; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen
+; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16
; GCN-NEXT: v_mov_b32_e32 v1, s54
; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s4, 0x50
; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s55
; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v35, s4, v16
+; GCN-NEXT: v_add_u32_e32 v35, 0x50, v16
; GCN-NEXT: v_mov_b32_e32 v1, s56
; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16
; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16
; GCN-NEXT: v_mov_b32_e32 v1, s58
; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s5, 0x60
; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s59
; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v39, s5, v16
+; GCN-NEXT: v_add_u32_e32 v39, 0x60, v16
; GCN-NEXT: v_mov_b32_e32 v1, s60
; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16
; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16
; GCN-NEXT: v_mov_b32_e32 v1, s62
; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s10, 0x70
; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s63
; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v43, s10, v16
+; GCN-NEXT: v_add_u32_e32 v43, 0x70, v16
; GCN-NEXT: v_mov_b32_e32 v1, s64
; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16
; GCN-NEXT: v_mov_b32_e32 v1, s67
; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s36
+; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s37
+; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s38
+; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s11, 0x90
; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s39
+; GCN-NEXT: v_mov_b32_e32 v1, s15
; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v51, s11, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s40
+; GCN-NEXT: v_add_u32_e32 v51, 0x90, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s41
+; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s42
+; GCN-NEXT: v_mov_b32_e32 v1, s18
; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s28, 0xa0
; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s43
+; GCN-NEXT: v_mov_b32_e32 v1, s19
; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v55, s28, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s44
+; GCN-NEXT: v_add_u32_e32 v55, 0xa0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s20
; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s45
+; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s46
+; GCN-NEXT: v_mov_b32_e32 v1, s22
; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s29, 0xb0
; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s47
+; GCN-NEXT: v_mov_b32_e32 v1, s23
; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v59, s29, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s48
+; GCN-NEXT: v_add_u32_e32 v59, 0xb0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s49
+; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s50
+; GCN-NEXT: v_mov_b32_e32 v1, s26
; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s51
+; GCN-NEXT: v_mov_b32_e32 v1, s27
; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s36
; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s38
; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s12, 0xd0
; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NEXT: v_mov_b32_e32 v1, s39
; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v67, s12, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: v_add_u32_e32 v67, 0xd0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s40
; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: v_mov_b32_e32 v1, s41
; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s18
+; GCN-NEXT: v_mov_b32_e32 v1, s42
; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s13, 0xe0
; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s19
+; GCN-NEXT: v_mov_b32_e32 v1, s43
; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v71, s13, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s20
+; GCN-NEXT: v_add_u32_e32 v71, 0xe0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s44
; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v1, s45
; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s22
+; GCN-NEXT: v_mov_b32_e32 v1, s46
; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen
-; GCN-NEXT: s_movk_i32 s14, 0xf0
; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s23
+; GCN-NEXT: v_mov_b32_e32 v1, s47
; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen
-; GCN-NEXT: v_add_u32_e32 v75, s14, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s24
+; GCN-NEXT: v_add_u32_e32 v75, 0xf0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, s48
; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s25
-; GCN-NEXT: s_and_b32 s7, s7, 63
+; GCN-NEXT: v_mov_b32_e32 v1, s49
+; GCN-NEXT: s_and_b32 s5, s5, 63
; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s26
+; GCN-NEXT: v_mov_b32_e32 v1, s50
; GCN-NEXT: v_add_u32_e32 v17, 8, v16
; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16
-; GCN-NEXT: v_mov_b32_e32 v1, s27
-; GCN-NEXT: s_lshl_b32 s7, s7, 2
+; GCN-NEXT: v_mov_b32_e32 v1, s51
+; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v18, 12, v16
; GCN-NEXT: v_add_u32_e32 v19, 16, v16
; GCN-NEXT: v_add_u32_e32 v29, 56, v16
; GCN-NEXT: v_add_u32_e32 v30, 60, v16
; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_add_u32_e32 v1, s7, v16
+; GCN-NEXT: v_add_u32_e32 v1, s5, v16
+; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256
-; GCN-NEXT: v_mov_b32_e32 v65, s9
-; GCN-NEXT: s_add_u32 s6, s8, 16
-; GCN-NEXT: v_mov_b32_e32 v64, s8
-; GCN-NEXT: s_addc_u32 s7, s9, 0
+; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_add_u32 s6, s8, 32
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
-; GCN-NEXT: s_addc_u32 s7, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_add_u32 s6, s8, 48
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
-; GCN-NEXT: s_addc_u32 s7, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_add_u32 s6, s8, 64
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
-; GCN-NEXT: s_addc_u32 s7, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_add_u32 s6, s8, s4
-; GCN-NEXT: s_addc_u32 s7, s9, 0
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_add_u32 s4, s8, s5
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s10
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, 0x80
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s11
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s28
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s29
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, 0xc0
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s12
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s13
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: s_add_u32 s4, s8, s14
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
-; GCN-NEXT: s_addc_u32 s5, s9, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off
+; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[8:9]
+; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[8:9] offset:16
+; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[8:9] offset:32
+; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[8:9] offset:48
+; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[8:9] offset:64
+; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[8:9] offset:80
+; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[8:9] offset:96
+; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[8:9] offset:112
+; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[8:9] offset:128
+; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[8:9] offset:144
+; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[8:9] offset:160
+; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[8:9] offset:176
+; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[8:9] offset:192
+; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[8:9] offset:208
+; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[8:9] offset:224
+; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[8:9] offset:240
; GCN-NEXT: s_endpgm
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_cselect_b32 s7, s16, s15
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v4, 16
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_s:
; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_s:
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 16
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_s:
; GFX9-NEXT: v_mov_b32_e32 v6, s22
; GFX9-NEXT: v_mov_b32_e32 v7, s23
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 16
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_s_v:
; GFX9-NEXT: v_mov_b32_e32 v6, s18
; GFX9-NEXT: v_mov_b32_e32 v7, s19
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 16
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dwordx4 v0, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_s_v16i16_v_v:
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT: v_mov_b32_e32 v10, 16
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_s_v:
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11]
; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT: v_mov_b32_e32 v10, 16
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_s:
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX9-NEXT: v_mov_b32_e32 v10, 16
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
+; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX8-LABEL: insertelement_v_v16i16_v_v:
define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i8_s_s:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_short v[0:1], v2, off
define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_s_v2i8_v_s:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: global_load_ushort v1, v[1:2], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i8_s_v:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: global_load_ushort v1, v[1:2], off
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_ushort v2, v2, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_short v[0:1], v2, off
define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) {
; GFX9-LABEL: insertelement_s_v2i8_v_v:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_load_ushort v2, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_load_ushort v2, v2, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
- ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
- ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr
; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX10: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
- ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
- ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s32) = COPY $vgpr3
; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3
; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
- ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]]
- ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
- ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]]
; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095
; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3
; GFX10: $vcc_hi = IMPLICIT_DEF
; WAVE32-LABEL: name: copy
; WAVE32: $vcc_hi = IMPLICIT_DEF
; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3
- ; WAVE32: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
; WAVE32: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; WAVE32: GLOBAL_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+ ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
%0:sgpr(p1) = COPY $sgpr2_sgpr3
%1:vgpr(p1) = COPY %0
%2:vgpr(s32) = G_IMPLICIT_DEF
; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec
- ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
- ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %15, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
; CHECK: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
; CHECK: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec
; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec
- ; CHECK: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
- ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY5]], %16, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+ ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
; CHECK: S_ENDPGM 0
%2:sgpr(p4) = COPY $sgpr0_sgpr1
%7:sgpr(s64) = G_CONSTANT i64 36
; GFX9-LABEL: name: load_global_s32_from_sgpr
; GFX9: liveins: $sgpr0_sgpr1
; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
- ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
- ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
; GFX10-LABEL: name: load_global_s32_from_sgpr
; GFX10: liveins: $sgpr0_sgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]]
- ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
- ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
%0:sgpr(p1) = COPY $sgpr0_sgpr1
%1:vgpr(p1) = COPY %0
%2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1)
; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
; GFX9: liveins: $sgpr0_sgpr1
; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2049
- ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
- ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
- ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
- ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049
; GFX10: liveins: $sgpr0_sgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX9-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
; GFX9: liveins: $sgpr0_sgpr1
; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294965247
- ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
- ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
- ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX9: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
- ; GFX9: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
- ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc
- ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
- ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]]
- ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
- ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]]
; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049
; GFX10: liveins: $sgpr0_sgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
$vgpr0 = COPY %4
...
+
+---
+name: load_global_s32_from_copy_undef_sgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: load_global_s32_from_copy_undef_sgpr
+ ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_copy_undef_sgpr
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:sgpr(p1) = G_IMPLICIT_DEF
+ %1:vgpr(p1) = COPY %0
+ %2:vgpr(s32) = G_LOAD %1 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %2
+
+...
+
+---
+name: load_global_s32_from_undef_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: load_global_s32_from_undef_vgpr
+ ; GFX9: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ ; GFX10-LABEL: name: load_global_s32_from_undef_vgpr
+ ; GFX10: $vcc_hi = IMPLICIT_DEF
+ ; GFX10: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+ ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]]
+ %0:vgpr(p1) = G_IMPLICIT_DEF
+ %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1)
+ $vgpr0 = COPY %1
+
+...
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0
store i32 %result, i32 addrspace(1)* %out
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
; GFX9-LABEL: global_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
; GFX9-LABEL: global_atomic_inc_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 16
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc v2, v[0:1], v2, off glc
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
; GFX9-LABEL: global_atomic_inc_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] glc
; GFX9-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 16
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
+; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[0:1] offset:16 glc
; GFX9-NEXT: s_endpgm
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
;
; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32:
; GFX9: ; %bb.0:
+; GFX9-NEXT: v_add_u32_e32 v1, 2, v0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 9
-; GFX9-NEXT: ds_inc_rtn_u32 v3, v0, v1 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v2, 9
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v2 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: global_store_dword v2, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
; GFX9-LABEL: global_atomic_inc_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
; GFX9-LABEL: global_atomic_inc_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 32
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
; GFX9-LABEL: global_atomic_inc_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 32
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
+; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_endpgm
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
;
; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_add_u32_e32 v4, 2, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_add_u32_e32 v2, 2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:16
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v3, v[0:1] offset:16
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dword v[2:3], v4, off
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dword v3, v2, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0
-; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NEXT: global_store_dword v[0:1], v4, off
+; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[2:3], v5, off
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
%result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s4
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s5, v0, v1
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32:
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s4
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s5, v0, v1
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x94
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, 1.0, s4, v0
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, 1.0, v0
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c
; GFX10_W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
; GFX10_W64-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c
; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s3
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, 1.0
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32-NEXT: v_mov_b32_e32 v3, s7
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
-; GFX10_W32-NEXT: v_mov_b32_e32 v3, s1
-; GFX10_W32-NEXT: v_mov_b32_e32 v2, s0
-; GFX10_W32-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX10_W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f64:
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5
; GFX10_W64-NEXT: v_mov_b32_e32 v3, s7
; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
-; GFX10_W64-NEXT: v_mov_b32_e32 v3, s1
-; GFX10_W64-NEXT: v_mov_b32_e32 v2, s0
-; GFX10_W64-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
store double %result, double addrspace(1)* %out, align 8
; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0
; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6
; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2
; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2
; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, s4, v0, v1
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10_W64-NEXT: s_endpgm
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
store float %result, float addrspace(1)* %out, align 4
; GFX10_W32: ; %bb.0:
; GFX10_W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT: s_load_dword s2, s[0:1], 0x54
+; GFX10_W32-NEXT: s_load_dword s0, s[0:1], 0x54
; GFX10_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10_W32-NEXT: ; implicit-def: $vcc_hi
; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W32-NEXT: global_load_dword v2, v1, s[6:7]
; GFX10_W32-NEXT: global_load_dword v3, v1, s[6:7] offset:4
; GFX10_W32-NEXT: global_load_dword v1, v1, s[6:7] offset:8
-; GFX10_W32-NEXT: s_add_u32 s0, s4, 8
-; GFX10_W32-NEXT: s_addc_u32 s1, s5, 0
-; GFX10_W32-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10_W32-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10_W32-NEXT: s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s2, 0, s2
-; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s2
+; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10_W32-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10_W32-NEXT: s_and_b32 s0, 1, s0
+; GFX10_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, v2, v3, v1
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[4:5] offset:8
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
; GFX10_W64: ; %bb.0:
; GFX10_W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT: s_load_dword s2, s[0:1], 0x54
+; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x54
; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10_W64-NEXT: s_clause 0x2
; GFX10_W64-NEXT: global_load_dword v2, v1, s[6:7]
; GFX10_W64-NEXT: global_load_dword v3, v1, s[6:7] offset:4
; GFX10_W64-NEXT: global_load_dword v1, v1, s[6:7] offset:8
-; GFX10_W64-NEXT: s_add_u32 s0, s4, 8
-; GFX10_W64-NEXT: s_addc_u32 s1, s5, 0
-; GFX10_W64-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10_W64-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10_W64-NEXT: s_and_b32 s2, 1, s2
-; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2
-; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[2:3]
+; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10_W64-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10_W64-NEXT: s_and_b32 s0, 1, s0
+; GFX10_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX10_W64-NEXT: s_and_b64 vcc, vcc, s[0:1]
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, v2, v3, v1
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8
; GFX10_W64-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10_W32-NEXT: s_and_b32 s0, 1, s4
; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT: s_add_u32 s0, s2, 8
-; GFX10_W32-NEXT: s_addc_u32 s1, s3, 0
; GFX10_W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT: v_div_fmas_f32 v2, v1, v2, v3
-; GFX10_W32-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W32-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W32-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] offset:8
; GFX10_W32-NEXT: s_endpgm
;
; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT: s_add_u32 s0, s2, 8
-; GFX10_W64-NEXT: s_addc_u32 s1, s3, 0
; GFX10_W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT: v_div_fmas_f32 v2, v1, v2, v3
-; GFX10_W64-NEXT: v_mov_b32_e32 v0, s0
-; GFX10_W64-NEXT: v_mov_b32_e32 v1, s1
-; GFX10_W64-NEXT: global_store_dword v[0:1], v2, off
+; GFX10_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3
+; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0
+; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8
; GFX10_W64-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v1, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s0, s0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s0, v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, s3, s3, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s3, s3, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, s2, s3, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s2, s3, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
%result0 = extractvalue { float, i1 } %result, 0
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
%result0 = extractvalue { double, i1 } %result, 0
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
%result0 = extractvalue { double, i1 } %result, 0
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, 1.0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, 2.0, 2.0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_div_scale_f32 v2, s2, s0, s0, 0x41000000
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s2, v0, v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_div_scale_f32 v2, s2, s0, s0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
%result0 = extractvalue { float, i1 } %result, 0
;
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_mov_b32 s3, 0x40200000
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
%result0 = extractvalue { double, i1 } %result, 0
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0)
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1
; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe da
-; GFX9-NEXT: v_mov_b32_e32 v5, s10
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[5:6], v4, off
+; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1
; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe da
-; GFX9-NEXT: v_mov_b32_e32 v5, s10
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[5:6], v4, off
+; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 tfe lwe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8
; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe
-; GFX9-NEXT: v_mov_b32_e32 v5, s10
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[5:6], v4, off
+; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8
; GFX9-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm a16 tfe lwe
-; GFX9-NEXT: v_mov_b32_e32 v5, s10
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[5:6], v4, off
+; GFX9-NEXT: global_store_dword v5, v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
-; GFX10-NEXT: v_mov_b32_e32 v5, s10
+; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
-; GFX10-NEXT: v_mov_b32_e32 v6, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[5:6], v4, off
+; GFX10-NEXT: global_store_dword v5, v4, s[10:11]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; encoding: [0x80,0x02,0x02,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11]
-; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; encoding: [0x00,0x80,0x70,0xdc,0x01,0x00,0x00,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0
store i32 %tmp0, i32 addrspace(1)* %out
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; encoding: [0x01,0x02,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x7d,0x00]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, i64 addrspace(1)* %out
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s3
-; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)
store i32 %tmp0, i32 addrspace(1)* %out
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)* inreg %ptr) {
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
;
; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-NOUNALIGNED: ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 1
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 3
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 5
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 7
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v15, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v14, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v17, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v18, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v19, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[14:15], off
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s1
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[6:7], off
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:1
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:2
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:3
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:4
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:5
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:6
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v0, s[0:1] offset:11
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff
; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, s1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v18
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, s0, v19
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v16, s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v7
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v11, v5
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v8, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v7
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v3, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v4, v5
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v12, v5, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v12
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v9, v12, v10
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v3
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v6
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v9
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
;
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)* inreg %ptr) {
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v0, s[0:1]
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
;
; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-NOUNALIGNED: ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 2
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 4
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 6
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 8
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 10
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:2
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:4
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v0, s[0:1] offset:10
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
-; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v6
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v6, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v5, v6
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v6, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v2
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v6
+; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: s_cbranch_scc0 BB1_2
; GFX9-NEXT: ; %bb.1: ; %bb1
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, gv2@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, gv2@gotpcrel32@hi+12
; GFX9-NEXT: s_getpc_b64 s[2:3]
-; GFX9-NEXT: s_add_u32 s2, s2, gv2@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s3, s3, gv2@gotpcrel32@hi+12
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+12
+; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v0, v0, s[4:5]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: BB1_2: ; %Flow
; GFX9-NEXT: s_xor_b32 s0, s0, -1
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: BB1_4: ; %bb2
; GFX9-NEXT: s_endpgm
entry:
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv2@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv2@rel32@hi+12
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv3@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv3@rel32@hi+12
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: BB2_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+12
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v0, v0, s[6:7]
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv1@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv1@rel32@hi+12
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: BB2_4: ; %bb2
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GCN-NEXT: s_load_dword s4, s[4:5], 0x10
; GCN-NEXT: s_add_u32 s5, s32, 0x1000
+; GCN-NEXT: s_add_u32 s8, s5, 4
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NEXT: s_add_u32 s8, s5, 4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, 1
-; GCN-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v3, s8
; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v2, v1, v0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: v_add_u32_e32 v0, v2, v0
+; GCN-NEXT: global_store_dword v1, v0, s[6:7]
; GCN-NEXT: BB0_3: ; %bb.2
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: v_mov_b32_e32 v2, s5
; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, 1
-; GCN-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v3, s8
; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; GCN-NEXT: v_mov_b32_e32 v1, s4
-; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_u32_e32 v2, v1, v0
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: v_add_u32_e32 v0, v2, v0
+; GCN-NEXT: global_store_dword v1, v0, s[6:7]
; GCN-NEXT: BB1_2: ; %bb.1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_store_dword v[0:1], v0, off
; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
-; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
-; HSA: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
+
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
%ld = load volatile i32, i32 addrspace(1)* %stof
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
-; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
-; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
+; CI: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
+
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GFX9: global_store_dword [[ZERO]], [[ZERO]], s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]$}}
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
%ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
; ELF: }
; GFX10-W32: NumSGPRsForWavesPerEU: 4
-; GFX10-W32: NumVGPRsForWavesPerEU: 3
+; GFX10-W32: NumVGPRsForWavesPerEU: 1
; GFX10-W64: NumSGPRsForWavesPerEU: 2
-; GFX10-W64: NumVGPRsForWavesPerEU: 3
+; GFX10-W64: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @simple(i32 addrspace(1)* %out) {
entry:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
-; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NEXT: v_mov_b32_e32 v1, s35
-; GCN-NEXT: global_store_dword v[0:1], v40, off
+; GCN-NEXT: global_store_dword v40, v40, s[34:35]
; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr
; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
+; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v1, s34
-; GCN-NEXT: v_mov_b32_e32 v2, s35
-; GCN-NEXT: global_store_dword v[1:2], v0, off
+; GCN-NEXT: global_store_dword v40, v0, s[34:35]
; GCN-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
store i32 %rv, i32 addrspace(1)* %ptr
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
; GCN: ; use s12
define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.x()
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
; GCN: ; use s13
define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.y()
}
; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; CIVI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0, off
; GCN: ; use s14
define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.z()
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GCN-NEXT: s_cbranch_execz BB0_2
; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, 2
-; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off offset:28 glc
+; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
; GCN-NEXT: BB0_2: ; %endif
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz BB0_2
; GCN-NEXT: ; %bb.1: ; %if
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, 2.0
-; GCN-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:28
+; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:28
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: BB0_2: ; %endif
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
; GCN: s_and_saveexec_b64
; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
; OPT-GFX9: load i8, i8 addrspace(1)* %sunkaddr
; GCN-LABEL: {{^}}test_sink_global_small_min_scratch_global_offset:
-; GFX9: global_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:-4096{{$}}
+; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_load_sbyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:-4096{{$}}
define amdgpu_kernel void @test_sink_global_small_min_scratch_global_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: s_add_u32 s0, s0, s9
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v0, s4
-; GFX900-NEXT: v_mov_b32_e32 v1, s5
-; GFX900-NEXT: global_load_ushort v2, v[0:1], off
+; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
-; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6
-; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:4
-; GFX900-NEXT: v_mov_b32_e32 v0, s6
-; GFX900-NEXT: v_mov_b32_e32 v1, s7
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
+; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8
-; GFX900-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
-; GFX900-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6
+; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
+; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
; GFX900-NEXT: s_waitcnt vmcnt(1)
-; GFX900-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v3, v4
-; GFX900-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8
-; GFX900-NEXT: v_lshl_or_b32 v2, v4, 16, v2
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
+; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX900-NEXT: s_endpgm
;
; FLATSCR-LABEL: vload2_private:
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off
+; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4
-; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2
+; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4
+; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6
-; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4
+; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6
+; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s2
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8
+; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT: scratch_load_ushort v2, off, vcc_hi offset:4
+; FLATSCR-NEXT: scratch_load_ushort v0, off, vcc_hi offset:4
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
-; FLATSCR-NEXT: scratch_load_ushort v4, off, vcc_hi offset:6
+; FLATSCR-NEXT: scratch_load_ushort v3, off, vcc_hi offset:6
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; FLATSCR-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; FLATSCR-NEXT: v_and_b32_e32 v0, 0xffff, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v3, v4
-; FLATSCR-NEXT: scratch_load_short_d16_hi v3, off, vcc_hi offset:8
-; FLATSCR-NEXT: v_lshl_or_b32 v2, v4, 16, v2
+; FLATSCR-NEXT: v_mov_b32_e32 v1, v3
+; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, vcc_hi offset:8
+; FLATSCR-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; FLATSCR-NEXT: s_endpgm
entry:
%loc = alloca [3 x i16], align 2, addrspace(5)
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
-; GCN: store_dword v[{{[0-9:]+}}], [[VSEL]],
+; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
-; GCN: v_cndmask_b32_e64 v2, 2, 9,
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
%sel = select i1 %cond, i16 -4, i16 3
%bo = sub i16 5, %sel
;
; GFX9-LABEL: load_constant_adjacent_offsets:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
;
; GFX9-LABEL: load_constant_disjoint_offsets:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:2
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
;
; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
-; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v4 offset1:1
+; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-ALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
; GFX9-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_add_f32_e32 v2, v0, v9
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v9
+; GFX9-NEXT: global_store_dword v10, v0, s[0:1]
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
%y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%load = load i64, i64 addrspace(3)* %in, align 4
store i64 %load, i64 addrspace(1)* %out, align 8
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, s0, v1
; GFX9-NEXT: v_add_u32_e32 v3, s1, v0
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
-; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:40
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX9-NEXT: global_store_dword v8, v0, s[4:5] offset:40
; GFX9-NEXT: s_endpgm
float addrspace(1)* nocapture %arg,
[4 x [4 x float]] addrspace(3)* %arg1,
; GFX9-NEXT: s_mov_b32 s36, s0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s3
; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
-; GFX9-NEXT: v_lshl_add_u32 v40, v0, 2, s2
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: ds_read_b32 v41, v40
+; GFX9-NEXT: ds_read_b32 v42, v41
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: ds_read_b32 v0, v40 offset:4
+; GFX9-NEXT: ds_read_b32 v0, v41 offset:4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v41, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s34
-; GFX9-NEXT: v_mov_b32_e32 v1, s35
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v0, v42, v0
+; GFX9-NEXT: global_store_dword v40, v0, s[34:35]
; GFX9-NEXT: s_endpgm
%x = call i32 @llvm.amdgcn.workitem.id.x()
%arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
;
; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-ALIGNED: ; %bb.0: ; %entry
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v0 offset:65
-; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v0 offset:66
-; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v0 offset:67
-; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v0 offset:68
-; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v0 offset:69
-; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v0 offset:70
-; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v0 offset:71
-; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v0 offset:72
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
+; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
+; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
+; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
+; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:69
+; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:70
+; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:71
+; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:72
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v6
; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v0, v1
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v3
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-ALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v8
+; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5
+; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1
+; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
%load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff
; GCN: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_RESULT]]
define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, half addrspace(1)* %out
; GFX89-NOT: and
; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
; GFX9-LABEL: global_store_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v2, 1
-; GFX9-NEXT: v_mov_b32_e32 v3, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_short v[0:1], v2, off
-; GFX9-NEXT: global_store_short v[0:1], v3, off offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
; GFX9-NEXT: s_endpgm
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
store i16 1, i16 addrspace(1)* %r, align 2
; GFX9-LABEL: global_store_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
store i16 1, i16 addrspace(1)* %r, align 1
; GFX9-LABEL: global_store_2xi16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x20001
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
store i16 1, i16 addrspace(1)* %r, align 4
; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half undef)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
%val = bitcast i16 %val.arg to half
%canonicalized = call half @llvm.canonicalize.f16(half %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
%val = load half, half addrspace(1)* %out
%val.fabs = call half @llvm.fabs.f16(half %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}}
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16:
; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}}
; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 {
%val = load half, half addrspace(1)* %out
%val.fneg = fneg half %val
; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}|
; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0.0)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half -0.0)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 1.0)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half -1.0)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 16.0)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
store half %canonicalized, half addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
%canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
store half %canonicalized, half addrspace(1)* %out
; VI-NOT: v_and_b32
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{.+}}, [[REG]], s
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
; VI-NOT: 0xffff
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
; VI-NOT: v_and_b32
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}}
-; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[REG]], s
define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
%val = bitcast i32 %val.arg to <2 x half>
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
-; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GFX89: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
; GFX9: v_max_f32_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
%val = load float, float addrspace(1)* %out
%canonicalized = call float @llvm.canonicalize.f32(float %val)
; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
%val = load float, float addrspace(1)* %out
%val.fabs = call float @llvm.fabs.f32(float %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
; GFX678: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}|
; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
%val = load float, float addrspace(1)* %out
%val.fabs = call float @llvm.fabs.f32(float %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
; GFX678: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}}
; GFX9: v_max_f32_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
%val = load float, float addrspace(1)* %out
%val.fneg = fneg float %val
; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float undef)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 0.0)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float -0.0)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 1.0)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float -1.0)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 16.0)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[REG]]
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
store float %canonicalized, float addrspace(1)* %out
; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
%canonicalized = call double @llvm.canonicalize.f64(double %val)
; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double %val)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
%val.fabs = call double @llvm.fabs.f64(double %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}|
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
%val.fabs = call double @llvm.fabs.f64(double %val)
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[REG]]
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
%val = load double, double addrspace(1)* %out
%val.fneg = fneg double %val
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 0.0)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double -0.0)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 1.0)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double -1.0)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 16.0)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f64:
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f64:
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f64:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
store double %canonicalized, double addrspace(1)* %out
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f64:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
-; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
%canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
store double %canonicalized, double addrspace(1)* %out
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s
; GCN-LABEL: {{^}}div_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
}
; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
}
; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
+; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
-; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
+; GCN-FLUSH: global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float %num, %load, !fpmath !0
; GCN-LABEL: {{^}}div_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv fast float 1.000000e+00, %load, !fpmath !0
; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv fast float -1.000000e+00, %load, !fpmath !0
; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load, !fpmath !0
; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
-; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
+; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load, !fpmath !0
; GFX89-NOT: _and
; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}|
; GFX89-NOT: [[MUL]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
+; GFX89: {{flat|global}}_store_short v{{.+}}, [[MUL]]
define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
; GFX9: v_mov_b32_e32 [[V_ABS:v[0-9]+]], [[ABS]]
; GFX9: s_xor_b32 [[NEG:s[0-9]+]], [[ABS]], 0x80008000
; GFX9-DAG: v_mov_b32_e32 [[V_NEG:v[0-9]+]], [[NEG]]
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_ABS]]
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_NEG]]
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, [[V_ABS]], s{{\[[0-9]+:[0-9]+\]}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[V_NEG]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]],
; GCN: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
; GCN: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V_XOR]]
define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
-; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_alignbit_b32 v2, s1, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 25
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 25
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_not_b32 s1, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_lshr_b32 s7, s5, 1
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; GFX9-NEXT: v_alignbit_b32 v1, s7, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s5, s5, 1
+; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23
-; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 25
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_not_b32 s3, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_lshr_b32 s11, s7, 1
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1
-; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_lshr_b32 s7, s7, 1
+; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s13
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32:
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_alignbit_b32 v2, s4, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, 7
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_i32_imm:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v2i32:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9
-; GFX9-NEXT: v_alignbit_b32 v0, s4, v2, 7
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v4, s12
-; GFX9-NEXT: v_mov_b32_e32 v5, s13
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, s0
+; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32:
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[4:5], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: v_mov_b32_e32 v5, s5
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v4, s[4:5] offset:16
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[4:5], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v12, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v13, s5
-; GFX9-NEXT: v_mov_b32_e32 v12, s4
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[12:13], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v12, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v13, s5
-; GFX10-NEXT: v_mov_b32_e32 v12, s4
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v12, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v12, s[4:5] offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v12, s[4:5] offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v12, s[4:5] offset:48
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
-; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[12:13], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, s5
-; GFX9-NEXT: v_mov_b32_e32 v28, s4
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v28, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v29, s5
-; GFX10-NEXT: v_mov_b32_e32 v28, s4
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
-; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v28, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v29, s5
-; GFX9-NEXT: v_mov_b32_e32 v28, s4
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v28, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: global_load_dword v32, v[0:1], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v29, s5
-; GFX10-NEXT: v_mov_b32_e32 v28, s4
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48
+; GFX10-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64
+; GFX10-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80
+; GFX10-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96
+; GFX10-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
-; GFX10-NEXT: global_load_dword v32, v[0:1], off
-; GFX10-NEXT: s_clause 0x7
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
-; GFX10-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
-; GFX10-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
-; GFX10-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
-; GFX10-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
-; GFX9-NEXT: global_load_ubyte v0, v[2:3], off
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:4
+; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5]
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_ubyte v0, v1, s[4:5]
+; GFX10-NEXT: global_load_dword v1, v1, s[4:5] offset:4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ubyte v0, v[1:2], off
-; GFX10-NEXT: global_load_dword v1, v[1:2], off offset:4
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5]
+; GFX9-NEXT: s_add_u32 s32, s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5]
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; GFX900-NEXT: s_cbranch_execnz [[LOOP]]
; GFX908-NOT: v_add_f32
-; GFX908: global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off
+; GFX908: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s
; GFX908-NOT: s_cbranch_execnz
define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
; No vgpr offset, constants
; --------------------------------------------------------------------------------
+; SGPR base only
+; The loaded address is the inreg pointer argument itself: there is no
+; variable index and no constant offset. The expected code materializes a
+; single 32-bit zero in v0 for the vaddr operand and passes the base as the
+; s[2:3] saddr operand, rather than copying the 64-bit base into a VGPR pair.
+define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
+; GCN-LABEL: global_load_saddr_i8_offset_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ ; Load one byte directly through the base pointer.
+ %load = load i8, i8 addrspace(1)* %sbase
+ ; Zero-extend to i32, then reinterpret the bits as float for the return value.
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
; SGPR base with maximum gfx9 immediate offset
define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4095:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2048:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2049:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2049
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2050:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2050
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2050
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
ret <2 x half> %cast
}
+; --------------------------------------------------------------------------------
+; or-with-constant as add
+; --------------------------------------------------------------------------------
+
+; Check or-used-as-add with a split 64-bit or.
+; The address is (zext i32 %idx to i64) | 16, converted to a global pointer.
+; The expected code performs the or on the low 32 bits only (v_or_b32), sets
+; the high half to zero, and loads through the v[0:1] pair with "off" and no
+; immediate offset field.
+; NOTE(review): %sbase is declared but never used in the body - confirm
+; whether the test was meant to involve the SGPR base.
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_or_b32_e32 v0, 16, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_load_ubyte v0, v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ ; Build the 64-bit address from the 32-bit index or'ed with the constant 16.
+ %zext.idx = zext i32 %idx to i64
+ %or = or i64 %zext.idx, 16
+ %addr = inttoptr i64 %or to i8 addrspace(1)*
+ %load = load i8, i8 addrspace(1)* %addr
+ ; Zero-extend to i32, then reinterpret the bits as float for the return value.
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
+; Same pattern as the or-with-16 case, but with the constant 4160 (0x1040).
+; The expected code again performs the or on the low 32 bits only, with the
+; constant as a 32-bit literal, sets the high half to zero, and loads through
+; the v[0:1] pair with "off" and no immediate offset field.
+; NOTE(review): %sbase is declared but never used in the body - confirm
+; whether the test was meant to involve the SGPR base.
+define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
+; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_load_ubyte v0, v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ ; Build the 64-bit address from the 32-bit index or'ed with the constant 4160.
+ %zext.idx = zext i32 %idx to i64
+ %or = or i64 %zext.idx, 4160
+ %addr = inttoptr i64 %or to i8 addrspace(1)*
+ %load = load i8, i8 addrspace(1)* %addr
+ ; Zero-extend to i32, then reinterpret the bits as float for the return value.
+ %zext = zext i8 %load to i32
+ %to.vgpr = bitcast i32 %zext to float
+ ret float %to.vgpr
+}
+
!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1
; GCN-LABEL: {{^}}atomic_add_i32_offset:
; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
}
; GCN-LABEL: {{^}}atomic_add_i32_max_neg_offset:
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:-4096{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
define amdgpu_kernel void @atomic_add_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024
; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_store_dword [[RET]]
; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
; GCN-LABEL: {{^}}atomic_add_i32:
; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_add v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+; GFX9: global_atomic_add [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_and_i32_offset:
; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_and [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_and_i32:
; SIVI: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_and v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_and v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_sub_i32_offset:
; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_sub v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_sub_i32:
; SIVI: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_sub v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_sub v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_sub [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_max_i32_offset:
; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_max_i32:
; SIVI: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_smax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_smax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_umax_i32_offset:
; SIVI: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_umax_i32:
; SIVI: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_umax v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_umax [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_min_i32_offset:
; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_min_i32:
; SIVI: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_smin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_smin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_umin_i32_offset:
; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_umin_i32:
; SIVI: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_umin v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_umin [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_or_i32_offset:
; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_or_i32:
; SIVI: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_or v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_or [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xchg_f32_offset:
; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_xchg_f32_offset(float addrspace(1)* %out, float %in) {
entry:
%gep = getelementptr float, float addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xchg_i32:
; SIVI: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_swap v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_swap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
; SIVI: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:16{{$}}
+; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword v[[RET]]
-; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
; SIVI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
; SIVI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword v[[RET]]
-; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}}
+; GFX9: global_atomic_cmpswap [[RET:v[0-9]+]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
entry:
%val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
; GCN-LABEL: {{^}}atomic_xor_i32_offset:
; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-; GFX9: global_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_xor v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off offset:16 glc{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xor_i32:
; SIVI: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: global_atomic_xor v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
; SIVI: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, off glc{{$}}
+; GFX9: global_atomic_xor [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
entry:
%val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
; VI-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, -1
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-512 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-512 glc{{$}}
define amdgpu_kernel void @atomic_load_i32_negoffset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %in, i64 -128
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:16 glc{{$}}
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}}
define amdgpu_kernel void @atomic_load_f32_offset(float addrspace(1)* %in, float addrspace(1)* %out) {
entry:
%gep = getelementptr float, float addrspace(1)* %in, i64 4
; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; SIVI: buffer_store_dword [[RET]]
-; GFX9: global_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], off glc
+; GFX9: global_load_dword [[RET:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc
define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
; GCN-LABEL: {{^}}atomic_store_i32_offset:
; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off offset:16{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_store_i32:
; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
; GCN-LABEL: {{^}}atomic_store_f32:
; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}, off{{$}}
+; GFX9: global_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_store_f32(float %in, float addrspace(1)* %out) {
entry:
store atomic float %in, float addrspace(1)* %out seq_cst, align 4
; GCN-LABEL: {{^}}atomic_add_i64_offset:
; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}
+; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}}
+; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_add_i64:
; SIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_add_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_and_i64_offset:
; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_and_i64:
; CIVI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_and_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_sub_i64_offset:
; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_sub_i64:
; CIVI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_sub_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_max_i64_offset:
; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_max_i64:
; CIVI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_smax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_umax_i64_offset:
; CIVI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_umax_i64:
; CIVI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_umax_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_min_i64_offset:
; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_min_i64:
; CIVI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_smin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_umin_i64_offset:
; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_umin_i64:
; CIVI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_umin_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_or_i64_offset:
; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_or_i64:
; CIVI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_or_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xchg_f64_offset:
; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_xchg_f64_offset(double addrspace(1)* %out, double %in) {
entry:
%gep = getelementptr double, double addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xchg_i64:
; CIVI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_swap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_xor_i64_offset:
; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_xor_i64:
; CIVI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_atomic_xor_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}{{$}}
define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
; CIVI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off glc{{$}}
+; GFX9: global_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]:
-; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_cmpswap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
; CIVI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
; CIVI: buffer_store_dwordx2 v{{\[}}[[RET]]:
-; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off glc{{$}}
+; GFX9: global_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+:[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
entry:
%val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32 glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:-32 glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-32 glc{{$}}
define amdgpu_kernel void @atomic_load_i64_neg_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %in, i64 -4
; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
; CIVI: buffer_store_dwordx2 [[RET]]
-; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off glc{{$}}
+; GFX9: global_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
entry:
%val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
; GCN-LABEL: {{^}}atomic_store_i64_offset:
; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX9: global_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}
+; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
; GCN-LABEL: {{^}}atomic_store_i64:
; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-; GFX9: global_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off{{$}}
+; GFX9: global_store_dwordx2 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]\]}}, s[{{[0-9]+}}:{{[0-9]+}}]{{$}}
define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
entry:
store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
; WAVE64: .sgpr_count: 8
; WAVE32: .sgpr_count: 10
; CHECK: .symbol: test.kd
-; CHECK: .vgpr_count: 6
+; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
; WAVE32: .wavefront_size: 32
define amdgpu_kernel void @test(
; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 38
; GFX803: .sgpr_spill_count: 22
-; GFX900: .sgpr_spill_count: 22
-; GFX1010: .sgpr_spill_count: 22
+; GFX900: .sgpr_spill_count: 48
+; GFX1010: .sgpr_spill_count: 48
; CHECK: .symbol: num_spilled_sgprs.kd
define amdgpu_kernel void @num_spilled_sgprs(
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
; CHECK: .name: num_spilled_vgprs
; CHECK: .symbol: num_spilled_vgprs.kd
-; CHECK: .vgpr_spill_count: 14
+; CHECK: .vgpr_spill_count: {{13|14}}
define amdgpu_kernel void @num_spilled_vgprs() #1 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
-; CHECK: NumVGPRs: 6
+; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 1024
define amdgpu_kernel void @test(
half addrspace(1)* %r,
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 8
-; CHECK: NumVGPRs: 6
+; CHECK: NumVGPRs: {{3|6}}
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test_max_flat_workgroup_size(
half addrspace(1)* %r,
; CHECK: CodeProps:
; GFX700: NumSpilledSGPRs: 38
; GFX803: NumSpilledSGPRs: 22
-; GFX900: NumSpilledSGPRs: 22
+; GFX900: NumSpilledSGPRs: {{22|48}}
define amdgpu_kernel void @num_spilled_sgprs(
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
; CHECK-LABEL: - Name: num_spilled_vgprs
; CHECK: SymbolName: 'num_spilled_vgprs@kd'
; CHECK: CodeProps:
-; CHECK: NumSpilledVGPRs: 14
+; CHECK: NumSpilledVGPRs: {{13|14}}
define amdgpu_kernel void @num_spilled_vgprs() #1 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
; Make sure we generate flat store for HSA
; PRE-GFX10: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-; GFX10: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
; HSA: .Lfunc_end0:
; HSA: .size simple, .Lfunc_end0-simple
; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: BB0_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_lo_u32 v3, s5, v0
-; GFX9-NEXT: v_mul_hi_u32 v4, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT: v_not_b32_e32 v6, v3
-; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6
-; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT: v_add_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_add_u32_e32 v5, s4, v6
+; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0
+; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT: v_not_b32_e32 v5, v2
+; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT: v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_add_u32_e32 v4, s4, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: s_add_u32 s4, s4, 1
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: BB1_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_lo_u32 v3, s5, v0
-; GFX9-NEXT: v_mul_hi_u32 v4, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
-; GFX9-NEXT: v_not_b32_e32 v3, v3
-; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3
-; GFX9-NEXT: v_add_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
+; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0
+; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT: v_not_b32_e32 v2, v2
+; GFX9-NEXT: v_mul_lo_u32 v2, s2, v2
; GFX9-NEXT: v_add_u32_e32 v3, s4, v3
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT: v_add_u32_e32 v2, s4, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: s_add_u32 s4, s4, 1
+; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: BB2_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3
-; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_add_u32_e32 v4, 1, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2
; GFX9-NEXT: s_add_i32 s4, s4, 1
-; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v4
+; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
-; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3
-; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB2_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: BB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_mul_hi_u32 v3, s3, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
-; GFX9-NEXT: v_sub_u32_e32 v3, s3, v3
+; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: s_add_i32 s3, s3, 1
-; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
-; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB3_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v1, s2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MulMul:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, s2, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s5, s6
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s4, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MixedTypedMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_alt_AddOperands:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MixedExt:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0
; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, s2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, s4, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_SameVec:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, s2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, s4, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s3
; GFX10-DL-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_v4i16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_v4i16_Hi:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8
; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_v4i16_Even:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
; GFX9-DL-NEXT: s_and_b32 s2, s2, s8
; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-DL-NEXT: s_and_b32 s4, s4, s8
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notudot2_v4i16_Even:
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s7
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0
; GFX10-DL-NEXT: s_and_b32 s1, s2, s7
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_v4i16_Middle:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notudot2_v4i16_Middle:
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s1, v0
; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 16
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_DiffIndex:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notudot2_DiffIndex:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s2
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s3, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v2, v1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_add1:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s6
; GFX10-DL-NEXT: s_and_b32 s1, s1, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_add1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v3, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v3, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_and_b32 s0, s0, s6
; GFX10-DL-NEXT: s_and_b32 s1, s1, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
+; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8
+; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notsdot2_sext8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
-; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8
+; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-DL-NEXT: v_mov_b32_e32 v3, s7
-; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-DL-NEXT: global_load_ushort v1, v[2:3], off
+; GFX10-DL-NEXT: s_clause 0x1
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, v0
-; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v1
-; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, v2
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX10-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, s2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, v1, v0, v2
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v3
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<2 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3
; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
-; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010
-; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24
+; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
+; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010
+; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
+; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v3, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_i32_i8 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3
; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010
-; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v3, v2
; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_multiuse_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3
; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2
; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
+; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24
; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3
; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24
; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s2, v3, v4
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v1, v2, v3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3
+; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
+; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24
; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3
; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24
; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010
; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8
-; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT: v_mad_i32_i24 v3, s2, v3, v4
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, v1, v2, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24
; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, s2, s3, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
+; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v4, 8, s5
; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
-; GFX9-NODL-NEXT: v_and_b32_e32 v5, s5, v4
+; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s4
+; GFX9-NODL-NEXT: v_and_b32_e32 v6, s5, v5
; GFX9-NODL-NEXT: s_bfe_i32 s4, s4, 0x80000
-; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4
-; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
-; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
+; GFX9-NODL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
+; GFX9-NODL-NEXT: v_and_b32_e32 v6, s4, v5
+; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
+; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s3
; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
-; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4
+; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s2
+; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, s3, v5
; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
-; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4
-; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX9-NODL-NEXT: v_and_b32_e32 v4, s2, v5
+; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4
-; GFX9-NODL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16
-; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
+; GFX9-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s5
; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s5, v4
+; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4
+; GFX9-DL-NEXT: v_and_b32_e32 v6, s5, v5
; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80000
-; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4
-; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
-; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
+; GFX9-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
+; GFX9-DL-NEXT: v_and_b32_e32 v6, s4, v5
+; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
+; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3
; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000
-; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4
+; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s2
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s3, v5
; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4
-; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v5
+; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4
-; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s0
; GFX10-DL-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000
-; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v3
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1
-; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v2
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s1
+; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v2
; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s2
-; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s2
+; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x80000
-; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
+; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x80000
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s0
-; GFX10-DL-NEXT: v_and_b32_e32 v7, s2, v3
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s1, v3
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7
-; GFX10-DL-NEXT: v_lshl_or_b32 v3, v8, 16, v3
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s0
+; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s1, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT: v_lshl_or_b32 v4, v5, 16, v6
+; GFX10-DL-NEXT: v_lshl_or_b32 v2, v7, 16, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s4, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s5, s4, s2
+; GFX9-NODL-NEXT: s_and_b32 s2, s3, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s4, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s3, s3, 0x80008
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
+; GFX9-DL-NEXT: s_and_b32 s5, s4, s2
+; GFX9-DL-NEXT: s_and_b32 s2, s3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x80008
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_movk_i32 s1, 0xff
; GFX10-DL-NEXT: s_and_b32 s1, s0, s1
; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x80008
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s3, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, s1, s0, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
+; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008
+; GFX9-DL-NEXT: s_and_b32 s5, s3, s2
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010
+; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010
+; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v3, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s4, 0xff
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_movk_i32 s6, 0xff
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT: s_and_b32 s2, s0, s4
-; GFX10-DL-NEXT: s_and_b32 s3, s1, s4
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1
+; GFX10-DL-NEXT: s_and_b32 s2, s0, s6
+; GFX10-DL-NEXT: s_and_b32 s3, s1, s6
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v3, v2
; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v2, v1
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_multiuse_mul1:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_and_b32 s5, s3, s2
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10
; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v3, v2
; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_multiuse_mul1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v1, v2
; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
-; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NODL-NEXT: v_add_u32_e32 v2, s10, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v3, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v3, v1
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s3, v3, v1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_multiuse_add1:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v1, v2
; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010
-; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-DL-NEXT: v_add_u32_e32 v2, s10, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v3, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v3, v1
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v3, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_multiuse_add1:
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_movk_i32 s7, 0xff
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
+; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
+; GFX9-NODL-NEXT: s_bfe_u32 s8, s2, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v3, v1
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notdot4_mixedtypes:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008
-; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008
-; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
+; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010
+; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010
-; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x80010
+; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v3, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v4, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notdot4_mixedtypes:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_sext_i32_i8 s2, s0
; GFX10-DL-NEXT: s_sext_i32_i8 s3, s1
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24
; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24
; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010
-; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3
+; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3
; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010
-; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s4
; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v3, s3, v3, v4
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v1, v2, v3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_vecMul:
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24
; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24
; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
-; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3
+; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3
; GFX9-DL-NEXT: s_and_b32 s3, s3, s2
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010
-; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4
+; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s4
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT: v_mad_u32_u24 v3, s3, v3, v4
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16
; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24
-; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24
-; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3
-; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4
-; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3
-; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s6, 16, v4
+; GFX9-NODL-NEXT: v_lshl_or_b32 v5, s4, 16, v5
+; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
+; GFX9-NODL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
+; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v2
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24
-; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24
-; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3
-; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3
-; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_and_b32_sdwa v4, v3, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_lshl_or_b32 v4, s6, 16, v4
+; GFX9-DL-NEXT: v_lshl_or_b32 v5, s4, 16, v5
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
+; GFX9-DL-NEXT: v_and_b32_sdwa v5, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
-; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off
+; GFX9-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
+; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX9-DL-NEXT: global_load_ushort v2, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0
-; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1
+; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_lshr_b32 s2, s1, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 16
-; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
+; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v6
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
+; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6
-; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT: v_lshl_or_b32 v4, s1, 16, v5
+; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_ubyte v4, v0, s[0:1]
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s2, v1
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
-; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s4, v3
+; GFX9-NODL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NODL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NODL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NODL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ubyte v4, v0, s[0:1]
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s2, v1
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
-; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, s5, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0
-; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s0
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s1
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s0, s1
-; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v4
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008
-; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c
-; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v1, s11
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
-; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
-; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s12, v2, v1
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mad_i32_i24 v1, s14, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
-; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s16, v2, v1
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000
-; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004
-; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: s_lshr_b32 s2, s0, 12
-; GFX9-NEXT: s_lshr_b32 s3, s1, 12
-; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004
-; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s3
-; GFX9-NEXT: v_mul_i32_i24_e32 v3, s8, v3
-; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010
+; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000
+; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004
+; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: s_lshr_b32 s4, s2, 12
+; GFX9-NEXT: s_lshr_b32 s5, s3, 12
+; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s5
+; GFX9-NEXT: v_mul_i32_i24_e32 v2, s10, v2
+; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014
-; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018
-; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40018
-; GFX9-NEXT: s_ashr_i32 s1, s1, 28
-; GFX9-NEXT: v_mov_b32_e32 v10, s15
-; GFX9-NEXT: s_ashr_i32 s0, s0, 28
+; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s15
+; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
+; GFX9-NEXT: s_ashr_i32 s3, s3, 28
+; GFX9-NEXT: v_mov_b32_e32 v9, s17
+; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s4, v6, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s6, v7, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s10, v8, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s12, v9, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s14, v10, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_mad_i32_i24 v1, s6, v5, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s8, v6, v1
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s12, v7, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s14, v8, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s16, v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000
-; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12
-; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12
-; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s3
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s8, v3
-; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12
+; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12
+; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s10, v2
+; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40018
-; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v10, s15
-; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
+; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15
+; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17
+; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v6, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v7, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v8, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v9, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v10, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v5, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v6, v1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v7, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v8, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v9, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s14, -1
; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
-; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3
-; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
+; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
-; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: s_movk_i32 s2, 0xff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s3, s1, 12
-; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
-; GFX9-NEXT: s_lshr_b32 s4, s2, 12
-; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
-; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008
-; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s3
-; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004
-; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s8
+; GFX9-NEXT: s_lshr_b32 s5, s3, 12
+; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40000
+; GFX9-NEXT: s_lshr_b32 s6, s4, 12
+; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40004
+; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40008
+; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40000
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s5
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s6
+; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40004
+; GFX9-NEXT: s_bfe_i32 s11, s3, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
-; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v9, s14
-; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018
-; GFX9-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-NEXT: v_mov_b32_e32 v10, s16
-; GFX9-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-NEXT: v_mul_i32_i24_e32 v2, s11, v2
+; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014
+; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s14
+; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018
+; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s16
+; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
+; GFX9-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-NEXT: v_mov_b32_e32 v9, s18
+; GFX9-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_mad_i32_i24 v1, s7, v5, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s9, v6, v1
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s13, v7, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s15, v8, v1
+; GFX9-NEXT: v_mad_i32_i24 v1, s17, v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s22, -1
-; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12
-; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
-; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12
-; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
+; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 12
+; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40000
+; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 12
+; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40008
+; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40000
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s5
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s6
+; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
-; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
-; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14
-; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018
-; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16
-; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, s11, v2
+; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14
+; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16
+; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s18
+; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v5, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s9, v6, v1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, v3, v4, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s13, v7, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s15, v8, v1
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s17, v9, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s3, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s14, -1
; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12
-; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3
-; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s3
+; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX10-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v4, s9, s10
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s8, s2, v1
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, v2, v3, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v2
; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008
-; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mad_i32_i24 v1, s6, v3, v1
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c
-; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s8, v3, v1
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010
-; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s10, v3, v1
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018
-; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s12, v3, v1
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
-; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_mad_i32_i24 v1, s14, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
-; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0
+; GFX9-NEXT: v_mad_i32_i24 v1, s16, v3, v1
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0
-; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mad_i32_i24 v1, s2, v3, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v2
; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s8, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s10, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13
; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s12, v3, v1
; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15
; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s14, v3, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s16, v3, v1
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mad_i32_i24 v1, s2, v3, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_mad_i32_i24 v1, s0, s1, v1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s2, 28
; GFX9-NEXT: s_ashr_i32 s11, s3, 28
; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40000
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mad_i32_i24 v0, s5, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-NEXT: v_mad_i32_i24 v2, s4, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_i32_i24 v1, s2, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_mad_i32_i24 v1, s10, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s16
+; GFX9-NEXT: v_mad_i32_i24 v1, s9, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mad_i32_i24 v1, s8, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-NEXT: v_mad_i32_i24 v1, s7, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mad_i32_i24 v1, s6, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: v_mad_i32_i24 v1, s5, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_i32_i24 v1, s4, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32_vecMul:
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot8_i32_i4 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_and_b32 s11, s2, 15
; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-NEXT: s_lshr_b32 s12, s6, 28
; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c
; GFX9-NEXT: s_and_b32 s17, s6, 15
; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
-; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6
; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6
; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-NEXT: global_load_ushort v5, v0, s[0:1]
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16
; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14
; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6
-; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12
-; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7
+; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v6, v4, v6
-; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v5, v1, v5
+; GFX9-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc16_vecMul:
; GFX9-DL-NEXT: s_and_b32 s11, s2, 15
; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28
; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c
; GFX9-DL-NEXT: s_and_b32 s17, s6, 15
; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
-; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14
-; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1]
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6
-; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12
-; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6
-; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v5, v1, v5
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v4
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s14, -1
; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014
-; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c
-; GFX10-DL-NEXT: s_and_b32 s8, s0, 15
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010
+; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x4000c
+; GFX10-DL-NEXT: s_and_b32 s10, s0, 15
; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x40004
-; GFX10-DL-NEXT: s_and_b32 s9, s1, 15
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0
-; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004
+; GFX10-DL-NEXT: s_and_b32 s11, s1, 15
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
+; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40004
+; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s10
+; GFX10-DL-NEXT: s_bfe_u32 s11, s1, 0x4000c
; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s8
-; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c
-; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1]
; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40008
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s8, s9
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s11
; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010
+; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s9
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0
; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0
-; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40018
+; GFX10-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 28
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s2, s3
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s0
-; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s10, s0
+; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
-; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v4
; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
-; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s7, s1, 4
-; GFX9-NEXT: s_lshr_b32 s14, s2, 4
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-NEXT: s_lshr_b32 s8, s1, 12
-; GFX9-NEXT: s_lshr_b32 s9, s1, 8
-; GFX9-NEXT: s_lshr_b32 s15, s2, 12
-; GFX9-NEXT: s_lshr_b32 s16, s2, 8
-; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9
-; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8
-; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16
-; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15
+; GFX9-NEXT: s_lshr_b32 s9, s3, 4
+; GFX9-NEXT: s_lshr_b32 s16, s4, 4
+; GFX9-NEXT: v_lshlrev_b16_e64 v2, 12, s3
+; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9
+; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-NEXT: s_lshr_b32 s10, s3, 12
+; GFX9-NEXT: s_lshr_b32 s11, s3, 8
+; GFX9-NEXT: s_lshr_b32 s17, s4, 12
+; GFX9-NEXT: s_lshr_b32 s18, s4, 8
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10
+; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s18
+; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17
+; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2
; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
+; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4
-; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_lshr_b32 s3, s1, 20
-; GFX9-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-NEXT: s_lshr_b32 s10, s2, 20
-; GFX9-NEXT: s_lshr_b32 s11, s2, 16
+; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3
; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3
-; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11
-; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-NEXT: s_lshr_b32 s5, s1, 28
-; GFX9-NEXT: s_lshr_b32 s6, s1, 24
-; GFX9-NEXT: s_lshr_b32 s12, s2, 28
-; GFX9-NEXT: s_lshr_b32 s13, s2, 24
-; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6
-; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5
-; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13
-; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12
-; GFX9-NEXT: v_or_b32_e32 v5, v3, v5
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_lshr_b32 s5, s3, 20
+; GFX9-NEXT: s_lshr_b32 s6, s3, 16
+; GFX9-NEXT: s_lshr_b32 s12, s4, 20
+; GFX9-NEXT: s_lshr_b32 s13, s4, 16
+; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v4, v4, v11
+; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6
+; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5
+; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12
+; GFX9-NEXT: s_lshr_b32 s7, s3, 28
+; GFX9-NEXT: s_lshr_b32 s8, s3, 24
+; GFX9-NEXT: s_lshr_b32 s14, s4, 28
+; GFX9-NEXT: s_lshr_b32 s15, s4, 24
+; GFX9-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8
+; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7
+; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15
+; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14
+; GFX9-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
+; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v18
+; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v10, v10, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v5
-; GFX9-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15
-; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-NEXT: v_or_b32_e32 v6, v4, v8
+; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v4
+; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: v_or_b32_e32 v5, v3, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v7
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v6
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v5
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot8_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s22, -1
-; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4
-; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 12
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 8
-; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 12
-; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 8
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15
+; GFX9-DL-NEXT: s_lshr_b32 s9, s3, 4
+; GFX9-DL-NEXT: s_lshr_b32 s16, s4, 4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s3
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16
+; GFX9-DL-NEXT: s_lshr_b32 s10, s3, 12
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 8
+; GFX9-DL-NEXT: s_lshr_b32 s17, s4, 12
+; GFX9-DL-NEXT: s_lshr_b32 s18, s4, 8
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v13, 12, v13
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 20
-; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 16
-; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 20
-; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 16
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v3
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 28
-; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24
-; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28
-; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 24
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12
-; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5
+; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 20
+; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16
+; GFX9-DL-NEXT: s_lshr_b32 s12, s4, 20
+; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 16
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, v4, v11
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12
+; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 28
+; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 24
+; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28
+; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 24
+; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX9-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14
+; GFX9-DL-NEXT: v_or_b32_e32 v4, v2, v4
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v18
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v14
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v15
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v16
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v10, v10, v17
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v5
-; GFX9-DL-NEXT: v_or_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15
-; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
-; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v16
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v4
+; GFX9-DL-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14
+; GFX9-DL-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v7
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v6
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v5
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s22, -1
; GFX10-DL-NEXT: s_mov_b32 s23, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s20, s20, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4
-; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s14
-; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12
-; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
-; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8
-; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9
+; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4
+; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
+; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 12
+; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 12
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
+; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 8
+; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s18
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v12
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v13
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v2, v3
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v6
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v19, v13
; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20
-; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28
-; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24
-; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20
-; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4
-; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16
-; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8
+; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 16
+; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 28
+; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 24
+; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 20
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s12
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v11
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 16
+; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 28
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s13
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v7
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v8
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13
-; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10
-; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v9, v7
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v15
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v10
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v12
+; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 24
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v15
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15
+; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v8, v6
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v10
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v14
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v6, v12
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v8
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9
-; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v4
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v5, v11
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v7
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v8
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v4
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004
; GFX9-NEXT: s_and_b32 s2, s2, 15
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s16
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc32:
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-NEXT: s_and_b32 s1, s1, 15
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT: s_and_b32 s0, s0, 15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008
+; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004
+; GFX9-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-NEXT: s_and_b32 s3, s3, 15
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT: s_and_b32 s2, s2, 15
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT: s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT: s_and_b32 s0, s0, 15
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT: s_and_b32 s3, s3, 15
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s2, s2, 15
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s6, -1
-; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s4, s4, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008
-; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-NEXT: s_and_b32 s1, s1, 15
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-NEXT: s_and_b32 s0, s0, 15
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s3, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40008
+; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40004
+; GFX9-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-NEXT: s_and_b32 s3, s3, 15
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004
+; GFX9-NEXT: s_and_b32 s2, s2, 15
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT: s_and_b32 s1, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
-; GFX9-DL-NEXT: s_and_b32 s0, s0, 15
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s17, s3, 0x40004
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT: s_and_b32 s3, s3, 15
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004
+; GFX9-DL-NEXT: s_and_b32 s2, s2, 15
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc8:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s6, -1
-; GFX10-DL-NEXT: s_mov_b32 s7, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s4, s4, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s10, -1
+; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008
-; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s8, s0, 15
-; GFX9-NEXT: s_and_b32 s15, s1, 15
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT: s_and_b32 s10, s2, 15
+; GFX9-NEXT: s_and_b32 s17, s3, 15
+; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc4:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT: s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT: s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc4:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i4 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s8, s0, 15
-; GFX9-NEXT: s_and_b32 s15, s1, 15
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT: s_and_b32 s10, s2, 15
+; GFX9-NEXT: s_and_b32 s17, s3, 15
+; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT: s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT: s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x4000c
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x4000c
-; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s2, s3
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s2, s3
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i4 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004
; GFX9-NEXT: s_and_b32 s2, s2, 15
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s16
-; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
-; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s14
-; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s13
-; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s11, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s16
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s15
+; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s14
+; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s13
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s12
+; GFX9-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_multiuses_mul1:
; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004
; GFX9-DL-NEXT: s_and_b32 s2, s2, 15
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s16
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s15
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s12
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_multiuses_mul1:
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004
; GFX9-NEXT: s_and_b32 s2, s2, 15
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_u32_u24 v1, s2, v1, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, s17
+; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s16
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc32_vecMul:
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc32_vecMul:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i32 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT: global_load_ushort v5, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010
; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_pk_mul_lo_u16 v1, s3, v1
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008
; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c
; GFX9-NEXT: s_and_b32 s17, s6, 15
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16
-; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0
; GFX9-NEXT: s_and_b32 s11, s2, 15
; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6
-; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ushort v6, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-NEXT: v_pk_mul_lo_u16 v4, s2, v4
+; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v2
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v6, v5, v6
-; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u32_e32 v5, v4, v5
; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v3
+; GFX9-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_add_u32_e32 v3, v3, v2
; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v1
+; GFX9-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc16_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ushort v5, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, s3, v1
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s13, s14
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c
; GFX9-DL-NEXT: s_and_b32 s17, s6, 15
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16
+; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0
; GFX9-DL-NEXT: s_and_b32 s11, s2, 15
; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10
-; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s2, v4
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v2
+; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10
+; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6
-; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_add_u32_e32 v5, v4, v5
; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3
+; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004
+; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40004
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s7
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s6
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s3
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s2, s4
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s6
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
-; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s3, s1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s2, s6
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s0
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i16 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s22, -1
-; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40018
-; GFX9-NEXT: s_lshr_b32 s13, s2, 28
-; GFX9-NEXT: s_and_b32 s14, s2, 15
-; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40004
-; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v5, s12
-; GFX9-NEXT: s_lshr_b32 s6, s1, 28
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: s_and_b32 s7, s1, 15
-; GFX9-NEXT: v_mov_b32_e32 v7, s14
-; GFX9-NEXT: s_bfe_u32 s8, s1, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v8, s15
-; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v9, s16
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5
-; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7
-; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
-; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX9-NEXT: s_bfe_u32 s5, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40010
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018
+; GFX9-NEXT: s_lshr_b32 s15, s4, 28
+; GFX9-NEXT: s_and_b32 s16, s4, 15
+; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004
+; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v3, s13
+; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40018
+; GFX9-NEXT: v_mov_b32_e32 v4, s14
+; GFX9-NEXT: s_lshr_b32 s8, s3, 28
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: s_and_b32 s9, s3, 15
+; GFX9-NEXT: v_mov_b32_e32 v6, s16
+; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v7, s17
+; GFX9-NEXT: s_bfe_u32 s11, s3, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v8, s18
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v9, s4
+; GFX9-NEXT: v_mul_lo_u16_e32 v2, s5, v2
+; GFX9-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v4, s7, v4
+; GFX9-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mul_lo_u16_e32 v6, s9, v6
+; GFX9-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX9-NEXT: v_mul_lo_u16_e32 v8, s11, v8
+; GFX9-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v5
+; GFX9-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v2, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v7
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_add_u32_e32 v1, v4, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc8_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT: s_mov_b32 s22, -1
-; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28
-; GFX9-DL-NEXT: s_and_b32 s14, s2, 15
-; GFX9-DL-NEXT: s_bfe_u32 s15, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12
-; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: s_and_b32 s7, s1, 15
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14
-; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15
-; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
-; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018
+; GFX9-DL-NEXT: s_lshr_b32 s15, s4, 28
+; GFX9-DL-NEXT: s_and_b32 s16, s4, 15
+; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004
+; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s13
+; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x40018
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14
+; GFX9-DL-NEXT: s_lshr_b32 s8, s3, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: s_and_b32 s9, s3, 15
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s16
+; GFX9-DL-NEXT: s_bfe_u32 s10, s3, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s17
+; GFX9-DL-NEXT: s_bfe_u32 s11, s3, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s18
+; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s4
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, s5, v2
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v3, s6, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v4, s7, v4
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, s8, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, s9, v6
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, s10, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX9-DL-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v4, v6, v7
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, s11, v8
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, s3, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT: v_or_b32_sdwa v5, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v5, v4, v5
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v5
+; GFX9-DL-NEXT: v_and_b32_e32 v2, s2, v2
+; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v7
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v4, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT: v_add_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX10-DL-NEXT: s_mov_b32 s10, -1
-; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
-; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX10-DL-NEXT: s_mov_b32 s14, -1
+; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000
+; GFX10-DL-NEXT: s_add_u32 s12, s12, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
+; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40004
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s3, s5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s3, s7
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s2, s3
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s8, s7
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s2, s1, 0x40008
; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s2
-; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s2
+; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014
+; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
-; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s6
-; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018
-; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010
+; GFX10-DL-NEXT: v_and_b32_e32 v2, s3, v2
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8
+; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40018
+; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x40010
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
-; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
-; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 28
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3
+; GFX10-DL-NEXT: s_lshr_b32 s6, s1, 28
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s9
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s6
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40018
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, s5, s0
-; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7
-; GFX10-DL-NEXT: v_and_b32_e32 v5, s3, v5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s0
+; GFX10-DL-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
-; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8
-; GFX10-DL-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v2, v1
+; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v7
+; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v4
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i8 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s18, -1
-; GFX9-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s22, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-NEXT: s_add_u32 s16, s16, s3
-; GFX9-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-NEXT: s_add_u32 s20, s20, s3
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT: s_addc_u32 s21, s21, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s8, s0, 15
-; GFX9-NEXT: s_and_b32 s15, s1, 15
-; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NEXT: s_and_b32 s10, s2, 15
+; GFX9-NEXT: s_and_b32 s17, s3, 15
+; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_acc4_vecMul:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GFX9-DL-NEXT: s_mov_b32 s18, -1
-; GFX9-DL-NEXT: s_mov_b32 s19, 0xe00000
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX9-DL-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX9-DL-NEXT: s_mov_b32 s22, -1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX9-DL-NEXT: s_add_u32 s16, s16, s3
-; GFX9-DL-NEXT: s_addc_u32 s17, s17, 0
+; GFX9-DL-NEXT: global_load_ubyte v1, v0, s[0:1]
+; GFX9-DL-NEXT: s_mov_b32 s23, 0xe00000
+; GFX9-DL-NEXT: s_add_u32 s20, s20, s3
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT: s_addc_u32 s21, s21, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s8, s0, 15
-; GFX9-DL-NEXT: s_and_b32 s15, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14
-; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3
-; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT: s_and_b32 s10, s2, 15
+; GFX9-DL-NEXT: s_and_b32 s17, s3, 15
+; GFX9-DL-NEXT: s_bfe_u32 s16, s3, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-DL-NEXT: s_bfe_u32 s12, s3, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s13, s3, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s14, s3, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s15, s3, 0x40008
+; GFX9-DL-NEXT: s_lshr_b32 s11, s3, 28
+; GFX9-DL-NEXT: s_bfe_u32 s3, s3, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40004
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, s2, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v3, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v4, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v5, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v6, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v7, v1
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v8, v1
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11
+; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1
+; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
-; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5]
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s2, s0, 15
; GFX10-DL-NEXT: s_and_b32 s3, s1, 15
-; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008
-; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c
+; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008
+; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c
-; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s5
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s3, s7
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s6, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
-; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40014
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
+; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1
+; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
i4 addrspace(1)* nocapture %dst) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014
; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: v_mad_u32_u24 v1, s5, v1, v2
; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004
; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008
; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c
; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014
; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018
; GFX9-NEXT: s_lshr_b32 s3, s3, 28
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-NEXT: v_mad_u32_u24 v0, s13, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: v_mad_u32_u24 v1, s13, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s14
+; GFX9-NEXT: v_mad_u32_u24 v1, s15, v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s16
+; GFX9-NEXT: v_mad_u32_u24 v1, s17, v2, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot8_variant1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1
-; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s4, v1, v2
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
-; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
+; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
i32 addrspace(1)* %v2addr,
i32 addrspace(1)* %dst) {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s2, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s0
+; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_lshr_b32 s0, s4, 16
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s0
+; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_lshr_b32 s1, s4, 16
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s1
+; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s0
+; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX9-LABEL: s_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x3e7
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_1_reg:
; GFX9-LABEL: s_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_0:
; GFX9-LABEL: s_insertelement_v2f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x4500
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX9-NEXT: s_andn2_b32 s1, s2, s0
-; GFX9-NEXT: s_and_b32 s0, s0, 0x3e703e7
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: s_lshl_b32 s3, s4, 4
+; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3
+; GFX9-NEXT: s_andn2_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7
+; GFX9-NEXT: s_or_b32 s2, s3, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_dynamic:
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
-; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:2
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:6
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
store volatile i8 %arg0, i8 addrspace(1)* undef
store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
}
; GCN-LABEL: {{^}}small_array_round_down_offset:
-; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
+; HSA-GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:1
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
%val = extractvalue [1 x i8] %arg, 0
store volatile i8 %val, i8 addrspace(1)* undef
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:17
+; HSA-VI: global_load_dword v{{[0-9]+}}, [[ZERO]], s{{\[[0-9]+:[0-9]+\]}} offset:13
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; Byref pointers should only be treated as offsets from kernarg
; GCN-LABEL: {{^}}byref_constant_i8_arg:
; GCN: kernarg_segment_byte_size = 12
-; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5
-; GCN: global_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
%in = load i8, i8 addrspace(4)* %in.byref
%ext = zext i8 %in to i32
; GCN-LABEL: {{^}}byref_constant_i16_arg:
; GCN: kernarg_segment_byte_size = 12
-; GCN: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s4
-; GCN: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], s5
-; GCN: global_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]{{\]}}, off offset:8
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
%in = load i16, i16 addrspace(4)* %in.byref
%ext = zext i16 %in to i32
; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}}
; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]]
; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]]
-; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_IN]]
-; GCN: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[V_AFTER_OFFSET]]
+; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s
+; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
%in = load i32, i32 addrspace(4)* %in.byref
store volatile i32 %in, i32 addrspace(1)* %out, align 4
}
; GCN-LABEL: {{^}}global_atomic_csub:
-; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off glc
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) {
main_body:
%ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data)
}
; GCN-LABEL: {{^}}global_atomic_csub_off4:
-; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 glc
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) {
main_body:
%p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1
}
; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}}
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
}
; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-; GFX9: global_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}global_atomic_dec_noret_i32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
}
; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-; GFX9: global_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}}
+
+; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}}
+
+; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}}
+; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
%result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}}
+; GFX9: global_atomic_dec_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
}
; GCN-LABEL: {{^}}global_atomic_add_f32:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) {
main_body:
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
}
; GCN-LABEL: {{^}}global_atomic_add_f32_off4:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
define amdgpu_kernel void @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) {
main_body:
%p = getelementptr float, float addrspace(1)* %ptr, i64 1
}
; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4
define amdgpu_kernel void @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) {
main_body:
%p = getelementptr float, float addrspace(1)* %ptr, i64 -1
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
%ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1
}
; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GCN: global_atomic_pk_add_f16 v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:-4
+; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}}
define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
main_body:
%p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1
; Make sure this still selects with an artificially incorrect subtarget, but
; with the feature set enabled.
; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
-; GCN: global_atomic_add_f32 v[{{[0-9:]+}}], v{{[0-9]+}}, off
+; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(float addrspace(1)* %ptr, float %data) #0 {
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data)
ret void
; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off glc{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
store i32 %result, i32 addrspace(1)* %out
; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-; GFX9: global_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16 glc{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
; GCN-LABEL: {{^}}global_atomic_inc_noret_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
ret void
; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-; GFX9: global_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]], off offset:16{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off glc{{$}}
+; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
store i64 %result, i64 addrspace(1)* %out
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32 glc{{$}}
+; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off{{$}}
+; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
ret void
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GFX9-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CIVI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off offset:32{{$}}
+; GFX9: global_atomic_inc_x2 v[[ZERO]], v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
%result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
store <2 x half> %result, <2 x half> addrspace(1)* %out
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
store <2 x half> %result, <2 x half> addrspace(1)* %out
;
; GFX6789-LABEL: load_1d_tfe:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v6, 0
; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v7, v6
+; GFX6789-NEXT: v_mov_b32_e32 v8, v6
+; GFX6789-NEXT: v_mov_b32_e32 v9, v6
+; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v0, v6
+; GFX6789-NEXT: v_mov_b32_e32 v1, v7
+; GFX6789-NEXT: v_mov_b32_e32 v2, v8
+; GFX6789-NEXT: v_mov_b32_e32 v3, v9
+; GFX6789-NEXT: v_mov_b32_e32 v4, v10
; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v6, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_tfe:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1d_lwe:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v6, 0
; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v7, v6
+; GFX6789-NEXT: v_mov_b32_e32 v8, v6
+; GFX6789-NEXT: v_mov_b32_e32 v9, v6
+; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v0, v6
+; GFX6789-NEXT: v_mov_b32_e32 v1, v7
+; GFX6789-NEXT: v_mov_b32_e32 v2, v8
+; GFX6789-NEXT: v_mov_b32_e32 v3, v9
+; GFX6789-NEXT: v_mov_b32_e32 v4, v10
; GFX6789-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf unorm lwe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v6, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v0, s[0:7] dmask:0xf unorm lwe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_lwe:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v6, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_2d_tfe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v7, 0
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v8, v7
+; GFX6789-NEXT: v_mov_b32_e32 v9, v7
+; GFX6789-NEXT: v_mov_b32_e32 v10, v7
+; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v0, v7
+; GFX6789-NEXT: v_mov_b32_e32 v1, v8
+; GFX6789-NEXT: v_mov_b32_e32 v2, v9
+; GFX6789-NEXT: v_mov_b32_e32 v3, v10
+; GFX6789-NEXT: v_mov_b32_e32 v4, v11
; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v7, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2d_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_3d_tfe_lwe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v8, 0
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v9, v8
+; GFX6789-NEXT: v_mov_b32_e32 v10, v8
+; GFX6789-NEXT: v_mov_b32_e32 v11, v8
+; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v0, v8
+; GFX6789-NEXT: v_mov_b32_e32 v1, v9
+; GFX6789-NEXT: v_mov_b32_e32 v2, v10
+; GFX6789-NEXT: v_mov_b32_e32 v3, v11
+; GFX6789-NEXT: v_mov_b32_e32 v4, v12
; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v8, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_tfe_lwe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_cube_lwe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v8, 0
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v9, v8
+; GFX6789-NEXT: v_mov_b32_e32 v10, v8
+; GFX6789-NEXT: v_mov_b32_e32 v11, v8
+; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v0, v8
+; GFX6789-NEXT: v_mov_b32_e32 v1, v9
+; GFX6789-NEXT: v_mov_b32_e32 v2, v10
+; GFX6789-NEXT: v_mov_b32_e32 v3, v11
+; GFX6789-NEXT: v_mov_b32_e32 v4, v12
; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v8, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_cube_lwe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1darray_tfe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v7, 0
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v8, v7
+; GFX6789-NEXT: v_mov_b32_e32 v9, v7
+; GFX6789-NEXT: v_mov_b32_e32 v10, v7
+; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v0, v7
+; GFX6789-NEXT: v_mov_b32_e32 v1, v8
+; GFX6789-NEXT: v_mov_b32_e32 v2, v9
+; GFX6789-NEXT: v_mov_b32_e32 v3, v10
+; GFX6789-NEXT: v_mov_b32_e32 v4, v11
; GFX6789-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf unorm tfe da
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v7, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:1], s[0:7] dmask:0xf unorm tfe da
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1darray_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_2darray_lwe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v8, 0
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v9, v8
+; GFX6789-NEXT: v_mov_b32_e32 v10, v8
+; GFX6789-NEXT: v_mov_b32_e32 v11, v8
+; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v0, v8
+; GFX6789-NEXT: v_mov_b32_e32 v1, v9
+; GFX6789-NEXT: v_mov_b32_e32 v2, v10
+; GFX6789-NEXT: v_mov_b32_e32 v3, v11
+; GFX6789-NEXT: v_mov_b32_e32 v4, v12
; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm lwe da
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v8, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm lwe da
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2darray_lwe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_2dmsaa_both:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v8, 0
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v9, v8
+; GFX6789-NEXT: v_mov_b32_e32 v10, v8
+; GFX6789-NEXT: v_mov_b32_e32 v11, v8
+; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v0, v8
+; GFX6789-NEXT: v_mov_b32_e32 v1, v9
+; GFX6789-NEXT: v_mov_b32_e32 v2, v10
+; GFX6789-NEXT: v_mov_b32_e32 v3, v11
+; GFX6789-NEXT: v_mov_b32_e32 v4, v12
; GFX6789-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe lwe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v8, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe lwe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2dmsaa_both:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_2darraymsaa_tfe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v9, 0
; GFX6789-NEXT: v_mov_b32_e32 v8, v3
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v10, v9
+; GFX6789-NEXT: v_mov_b32_e32 v11, v9
+; GFX6789-NEXT: v_mov_b32_e32 v12, v9
+; GFX6789-NEXT: v_mov_b32_e32 v13, v9
+; GFX6789-NEXT: v_mov_b32_e32 v0, v9
+; GFX6789-NEXT: v_mov_b32_e32 v1, v10
+; GFX6789-NEXT: v_mov_b32_e32 v2, v11
+; GFX6789-NEXT: v_mov_b32_e32 v3, v12
+; GFX6789-NEXT: v_mov_b32_e32 v4, v13
; GFX6789-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf unorm tfe da
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v9, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load v[0:4], v[0:3], s[0:7] dmask:0xf unorm tfe da
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_2darraymsaa_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v9 ; encoding: [0x09,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v9 ; encoding: [0x09,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v10 ; encoding: [0x0a,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v11 ; encoding: [0x0b,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v12 ; encoding: [0x0c,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v9, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x09,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_mip_1d_lwe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v7, 0
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v8, v7
+; GFX6789-NEXT: v_mov_b32_e32 v9, v7
+; GFX6789-NEXT: v_mov_b32_e32 v10, v7
+; GFX6789-NEXT: v_mov_b32_e32 v11, v7
+; GFX6789-NEXT: v_mov_b32_e32 v0, v7
+; GFX6789-NEXT: v_mov_b32_e32 v1, v8
+; GFX6789-NEXT: v_mov_b32_e32 v2, v9
+; GFX6789-NEXT: v_mov_b32_e32 v3, v10
+; GFX6789-NEXT: v_mov_b32_e32 v4, v11
; GFX6789-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf unorm lwe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v7, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load_mip v[0:4], v[0:1], s[0:7] dmask:0xf unorm lwe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_1d_lwe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; encoding: [0x80,0x02,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v7 ; encoding: [0x07,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, v7 ; encoding: [0x07,0x03,0x16,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; encoding: [0x07,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; encoding: [0x08,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v10 ; encoding: [0x0a,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v11 ; encoding: [0x0b,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v7, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x07,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_mip_2d_tfe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
+; GFX6789-NEXT: v_mov_b32_e32 v8, 0
; GFX6789-NEXT: v_mov_b32_e32 v7, v2
; GFX6789-NEXT: v_mov_b32_e32 v6, v1
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v0
+; GFX6789-NEXT: v_mov_b32_e32 v9, v8
+; GFX6789-NEXT: v_mov_b32_e32 v10, v8
+; GFX6789-NEXT: v_mov_b32_e32 v11, v8
+; GFX6789-NEXT: v_mov_b32_e32 v12, v8
+; GFX6789-NEXT: v_mov_b32_e32 v0, v8
+; GFX6789-NEXT: v_mov_b32_e32 v1, v9
+; GFX6789-NEXT: v_mov_b32_e32 v2, v10
+; GFX6789-NEXT: v_mov_b32_e32 v3, v11
+; GFX6789-NEXT: v_mov_b32_e32 v4, v12
; GFX6789-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s8
-; GFX6789-NEXT: v_mov_b32_e32 v6, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v8, v4, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: image_load_mip v[0:4], v[0:2], s[0:7] dmask:0xf unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v5, s8
-; NOPRT-NEXT: v_mov_b32_e32 v6, s9
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[5:6], v4, off
+; NOPRT-NEXT: global_store_dword v5, v4, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_mip_2d_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v13, v11 ; encoding: [0x0b,0x03,0x1a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v14, v11 ; encoding: [0x0b,0x03,0x1c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v15, v11 ; encoding: [0x0b,0x03,0x1e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13 ; encoding: [0x0d,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v14 ; encoding: [0x0e,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v15 ; encoding: [0x0f,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v4, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x04,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1d_tfe_V4_dmask3:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v5, 0
; GFX6789-NEXT: v_mov_b32_e32 v4, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
+; GFX6789-NEXT: v_mov_b32_e32 v6, v5
+; GFX6789-NEXT: v_mov_b32_e32 v7, v5
+; GFX6789-NEXT: v_mov_b32_e32 v8, v5
+; GFX6789-NEXT: v_mov_b32_e32 v0, v5
+; GFX6789-NEXT: v_mov_b32_e32 v1, v6
+; GFX6789-NEXT: v_mov_b32_e32 v2, v7
+; GFX6789-NEXT: v_mov_b32_e32 v3, v8
; GFX6789-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v4, s8
-; GFX6789-NEXT: v_mov_b32_e32 v5, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[4:5], v3, off
+; GFX6789-NEXT: global_store_dword v5, v3, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v3, 0
; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v4, s8
-; NOPRT-NEXT: v_mov_b32_e32 v5, s9
+; NOPRT-NEXT: v_mov_b32_e32 v4, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[4:5], v3, off
+; NOPRT-NEXT: global_store_dword v4, v3, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_tfe_V4_dmask3:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v5, s9 ; encoding: [0x09,0x02,0x0a,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; encoding: [0x05,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; encoding: [0x05,0x03,0x0e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; encoding: [0x05,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; encoding: [0x05,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v6 ; encoding: [0x06,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; encoding: [0x07,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v8 ; encoding: [0x08,0x03,0x06,0x7e]
; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v5, v3, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x03,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1d_tfe_V4_dmask2:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v4, 0
; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
+; GFX6789-NEXT: v_mov_b32_e32 v5, v4
+; GFX6789-NEXT: v_mov_b32_e32 v6, v4
+; GFX6789-NEXT: v_mov_b32_e32 v0, v4
+; GFX6789-NEXT: v_mov_b32_e32 v1, v5
+; GFX6789-NEXT: v_mov_b32_e32 v2, v6
; GFX6789-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v3, s8
-; GFX6789-NEXT: v_mov_b32_e32 v4, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[3:4], v2, off
+; GFX6789-NEXT: global_store_dword v4, v2, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v2, 0
; NOPRT-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x6 unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v3, s8
-; NOPRT-NEXT: v_mov_b32_e32 v4, s9
+; NOPRT-NEXT: v_mov_b32_e32 v3, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[3:4], v2, off
+; NOPRT-NEXT: global_store_dword v3, v2, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_tfe_V4_dmask2:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; encoding: [0x80,0x02,0x08,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v5, v4 ; encoding: [0x04,0x03,0x0a,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v6, v4 ; encoding: [0x04,0x03,0x0c,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; encoding: [0x04,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; encoding: [0x05,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v6 ; encoding: [0x06,0x03,0x04,0x7e]
; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v4, v2, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x02,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1d_tfe_V4_dmask1:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v3, 0
; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
+; GFX6789-NEXT: v_mov_b32_e32 v4, v3
+; GFX6789-NEXT: v_mov_b32_e32 v0, v3
+; GFX6789-NEXT: v_mov_b32_e32 v1, v4
; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v2, s8
-; GFX6789-NEXT: v_mov_b32_e32 v3, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[2:3], v1, off
+; GFX6789-NEXT: global_store_dword v3, v1, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v1, 0
; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v2, s8
-; NOPRT-NEXT: v_mov_b32_e32 v3, s9
+; NOPRT-NEXT: v_mov_b32_e32 v2, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[2:3], v1, off
+; NOPRT-NEXT: global_store_dword v2, v1, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_tfe_V4_dmask1:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e]
; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: load_1d_tfe_V2_dmask1:
; GFX6789: ; %bb.0: ; %main_body
+; GFX6789-NEXT: v_mov_b32_e32 v3, 0
; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
+; GFX6789-NEXT: v_mov_b32_e32 v4, v3
+; GFX6789-NEXT: v_mov_b32_e32 v0, v3
+; GFX6789-NEXT: v_mov_b32_e32 v1, v4
; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe
-; GFX6789-NEXT: v_mov_b32_e32 v2, s8
-; GFX6789-NEXT: v_mov_b32_e32 v3, s9
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[2:3], v1, off
+; GFX6789-NEXT: global_store_dword v3, v1, s[8:9]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; NOPRT: ; %bb.0: ; %main_body
; NOPRT-NEXT: v_mov_b32_e32 v1, 0
; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x8 unorm tfe
-; NOPRT-NEXT: v_mov_b32_e32 v2, s8
-; NOPRT-NEXT: v_mov_b32_e32 v3, s9
+; NOPRT-NEXT: v_mov_b32_e32 v2, 0
; NOPRT-NEXT: s_waitcnt vmcnt(0)
-; NOPRT-NEXT: global_store_dword v[2:3], v1, off
+; NOPRT-NEXT: global_store_dword v2, v1, s[8:9]
; NOPRT-NEXT: s_waitcnt vmcnt(0)
; NOPRT-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_1d_tfe_V2_dmask1:
; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; encoding: [0x03,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; encoding: [0x04,0x03,0x02,0x7e]
; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v3, v1, s[8:9] ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x08,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[14:15], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-NEXT: v_mov_b32_e32 v3, v5
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
-; GFX9-NEXT: v_mov_b32_e32 v0, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: global_store_dword v4, v3, s[12:13]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, v4
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: v_mov_b32_e32 v3, v5
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX10-NEXT: v_mov_b32_e32 v0, s12
-; GFX10-NEXT: v_mov_b32_e32 v1, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[0:1], v3, off
; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: global_store_dword v4, v3, s[12:13]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: s_mov_b64 s[14:15], exec
; GFX6789-NEXT: s_wqm_b64 exec, exec
+; GFX6789-NEXT: v_mov_b32_e32 v6, 0
; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v7, v6
+; GFX6789-NEXT: v_mov_b32_e32 v8, v6
+; GFX6789-NEXT: v_mov_b32_e32 v9, v6
+; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v0, v6
+; GFX6789-NEXT: v_mov_b32_e32 v1, v7
+; GFX6789-NEXT: v_mov_b32_e32 v2, v8
+; GFX6789-NEXT: v_mov_b32_e32 v3, v9
+; GFX6789-NEXT: v_mov_b32_e32 v4, v10
; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s12
-; GFX6789-NEXT: v_mov_b32_e32 v6, s13
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v6, v4, s[12:13]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
+; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GFX6789: ; %bb.0: ; %main_body
; GFX6789-NEXT: s_mov_b64 s[14:15], exec
; GFX6789-NEXT: s_wqm_b64 exec, exec
+; GFX6789-NEXT: v_mov_b32_e32 v6, 0
; GFX6789-NEXT: v_mov_b32_e32 v5, v0
-; GFX6789-NEXT: v_mov_b32_e32 v0, 0
-; GFX6789-NEXT: v_mov_b32_e32 v1, v0
-; GFX6789-NEXT: v_mov_b32_e32 v2, v0
-; GFX6789-NEXT: v_mov_b32_e32 v3, v0
-; GFX6789-NEXT: v_mov_b32_e32 v4, v0
+; GFX6789-NEXT: v_mov_b32_e32 v7, v6
+; GFX6789-NEXT: v_mov_b32_e32 v8, v6
+; GFX6789-NEXT: v_mov_b32_e32 v9, v6
+; GFX6789-NEXT: v_mov_b32_e32 v10, v6
+; GFX6789-NEXT: v_mov_b32_e32 v0, v6
+; GFX6789-NEXT: v_mov_b32_e32 v1, v7
+; GFX6789-NEXT: v_mov_b32_e32 v2, v8
+; GFX6789-NEXT: v_mov_b32_e32 v3, v9
+; GFX6789-NEXT: v_mov_b32_e32 v4, v10
; GFX6789-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe
-; GFX6789-NEXT: v_mov_b32_e32 v5, s12
-; GFX6789-NEXT: v_mov_b32_e32 v6, s13
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[5:6], v4, off
+; GFX6789-NEXT: global_store_dword v6, v4, s[12:13]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s28, exec_lo ; encoding: [0x7e,0x03,0x9c,0xbe]
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe]
+; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; encoding: [0x80,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v9, v6 ; encoding: [0x06,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v6 ; encoding: [0x06,0x03,0x14,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v6 ; encoding: [0x06,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v7 ; encoding: [0x07,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v3, v9 ; encoding: [0x09,0x03,0x06,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v4, v10 ; encoding: [0x0a,0x03,0x08,0x7e]
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s28 ; encoding: [0x7e,0x1c,0x7e,0x87]
; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00]
-; GFX10-NEXT: v_mov_b32_e32 v5, s12 ; encoding: [0x0c,0x02,0x0a,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v6, s13 ; encoding: [0x0d,0x02,0x0c,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v6, v4, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x0c,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
;
; GFX6789-LABEL: sample_c_d_o_2darray_V1_tfe:
; GFX6789: ; %bb.0: ; %main_body
-; GFX6789-NEXT: v_mov_b32_e32 v9, 0
-; GFX6789-NEXT: v_mov_b32_e32 v10, v9
+; GFX6789-NEXT: v_mov_b32_e32 v11, 0
+; GFX6789-NEXT: v_mov_b32_e32 v12, v11
+; GFX6789-NEXT: v_mov_b32_e32 v9, v11
+; GFX6789-NEXT: v_mov_b32_e32 v10, v12
; GFX6789-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da
-; GFX6789-NEXT: v_mov_b32_e32 v0, s12
-; GFX6789-NEXT: v_mov_b32_e32 v1, s13
; GFX6789-NEXT: s_waitcnt vmcnt(0)
-; GFX6789-NEXT: global_store_dword v[0:1], v10, off
; GFX6789-NEXT: v_mov_b32_e32 v0, v9
+; GFX6789-NEXT: global_store_dword v11, v10, s[12:13]
; GFX6789-NEXT: s_waitcnt vmcnt(0)
; GFX6789-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe:
; GFX10: ; %bb.0: ; %main_body
-; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; encoding: [0x80,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v12, v11 ; encoding: [0x0b,0x03,0x18,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, v11 ; encoding: [0x0b,0x03,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, v12 ; encoding: [0x0c,0x03,0x02,0x7e]
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
-; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
+; GFX10-NEXT: global_store_dword v11, v1, s[12:13] ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x0c,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
; GCN: v_accvgpr_read_b32
; GCN: v_accvgpr_read_b32
; GCN: global_store_dwordx4
-define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) {
bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+ %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
+ ;store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
+ store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep
+ ret void
+}
+
+; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved
+; in the middle of the expanded agpr reg_sequence. The broadcast of
+; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
+; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
+; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000
+; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]]
+; GCN: s_nop 0
+; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]]
+; GCN: s_nop 0
+; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: v_accvgpr_read_b32
+; GCN: global_store_dwordx4
+define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) {
+bb:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
+
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
ret void
; GFX10: s_get_waveid_in_workgroup [[DEST:s[0-9]+]]
; GFX10: s_waitcnt lgkmcnt(0)
; GFX10: v_mov_b32_e32 [[VDEST:v[0-9]+]], [[DEST]]
-; GFX10: global_store_dword v[{{[0-9:]+}}], [[VDEST]], off
+; GFX10: global_store_dword v{{[0-9]+}}, [[VDEST]], s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @test_s_get_waveid_in_workgroup(i32 addrspace(1)* %out) {
; Make sure %out is loaded and associated wait count already inserted
store i32 0, i32 addrspace(1)* %out
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
-; GCN: store_dword v[{{[0-9:]+}}], [[V]]
+; GCN: store_dword v{{.+}}, [[V]]
; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4
; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4
; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
-; GCN: store_dword v[{{[0-9:]+}}], [[V]]
+; GCN: store_dword v{{.+}}, [[V]]
; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4
; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4
; GFX9-LABEL: cos_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_cos_f16_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_cos_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a.val = load half, half addrspace(1)* %a
%r.val = call half @llvm.cos.f16(half %a.val)
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
-; GFX9-NEXT: v_cos_f16_e32 v2, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_e32 v3, v3
+; GFX9-NEXT: v_cos_f16_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
; GFX9-LABEL: sin_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
-; GFX9-NEXT: v_sin_f16_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a.val = load half, half addrspace(1)* %a
%r.val = call half @llvm.sin.f16(half %a.val)
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
-; GFX9-NEXT: v_sin_f16_e32 v2, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
+; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_e32 v3, v3
+; GFX9-NEXT: v_sin_f16_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
%r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val)
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]]
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; EG: MEM_RAT
; GCN-HSA: {{flat|global}}_load_dword v[[LO:[0-9]+]]
; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN-HSA: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN-HSA: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
%ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
%ext = sext <1 x i32> %ld to <1 x i64>
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(1)
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3
-; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v2, s4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
-; MUBUF-NEXT: v_mov_b32_e32 v3, s5
-; MUBUF-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0
+; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
+; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
-; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
-; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; FLATSCR-NEXT: s_endpgm
entry:
%pin.low = alloca i32, align 8192, addrspace(5)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_lshr_v2i16:
; GFX8: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
; GFX9: v_mad_legacy_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
; GFX10: v_mad_u16 v[[R:[0-9]+]], v[[A]], v[[B]], v[[C]]
-; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R]]
+; GCN: {{flat|global}}_store_short v{{.+}}, v[[R]]
; GCN: s_endpgm
define amdgpu_kernel void @mad_u16(
i16 addrspace(1)* %r,
; GCN-LABEL: {{^}}accvgpr_write_read:
; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
-; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off
+; GFX908: global_store_dword v{{[0-9]+}}, [[VREG]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
bb:
%in.1 = load float, float addrspace(1)* %arg
; GCN-LABEL: {{^}}nontemporal_global_0:
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
-; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
+; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} glc slc{{$}}
+; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_global_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10
; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20
; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30
-; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v8, s8
-; GCN-NEXT: v_mov_b32_e32 v13, s19
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v9, s9
; GCN-NEXT: v_mov_b32_e32 v10, s10
; GCN-NEXT: v_mov_b32_e32 v11, s11
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19]
+; GCN-NEXT: global_store_dwordx4 v12, v[4:7], s[18:19] offset:16
+; GCN-NEXT: global_store_dwordx4 v12, v[8:11], s[18:19] offset:32
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[18:19] offset:48
; GCN-NEXT: s_endpgm
bb:
%tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GCN-NEXT: v_mov_b32_e32 v9, s5
-; GCN-NEXT: v_mov_b32_e32 v8, s4
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
; GCN-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
; MUBUF-NEXT: s_cbranch_scc1 BB0_3
; MUBUF-NEXT: ; %bb.2: ; %bb.1
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
; MUBUF-NEXT: s_mov_b32 s32, s6
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: v_mov_b32_e32 v1, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, s6
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0
+; MUBUF-NEXT: v_mov_b32_e32 v3, 1
; MUBUF-NEXT: s_add_i32 s6, s6, s7
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, s6
-; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v2, s6
+; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0
+; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v0, s4
-; MUBUF-NEXT: v_mov_b32_e32 v1, s5
-; MUBUF-NEXT: global_store_dword v[0:1], v2, off
+; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT: BB0_3: ; %bb.2
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
; FLATSCR-NEXT: s_mov_b32 s32, s4
; FLATSCR-NEXT: s_add_i32 s4, s4, s2
-; FLATSCR-NEXT: scratch_load_dword v1, off, s4
+; FLATSCR-NEXT: scratch_load_dword v2, off, s4
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
+; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
+; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT: BB0_3: ; %bb.2
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; MUBUF-NEXT: ; %bb.1: ; %bb.0
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0
-; MUBUF-NEXT: v_mov_b32_e32 v2, s6
; MUBUF-NEXT: s_lshl_b32 s7, s7, 2
; MUBUF-NEXT: s_mov_b32 s32, s6
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; MUBUF-NEXT: v_mov_b32_e32 v1, 1
+; MUBUF-NEXT: v_mov_b32_e32 v2, s6
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0
+; MUBUF-NEXT: v_mov_b32_e32 v3, 1
; MUBUF-NEXT: s_add_i32 s6, s6, s7
-; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
-; MUBUF-NEXT: v_mov_b32_e32 v1, s6
-; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: v_mov_b32_e32 v2, s6
+; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0
+; MUBUF-NEXT: v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v0, s4
-; MUBUF-NEXT: v_mov_b32_e32 v1, s5
-; MUBUF-NEXT: global_store_dword v[0:1], v2, off
+; MUBUF-NEXT: global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT: BB1_2: ; %bb.1
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
; MUBUF-NEXT: global_store_dword v[0:1], v0, off
; FLATSCR-NEXT: s_mov_b32 s32, s2
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
-; FLATSCR-NEXT: scratch_load_dword v1, off, s2
+; FLATSCR-NEXT: scratch_load_dword v2, off, s2
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0
+; FLATSCR-NEXT: v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
-; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
-; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
+; FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT: BB1_2: ; %bb.1
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: global_store_dword v[0:1], v0, off
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
-; GCN: store_dword v[{{[0-9:]+}}], [[RES]]
+; GCN: store_dword v{{.+}}, [[RES]]
; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
-; GCN: store_dwordx4 v[{{[0-9:]+}}],
+; GCN: store_dwordx4 v{{.+}},
; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
; GETREG-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
; GETREG-DAG: s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
; GETREG-DAG: v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]]
-; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT1]]:[[ZERO]]], off
+; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT1]]:[[ZERO]]]
; GETREG: s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
; GETREG: v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]]
-; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT2]]:[[ZERO]]], off
+; GETREG: global_store_dwordx2 v{{.+}}, v{{\[}}[[VCNT2]]:[[ZERO]]]
define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
%cycle0 = call i64 @llvm.readcyclecounter()
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s3, s7, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %sadd, 0
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_add_i32 v4, s0, v4 clamp
-; GFX9-NEXT: s_add_i32 s0, s0, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v5, s0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v4
-; GFX9-NEXT: global_store_dword v[0:1], v5, off
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v[2:3], v0, off
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_add_i32 s1, s0, s1
+; GFX9-NEXT: v_add_i32 v1, s0, v1 clamp
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
%val = extractvalue { i32, i1 } %sadd, 0
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v5, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_i32 v6, v4, v5 clamp
-; GFX9-NEXT: v_add_u32_e32 v4, v4, v5
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v6
-; GFX9-NEXT: global_store_dword v[0:1], v4, off
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v[2:3], v0, off
+; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
%a = load i32, i32 addrspace(1)* %aptr, align 4
%b = load i32, i32 addrspace(1)* %bptr, align 4
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: s_add_u32 s0, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_addc_u32 s1, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
-; GFX9-NEXT: global_store_dwordx2 v[0:1], v[4:5], off
+; GFX9-NEXT: s_add_u32 s8, s4, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_addc_u32 s9, s5, s7
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v[2:3], v0, off
+; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
%val = extractvalue { i64, i1 } %sadd, 0
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v[6:7], v0, off
+; GFX9-NEXT: global_store_byte v6, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%a = load i64, i64 addrspace(1)* %aptr, align 4
%b = load i64, i64 addrspace(1)* %bptr, align 4
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_i32 v8, v0, v2 clamp
+; GFX9-NEXT: v_add_i32 v5, v0, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_add_i32 v2, v1, v3 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v8
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v0, v5
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
%b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
; CI: buffer_load_dword
; CI: buffer_store_dword
-; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
-; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
+; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
; GFX9: ds_write_b32
define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
; CI: buffer_store_dword
; CI: s_endpgm
-; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
-; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
-; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:400
+; GFX9-DAG: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:408
; GFX9: global_store_dword
; GFX9: s_endpgm
define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
; GCN-LABEL: ; %bb.0:
; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
; GCN: s_waitcnt lgkmcnt(0)
-; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+; GCN: global_store_dword v
define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
bb:
; SIVI-DAG: buffer_store_byte
; SIVI-DAG: buffer_store_short
-; GFX9-DAG: global_store_byte_d16_hi v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off offset:2
+; GFX9-DAG: global_store_byte_d16_hi v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:2
; GFX9-DAG: global_store_short
; EG: MEM_RAT MSKOR
; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
; SIVI: buffer_store_dword [[VAND]]
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAND]]
+; GFX9: global_store_dword v{{[0-9]+}}, [[VAND]], s
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG-NOT: MEM_RAT
;
; GFX9-LABEL: local_store_i55:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
-; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
-; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: ds_write_b16 v1, v2 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
-; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
-; GFX9-NEXT: ds_write_b32 v0, v3
+; GFX9-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
+; GFX9-NEXT: ds_write_b8_d16_hi v1, v0 offset:6
+; GFX9-NEXT: ds_write_b32 v1, v3
; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}trunc_store_v4i64_v4i8:
-; GCN: global_store_dword v{{\[[0-9]:[0-9]+\]}}, v{{[0-9]+}}, off
+; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v4i64_v4i8(< 4 x i8> addrspace(1)* %out, <4 x i64> %in) {
entry:
%trunc = trunc <4 x i64> %in to < 4 x i8>
}
; GCN-LABEL: {{^}}trunc_store_v8i64_v8i8:
-; GCN: global_store_dwordx2 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v8i64_v8i8(< 8 x i8> addrspace(1)* %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to < 8 x i8>
}
; GCN-LABEL: {{^}}trunc_store_v8i64_v8i16:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @trunc_store_v8i64_v8i16(< 8 x i16> addrspace(1)* %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to < 8 x i16>
}
; GCN-LABEL: {{^}}trunc_store_v8i64_v8i32:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @trunc_store_v8i64_v8i32(< 8 x i32> addrspace(1)* %out, <8 x i64> %in) {
entry:
%trunc = trunc <8 x i64> %in to <8 x i32>
}
; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32:
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
-; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:48
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16
+; GCN: global_store_dwordx4 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]$}}
define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) {
entry:
%trunc = trunc <16 x i64> %in to <16 x i32>
; GFX9-LABEL: shuffle_scalar_load_v8i32_0123:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16
%id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
+; GCN: global_store_dword
; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec
-; GCN: global_store_dword
; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]]
; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]]
; GCN: BB{{.*}}: ; %Flow
}
; GCN-LABEL: {{^}}fdiv_f32:
+; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN-NOT: vcc
; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
+; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s
define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) {
%temp = call float @llvm.fabs.f32(float %a)
%result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1)
; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]]
-; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]],
+; GCN: store_dwordx2 v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]], s
define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) {
%result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32)
store i64 %result, i64 addrspace(1)* %out
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}|
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]
-; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
+; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s
define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) {
%temp = call float @llvm.fabs.f32(float %a)
%result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1)
; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}}
; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}}
-; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]],
+; GCN: store_dword v{{[0-9]+}}, v[[V_LO]], s
define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) {
%result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32)
store i32 %result, i32 addrspace(1)* %out